# CrypticLoad #
# Usage: CrypticLoad.py <sample.sorted.bam> <Ref Fasta> <Ref GTF> <ncores> #
# Requires: regtools, featureCounts and samtools (invoked by name, so they must be on PATH) #

import os, sys, glob, tempfile, commands
import subprocess
from collections import defaultdict
from StringIO import StringIO

import pandas as pd


def get_Junctions(sample):
	"""Extract splice junctions from a BAM file with ``regtools junctions extract``.

	The BED output is written to the input path with ``.sorted.bam``
	replaced by ``.bed``.

	Args:
		sample: Path to the input BAM; must contain ``.sorted.bam``.

	Raises:
		ValueError: if ``sample`` does not contain ``.sorted.bam``. The output
			path would then equal the input path and regtools would be asked
			to write over the BAM itself.
		subprocess.CalledProcessError: if regtools exits with a non-zero status.
		OSError: if the ``regtools`` executable cannot be started.
	"""
	out = sample.replace(".sorted.bam", ".bed")
	if out == sample:
		raise ValueError("Expected a '.sorted.bam' file, got: " + sample)
	# Pass an argument list rather than a shell string so that paths with
	# spaces or shell metacharacters reach regtools unchanged, and fail
	# loudly instead of letting later steps run on a missing BED file.
	subprocess.check_call(["regtools", "junctions", "extract", "-o", out, sample])

def annotate_Junctions(sample,fasta,gtf):
	"""Annotate a junction BED file with ``regtools junctions annotate -E``.

	The annotation table is written to the input path with ``.bed``
	replaced by ``.JunAnno.txt``.

	Args:
		sample: Path to the junction BED file; must contain ``.bed``.
		fasta: Path to the reference FASTA passed to regtools.
		gtf: Path to the reference GTF passed to regtools.

	Raises:
		ValueError: if ``sample`` does not contain ``.bed`` (the output path
			would equal the input path).
		subprocess.CalledProcessError: if regtools exits with a non-zero status.
		OSError: if the ``regtools`` executable cannot be started.
	"""
	out = sample.replace(".bed", ".JunAnno.txt")
	if out == sample:
		raise ValueError("Expected a '.bed' junction file, got: " + sample)
	# Argument list instead of a concatenated shell string: no quoting
	# problems with unusual paths, and a failed run raises immediately.
	subprocess.check_call(
		["regtools", "junctions", "annotate", "-E", "-o", out, sample, fasta, gtf]
	)
		
def get_Exon_Counts(bam_file, ncores):
	"""Count reads at a position just outside each annotated junction.

	Reads ``<sample>.JunAnno.txt`` (derived from ``bam_file``), writes one
	single-base SAF record per junction (name, chromosome, position,
	position, strand) to a temporary file, and runs featureCounts on it.
	The position is ``start - 3`` for ``+`` strand junctions and ``end + 3``
	otherwise. Counts are written to ``<sample>_EC.txt``; the accompanying
	``.summary`` file is removed.

	Args:
		bam_file: Path to the BAM; must contain ``.sorted.bam``.
		ncores: Thread count passed to featureCounts via ``-T``.

	Raises:
		ValueError: if ``bam_file`` does not contain ``.sorted.bam`` (the
			derived paths would all equal the BAM path).
		subprocess.CalledProcessError: if featureCounts exits non-zero.
		OSError / IOError: if a file or the executable cannot be opened.
	"""
	anno_path = bam_file.replace(".sorted.bam", ".JunAnno.txt")
	if anno_path == bam_file:
		raise ValueError("Expected a '.sorted.bam' file, got: " + bam_file)
	ec_file_path = bam_file.replace(".sorted.bam", "_EC.txt")

	# A unique temporary file per call, so that runs sharing a temp
	# directory cannot overwrite or delete each other's SAF file.
	fd, saf_path = tempfile.mkstemp(suffix=".saf")
	try:
		# Get 5' Exon Coordinates as SAF file #
		with os.fdopen(fd, "w") as oup, open(anno_path, "r") as inp:
			inp.readline()  # skip the header line
			for line in inp:
				if not line.strip():
					continue
				data = line.split("\t")
				pos = str(int(data[1])-3 if data[5] == "+" else int(data[2])+3)
				oup.write("\t".join([data[3], data[0], pos, pos, data[5]])+"\n")
		# The SAF file is closed (and therefore flushed to disk) at this
		# point, before featureCounts is started and reads it.
		# FeatureCounts #
		subprocess.check_call([
			"featureCounts", "-a", saf_path, "-F", "SAF", "-f", "-O",
			"--readExtension5", "100", "--readExtension3", "100",
			"-M", "-s", "0", "-p", "-B", "-C", "-T", str(ncores),
			"-o", ec_file_path, bam_file,
		])
	finally:
		# Remove the temporary SAF file whether or not featureCounts succeeded.
		os.unlink(saf_path)
	os.unlink(ec_file_path + ".summary")

def make_Matrix(JnAnnotations, ExCounts):
	"""Join junction annotations with exon counts and write the result as CSV.

	The first 15 columns of the junction annotation table are kept and
	``score`` is renamed to ``JC`` (junction count). The last column of the
	featureCounts table is taken as the exon count ``EX`` and joined on the
	junction name (``Geneid`` in the counts table, ``name`` in the
	annotations). ``JS`` is ``JC / EX``. Missing values are replaced by the
	string ``"NA"`` before the table is written to
	``<sample>.CrypticLoad.csv`` and returned.

	Args:
		JnAnnotations: Path to the tab-separated ``.JunAnno.txt`` table.
		ExCounts: Path to the featureCounts output; its first line is skipped
			and the second line is used as the header.

	Returns:
		pandas.DataFrame with the kept annotation columns plus ``EX`` and ``JS``.
	"""
	out = JnAnnotations.replace(".JunAnno.txt", ".CrypticLoad.csv")

	# Read junction count #
	junctions = pd.read_csv(JnAnnotations, sep="\t", index_col=None)
	junctions = junctions.iloc[:, :15]  # Remove Transcript column
	junctions = junctions.rename(columns={'score': 'JC'})

	# Read exon count #
	counts = pd.read_csv(ExCounts, skiprows=1, sep="\t", index_col=None)
	# The count column is the last one in the table. Taking it by position
	# means this function depends only on its own inputs, not on a
	# module-level BAM path.
	counts = counts.rename(columns={'Geneid': 'name', counts.columns[-1]: 'EX'})

	# Always join on the junction name instead of pasting the count column
	# on by position: the result is correct whatever order the two tables
	# are in, and no row-count comparison is needed.
	df = pd.merge(junctions, counts[['name', 'EX']], on='name')

	# Get junction strength #
	df['JS'] = df['JC']/df['EX']

	df = df.fillna("NA")
	df.to_csv(out, index=False)
	return df

def get_SequencingDepth(bam_file):
	"""Return the number of mapped reads in millions, rounded to a whole number.

	Runs ``samtools flagstat`` on ``bam_file`` and reads the first count on
	the line labelled ``mapped``.

	Args:
		bam_file: Path to the BAM file.

	Returns:
		float: mapped reads / 1,000,000, rounded to 0 decimals. This is 0.0
		when fewer than half a million reads are mapped.

	Raises:
		subprocess.CalledProcessError: if samtools exits with a non-zero status.
		OSError: if the ``samtools`` executable cannot be started.
		ValueError: if the output contains no ``mapped`` line.
	"""
	output = subprocess.check_output(
		["samtools", "flagstat", bam_file], universal_newlines=True
	)
	for line in output.splitlines():
		fields = line.split()
		# Lines look like "<passed> + <failed> <label> ...". The line is
		# found by its label rather than by a fixed line number, because
		# the number of lines printed before it is not the same in every
		# samtools release. Requiring the label in the fourth field also
		# skips "primary mapped".
		if len(fields) > 3 and fields[3] == "mapped":
			return round(float(fields[0])/1000000.00, 0)
	raise ValueError("No 'mapped' line in samtools flagstat output for " + bam_file)

def _load_metrics(strengths, depth):
	"""Return [count/depth, sum/depth, (sum/(count/depth))/depth] for a JS series."""
	rate = strengths.shape[0] / depth
	total = strengths.sum()  # NaN entries are skipped by pandas
	# With no junctions selected the third term is 0/0; report it as nan
	# explicitly instead of relying on a divide-by-zero.
	per_junction = (total / rate) / depth if rate else float("nan")
	return [rate, total / depth, per_junction]


def compute_CrypticLoad(df,sd):
	"""Compute six depth-normalised metrics over junctions anchored "N", "D" or "A".

	The first three values are computed over every row whose ``anchor`` is
	``"N"``, ``"D"`` or ``"A"``; the last three over the subset of those rows
	with ``JC > sd/10``. Each triple is: number of junctions / sd,
	sum of ``JS`` / sd, and (sum of ``JS`` / first value) / sd.

	Args:
		df: DataFrame with ``anchor``, ``JC`` and ``JS`` columns. ``JS`` and
			``JC`` may contain non-numeric placeholders such as ``"NA"``;
			those entries are treated as missing.
		sd: Sequencing depth used for normalisation.

	Returns:
		list of six strings, in the order described above.

	Raises:
		ZeroDivisionError: if ``sd`` is 0.
	"""
	depth = float(sd)
	cryptic = df.loc[df['anchor'].isin(["N", "D", "A"])]
	# Coerce to numbers: a column holding "NA" strings has object dtype and
	# cannot be summed or compared directly.
	strengths = pd.to_numeric(cryptic['JS'], errors='coerce')
	junction_counts = pd.to_numeric(cryptic['JC'], errors='coerce')
	supported = strengths[junction_counts > depth / 10]
	values = _load_metrics(strengths, depth) + _load_metrics(supported, depth)
	return [str(v) for v in values]


# ********************** MAIN ****************** #
# Runs the pipeline only when executed as a script. The names assigned
# below stay module-level globals, exactly as before.
if __name__ == "__main__":
	if len(sys.argv) != 5:
		# sys.exit(<str>) prints the message to stderr and exits with status 1.
		sys.exit("\nInvalid number of arguments exiting...\n")

	try:
		ncores = int(sys.argv[4])
	except ValueError:
		ncores = 0  # non-integer core count: rejected by the check below

	# Reject the run if ANY argument is unusable. The BAM must be named
	# *.sorted.bam because every intermediate and output path is derived by
	# replacing that suffix.
	fasta_ok = ".fa" in sys.argv[2] or ".fna" in sys.argv[2]
	if ".sorted.bam" not in sys.argv[1] or not fasta_ok or ".gtf" not in sys.argv[3] or ncores < 1:
		sys.exit("\nInvalid arguments. Please check and rerun, exiting...\n")

	bam_file = os.path.abspath(sys.argv[1])
	fasta = os.path.abspath(sys.argv[2])
	gtf = os.path.abspath(sys.argv[3])

	get_Junctions(bam_file)
	annotate_Junctions(bam_file.replace(".sorted.bam", ".bed"), fasta, gtf)
	get_Exon_Counts(bam_file, ncores)
	df = make_Matrix(bam_file.replace(".sorted.bam", ".JunAnno.txt"), bam_file.replace(".sorted.bam", "_EC.txt"))
	sd = get_SequencingDepth(bam_file)
	CrypticLoad = compute_CrypticLoad(df, sd)
	# One tab-separated line: sample prefix followed by the six metrics.
	# A single parenthesised argument prints identically on Python 2 and 3.
	print(bam_file.replace(".sorted.bam", "")+"\t"+"\t".join(CrypticLoad))
