# Annotate CrypSplice results: add the skipped (closest known) junction for A, D and NDA events, plus the sequences and scores of its 5' and 3' splice sites #

import os, sys, glob
import pandas as pd
import numpy as np

def _closest_known_coord(row,rdf,shared,moving):
	"""Return the `moving` coordinate of the known junction nearest to `row`.

	Only rows of `rdf` on the same chromosome as `row` that have the same
	`shared` coordinate ('chr_st' or 'chr_ed') are considered.  Among those,
	the one whose `moving` coordinate is closest to the row's is chosen.

	Returns the coordinate as a string, or None when no known junction
	shares that site.
	"""
	same_site=(rdf['chr']==row['chr']) & (rdf[shared]==row[shared])
	coords=rdf.loc[same_site,moving]
	if coords.empty:
		return None
	# Work on a plain array instead of adding a column to a slice of rdf
	# (avoids SettingWithCopyWarning).  mergesort is stable, so ties are
	# resolved deterministically in favour of the first junction in rdf.
	order=(coords-row[moving]).abs().values.argsort(kind="mergesort")
	return str(coords.iloc[order[0]])

def skippedJN(row,rdf):
	"""Build the "skipped junction" identifier for one junction row.

	Parameters
	----------
	row : mapping / pandas.Series with keys 'anchor', 'strand', 'chr',
		'chr_st' and 'chr_ed'.
	rdf : pandas.DataFrame of known junctions with columns 'chr', 'chr_st'
		and 'chr_ed'.

	Returns
	-------
	str
		"<chr>_<start>_<end>_<strand>", or "NA_NA_NA_NA" when the anchor is
		'N', the anchor/strand is not recognised, or no known junction shares
		the required splice site.

	Which coordinate is replaced by that of the nearest known junction:
	  * 'A' on "+" / 'D' on "-": the end is shared, the start is replaced.
	  * 'A' on "-" / 'D' on "+": the start is shared, the end is replaced.
	  * 'NDA' (either strand): both lookups are done and both coordinates
	    are replaced.
	NOTE(review): anchor codes presumably mean acceptor / donor /
	new-donor-acceptor / none -- confirm against the CrypSplice output spec.
	"""
	anchor=str(row['anchor'])
	strand=row['strand']
	# 'N' and anything unrecognised get the placeholder.  Previously an
	# unexpected strand fell through with integer coordinates and raised
	# TypeError in the final string concatenation.
	if anchor not in ('A','D','NDA') or strand not in ('+','-'):
		return "NA_NA_NA_NA"

	x=str(row['chr_st'])
	y=str(row['chr_ed'])
	# For A/D, the end is the shared site exactly when (A,+) or (D,-).
	end_shared=(anchor == 'A') == (strand == '+')
	if anchor == 'NDA' or end_shared:
		x=_closest_known_coord(row,rdf,'chr_ed','chr_st')
	if anchor == 'NDA' or not end_shared:
		y=_closest_known_coord(row,rdf,'chr_st','chr_ed')
	if x is None or y is None:
		return "NA_NA_NA_NA"
	return "_".join([str(row['chr']),x,y,strand])

def add_skippedJN(df,kj):
	"""Add a 'SkippedJN' column to `df` using the known junctions in `kj`.

	Parameters
	----------
	df : pandas.DataFrame of junctions; each row is passed to skippedJN().
	kj : path to a tab-separated, header-less known-junction file whose six
		columns are read as chr, chr_st, chr_ed, gene, isoform, strand.

	Returns
	-------
	pandas.DataFrame
		The same `df` object, modified in place, with the new column holding
		the string returned by skippedJN() for each row.
	"""
	rdf=pd.read_csv(kj,sep="\t",index_col=None,header=None,names=['chr','chr_st','chr_ed','gene','isoform','strand'])
	df['SkippedJN'] = df.apply(lambda row: skippedJN(row,rdf),axis=1)
	return df

def split_skipped_jn(df):
	"""Return `df` joined with SkippedJN0..SkippedJN3 (chr, start, end, strand).

	The 'SkippedJN' string is split from the right into exactly four parts,
	so a chromosome name that itself contains "_" stays whole in SkippedJN0
	instead of shifting the start/end/strand columns.
	"""
	parts=df['SkippedJN'].str.rsplit('_', n=3, expand=True)
	return df.join(parts.add_prefix('SkippedJN'))

def splice_site_bed(chrom,start,end,name,strand):
	"""Return the (5' record, 3' record) BED lines for one skipped junction.

	`start` and `end` may be strings; they are converted with int().  The
	junction `name` is written to both the name and score columns.  Returns
	None when `strand` is neither "+" nor "-" (header line or "NA" row).

	Windows: 5' site = 3 nt before to 6 nt after the donor coordinate,
	3' site = 20 nt before to 3 nt after the acceptor coordinate, mirrored
	on the minus strand.
	NOTE(review): presumably the 9-mer / 23-mer inputs that score5.pl and
	score3.pl expect -- confirm.
	"""
	if strand == "+":
		st=int(start); ed=int(end)
		five=(st-3,st+6)
		three=(ed-20,ed+3)
	elif strand == "-":
		st=int(start); ed=int(end)
		five=(ed-6,ed+3)
		three=(st-3,st+20)
	else:
		return None
	def record(window):
		return "\t".join([chrom,str(window[0]),str(window[1]),name,name,strand])
	return record(five),record(three)

def read_fasta(path):
	"""Read a FASTA file with one sequence line per record into {header: sequence}.

	Every ">" is removed from the header line, as the original code did.
	"""
	with open(path,"r") as handle:
		lines=handle.readlines()
	return {head.strip().replace(">",""): seq.strip() for head,seq in zip(lines[0::2],lines[1::2])}

def read_scores(path):
	"""Read a two-column tab-separated score file into {first field: second field}.

	Lines with fewer than two fields are skipped.
	"""
	table={}
	with open(path,"r") as handle:
		for line in handle:
			fields=line.strip().split("\t")
			if len(fields) >= 2:
				table[fields[0]]=fields[1]
	return table

files=glob.glob("*UN.2.txt")
for file in files:
	out_file=file.replace(".txt",".3.txt")
	# DataFrame.from_csv was removed in pandas 1.0; read_csv is the replacement.
	df=pd.read_csv(file,sep="\t",index_col=None)
	# Add skipped JN #
	df=add_skippedJN(df,sys.argv[1])
	df.fillna('NA', inplace=True)
	# Split the column #
	df=split_skipped_jn(df)
	df.to_csv(out_file,sep="\t",index=False)

	# Extract bed #
	# The table is read back as text: first field is the junction ID, the last
	# four fields are the skipped junction's chr, start, end and strand.
	with open(out_file,"r") as table, open("5pSS.bed","w") as f5, open("3pSS.bed","w") as f3:
		for line in table:
			data=line.strip().split("\t")
			print(data)
			strand=data[-1]
			if strand not in ("+","-"):
				# header line, or a row without a skipped junction ("NA")
				continue
			p5,p3=splice_site_bed(data[-4],data[-3],data[-2],data[0],strand)
			f5.write(p5+"\n")
			f3.write(p3+"\n")

	# Get fasta Sequences #

	os.system("bedtools getfasta -fo 5pSS.fasta -name -s -fi ACScodes/dm6.fasta -bed 5pSS.bed")
	os.system("bedtools getfasta -fo 3pSS.fasta -name -s -fi ACScodes/dm6.fasta -bed 3pSS.bed")
	os.system("rm 5pSS.bed 3pSS.bed")

	# Run entropy #
	os.system("perl score5.pl 5pSS.fasta > 5pSSscores.txt")
	os.system("perl score3.pl 3pSS.fasta > 3pSSscores.txt")

	# get ID to seq map#
	# Each FASTA is parsed on its own, so a record missing from one file can
	# no longer mispair or overrun the other.
	# NOTE(review): the lookup below assumes the FASTA header equals the BED
	# name column; some bedtools versions append coordinates when -name is
	# used -- confirm against the installed version.
	idhash5=read_fasta("5pSS.fasta")
	idhash3=read_fasta("3pSS.fasta")
	os.system("rm 5pSS.fasta 3pSS.fasta")

	# map id seq and score #
	scorehash5=read_scores("5pSSscores.txt")
	scorehash3=read_scores("3pSSscores.txt")

	with open("Entropy.txt","w") as fw:
		fw.write("ID\tS5pSequence\tS5pScore\tS3pSequence\tS3pScore\n")
		for k,seq5 in idhash5.items():
			# A junction with no 3' record is reported as NA rather than raising.
			seq3=idhash3.get(k,"NA")
			# Sequences containing "N" are not looked up; their score is NA.
			score5="NA" if "N" in seq5 else scorehash5[seq5]
			score3="NA" if "N" in seq3 else scorehash3[seq3]
			fw.write("\t".join([k,seq5,score5,seq3,score3])+"\n")

	# Merge sequences and scores back into the result table by junction ID.
	cryp = pd.read_csv(out_file, sep = "\t")
	ft = pd.read_csv("Entropy.txt", sep = "\t", header = 0)
	cryp_new = pd.merge(cryp, ft, how = 'left' , on = ['ID'])
	cryp_new.fillna('NA', inplace=True)
	cryp_new.to_csv(out_file, sep = "\t", index = False)
	os.system("rm 5pSSscores.txt 3pSSscores.txt Entropy.txt")


