import glob
import os
import subprocess
import sys
import tempfile
from collections import defaultdict

import numpy as np
import pandas as pd
#from StringIO import StringIO

def add_gene_name(df,genes):
	"""Annotate each interval in df with its closest feature from a BED file.

	The chr/chr_st/chr_ed columns of df are written as a 5-column BED file,
	sorted with ``bedtools sort`` and matched with
	``bedtools closest -d -t first -k 1`` against the annotation file.

	df: DataFrame with at least the columns chr, chr_st and chr_ed.
	genes: path of the annotation BED file handed to ``bedtools closest -b``.

	Returns df inner-merged (on chr, chr_st, chr_ed) with three new columns:
	strand, Gene and Distance. Distance is the last column of the bedtools
	output, strand the second-to-last and Gene the third-to-last.
	NOTE(review): which annotation fields those positions hold depends on the
	layout of the genes file - confirm against the file actually used.

	Raises subprocess.CalledProcessError when a bedtools call fails.
	"""
	keys=['chr','chr_st','chr_ed']
	if df.empty:
		# Nothing to annotate: keep the output schema without calling bedtools
		# (which would only produce an unreadable empty result file).
		return df.assign(strand=pd.Series(dtype=object),Gene=pd.Series(dtype=object),Distance=pd.Series(dtype=object))

	intervals=df[keys].copy()
	# Coordinates must be written as plain integers; casting replaces the old
	# text-level removal of ".0", which could also alter chromosome names.
	intervals['chr_st']=intervals['chr_st'].astype('int64')
	intervals['chr_ed']=intervals['chr_ed'].astype('int64')
	# One query per distinct interval, otherwise repeated intervals would be
	# multiplied by the inner merge below.
	intervals=intervals.drop_duplicates().assign(Dummy2=0,Dummy1=0)

	# A private scratch directory avoids clobbering (or deleting) unrelated
	# files in the working directory and is removed even on failure.
	with tempfile.TemporaryDirectory() as workdir:
		raw_bed=os.path.join(workdir,"intervals.bed")
		sorted_bed=os.path.join(workdir,"intervals.sorted.bed")
		closest_bed=os.path.join(workdir,"closest.bed")
		intervals.to_csv(raw_bed,sep="\t",index=False,header=False)
		with open(sorted_bed,"w") as out:
			subprocess.run(["bedtools","sort","-i",raw_bed],stdout=out,check=True)
		with open(closest_bed,"w") as out:
			subprocess.run(["bedtools","closest","-a",sorted_bed,"-b",genes,"-d","-t","first","-k","1"],stdout=out,check=True)
		nearest=pd.read_csv(closest_bed,sep="\t",index_col=None,header=None)

	# Keep the query interval plus the three trailing columns of the output.
	nearest=nearest.iloc[:,[0,1,2,-2,-3,-1]].copy()
	nearest.columns=['chr','chr_st','chr_ed','strand','Gene','Distance']
	return df.merge(nearest,on=keys,how='inner')

def fill_col(st, ed, colm0, colm1):
	"""Return 1 when st or ed lies inside the closed range [colm0, colm1], else 0.

	Only the two end points are tested; a span that strictly encloses the
	range without either end point falling inside it yields 0.
	"""
	# Start point first; the end point is only examined when the start misses.
	if colm0 <= st <= colm1:
		return 1
	return int(colm0 <= ed <= colm1)

# Clean IROut and add gene  #

def split_region_id(region_ids):
	"""Split ``chr_start_end`` identifiers into chr / chr_st / chr_ed columns.

	The split is done from the right and limited to two cuts, so a chromosome
	or contig name that itself contains underscores is kept in one piece.

	region_ids: pandas Series of strings.
	Returns a DataFrame of strings with the columns chr, chr_st and chr_ed.
	"""
	parts=region_ids.str.rsplit('_', n=2, expand=True)
	parts.columns=['chr','chr_st','chr_ed']
	return parts


files=glob.glob("*.DEIn.txt")
for path in files:
	print("processing "+path)
	df = pd.read_table(path, sep = "\t")
	# Every input column except the first one is carried over after the
	# coordinate columns.
	other_cols=list(df.columns)[1:]
	df[['chr','chr_st','chr_ed']] = split_region_id(df.Gene)
	df=df[['ID','chr', 'chr_st','chr_ed']+other_cols]
	# The original Gene column only held the coordinate string; the real gene
	# name is added by add_gene_name below.
	df=df.drop(columns=['Gene'])
	df.chr_st = df.chr_st.astype(int)
	df.chr_ed = df.chr_ed.astype(int)
	df=add_gene_name(df,"AIRcodes/dm6.ucsc.refseq.bed")
	df=df.fillna('NA')
	# add_gene_name appends strand, Gene, Distance; move strand (third from
	# the end) to position 4, directly after the coordinates.
	cols=list(df.columns)
	cols.insert(4, cols.pop(-3))
	df=df[cols]
	# Swap only the trailing ".txt"; str.replace would rewrite every
	# occurrence of ".txt" in the name.
	out_path=path[:-len(".txt")]+".2.txt"
	df.to_csv(out_path,sep="\t",index=False)

# Extract Features #
#os.system("python AIRcodes/Gene2IntronPositions.py AIRcodes/dm6.corrected.gtf AIRcodes/dm6.bed AIRcodes/dm6.intron.bed > dm6.gene.features.txt")

# Add features to IR out #

def split_interval_columns(features, columns):
	"""Expand 'start-end' string columns into integer bound columns.

	For every name in columns, missing values are replaced by '0-0' and the
	string is split on '-'; the pieces are appended as integer columns named
	<name>0, <name>1, ... The input frame is left untouched.

	features: DataFrame holding the string columns.
	columns: names of the columns to expand.
	Returns a new DataFrame with the filled string columns plus the bounds.
	"""
	expanded = features.copy()
	for col in columns:
		expanded[col] = expanded[col].fillna('0-0')
		bounds = expanded[col].str.split('-', expand=True).add_prefix(col).fillna(0).astype('int')
		expanded = expanded.join(bounds)
	return expanded


files=glob.glob("*DEIn.2.txt")
cols = ['1st Intron', '2nd Intron', '2nd Last', '1st Last']
status_cols = ['1status', '2status', '-2status', '-1status']
if files:
	# The feature table is the same for every input file: parse it once.
	ft = pd.read_table("dm6.gene.features.txt", sep = "\t", header = 0)
	ft_new = split_interval_columns(ft, cols)
for file in files:
	cryp = pd.read_csv(file, sep = "\t")
	cryp_new = pd.merge(cryp, ft_new, how = 'left' , on = ['Gene'])

	# One 0/1 flag per feature interval: 1 when either end of the row's
	# region falls inside that interval (see fill_col).
	for status, col in zip(status_cols, cols):
		cryp_new[status]= cryp_new.apply(lambda x, lo=col+'0', hi=col+'1': fill_col(x['chr_st'], x['chr_ed'], x[lo], x[hi]), axis=1)

	# The annotated table replaces the input file in place.
	cryp_new.to_csv(file, sep = "\t", index = False)

# Extract 5p 3p seq of IR #
def fasta_record_id(header):
	"""Return the BED name carried by a ``bedtools getfasta -name -s`` header.

	The '>' marker is removed. A '::<coordinates>' suffix and a trailing
	'(+)' / '(-)' strand tag are stripped as well, so the result can be
	matched against the ID column of the input table.
	NOTE(review): whether these decorations appear depends on the installed
	bedtools release - confirm with the version in use.
	"""
	name = header.strip().replace(">","")
	name = name.split("::", 1)[0]
	for suffix in ("(+)", "(-)"):
		if name.endswith(suffix):
			name = name[:-len(suffix)]
			break
	return name


def read_fasta_pairs(path):
	"""Map record id -> sequence for a FASTA file with two lines per record."""
	with open(path,"r") as handle:
		rows = handle.readlines()
	# Stop before a dangling header so a truncated file cannot raise IndexError.
	return {fasta_record_id(rows[i]): rows[i+1].strip() for i in range(0, len(rows)-1, 2)}


def read_score_table(path):
	"""Map sequence -> score (kept as text) from a tab-separated score file."""
	scores = {}
	with open(path,"r") as handle:
		for row in handle:
			fields = row.strip().split("\t")
			if len(fields) >= 2:
				scores[fields[0]] = fields[1]
	return scores


files=glob.glob("*.DEIn.2.txt")
for file in files:
	# Write one 9-base 5' window and one 23-base 3' window per stranded row.
	# Columns used: 0 = ID, 1 = chr, 2 = start, 3 = end, 4 = strand. The header
	# and rows without a +/- strand are skipped.
	with open(file,"r") as table, open("5pSS.bed","w") as f5, open("3pSS.bed","w") as f3:
		for line in table:
			data=line.split("\t")
			if len(data) < 5:
				continue
			if data[4] == "+":
				win5=(int(data[2])-3, int(data[2])+6)
				win3=(int(data[3])-20, int(data[3])+3)
			elif data[4] == "-":
				win5=(int(data[3])-6, int(data[3])+3)
				win3=(int(data[2])-3, int(data[2])+20)
			else:
				continue
			f5.write("\t".join([data[1], str(win5[0]), str(win5[1]), data[0], data[0], data[4]])+"\n")
			f3.write("\t".join([data[1], str(win3[0]), str(win3[1]), data[0], data[0], data[4]])+"\n")

	# Get fasta Sequences #
	# check=True makes a failing tool stop the run instead of feeding empty
	# files to the following steps.
	subprocess.run(["bedtools","getfasta","-fo","5pSS.fasta","-name","-s","-fi","AIRcodes/dm6.fasta","-bed","5pSS.bed"], check=True)
	subprocess.run(["bedtools","getfasta","-fo","3pSS.fasta","-name","-s","-fi","AIRcodes/dm6.fasta","-bed","3pSS.bed"], check=True)
	os.remove("5pSS.bed")
	os.remove("3pSS.bed")

	# Run entropy #
	with open("5pSSscores.txt","w") as out:
		subprocess.run(["perl","score5.pl","5pSS.fasta"], stdout=out, check=True)
	with open("3pSSscores.txt","w") as out:
		subprocess.run(["perl","score3.pl","3pSS.fasta"], stdout=out, check=True)

	# get ID to seq map#
	# Each FASTA file is read on its own, so a record missing from one of
	# them cannot shift the pairing of the other.
	idhash5=read_fasta_pairs("5pSS.fasta")
	idhash3=read_fasta_pairs("3pSS.fasta")
	os.remove("5pSS.fasta")
	os.remove("3pSS.fasta")

	# map id seq and score #
	scorehash5=read_score_table("5pSSscores.txt")
	scorehash3=read_score_table("3pSSscores.txt")

	# A sequence containing N, or one without a score, is reported as NA.
	with open("Entropy.txt","w") as fw:
		fw.write("ID\t5pSequence\t5pScore\t3pSequence\t3pScore\n")
		for k, seq5 in idhash5.items():
			seq3=idhash3.get(k, "NA")
			score5="NA" if "N" in seq5 else scorehash5.get(seq5, "NA")
			score3="NA" if "N" in seq3 else scorehash3.get(seq3, "NA")
			fw.write(k+"\t"+seq5+"\t"+score5+"\t"+seq3+"\t"+score3+"\n")

	# Attach sequences and scores, drop the intermediate interval columns and
	# overwrite the input table.
	cryp = pd.read_csv(file, sep = "\t")
	ft = pd.read_table("Entropy.txt", sep = "\t", header = 0)
	cryp_new = pd.merge(cryp, ft, how = 'left' , on = ['ID'])
	cryp_new=cryp_new.drop(["1st Intron","2nd Intron","2nd Last","1st Last","1st Intron0","1st Intron1","2nd Intron0","2nd Intron1","2nd Last0","2nd Last1","1st Last0","1st Last1"],axis=1)
	cryp_new.to_csv(file, sep = "\t", index = False)
	for leftover in ("5pSSscores.txt", "3pSSscores.txt", "Entropy.txt"):
		os.remove(leftover)

