# IR pipeline using bedtools, bwtool and DESeq2 -Hari Krishna Y #
# ColData should be in the same directory #
# note: change colData files accordingly #
# Usage: <base DIR> <TR key> <Name>

import os, sys, glob
import pandas as pd

# Stage 1: build one bigWig coverage track per sorted BAM #
# Inputs are <base DIR>/*.sorted.bam.  Each output is named after the BAM's
# basename only, so the .bw files land in the current working directory.
# NOTE(review): the --ignoreForNormalization list contains "chrx" and "MT";
# confirm these spellings match the contig names in the BAM headers.
bam_paths=glob.glob(sys.argv[1]+"/*.sorted.bam")
coverage_prefix="bamCoverage -p 26 -bs 2 --ignoreForNormalization chrx chr2L chr2R chr3L chr4 chr3R chrY MT -b "
for bam_path in bam_paths:
	bw_name=bam_path.rsplit('/',1)[-1].replace(".sorted.bam",".bw")
	# The exit status of the external command is not inspected.
	os.system(coverage_prefix+bam_path+" -o "+bw_name)

# Stage 2: summarise every bigWig in the working directory over dm6_introns.bed #
# One <sample>.IntronCounts.txt is written per <sample>.bw, next to it.
for bw_path in glob.glob("*.bw"):
	counts_name=bw_path.rsplit('/',1)[-1].replace(".bw",".IntronCounts.txt")
	# Command layout: bwtool summary <regions> <bigWig> <options> <output>
	summary_cmd=" ".join(["bwtool summary dm6_introns.bed",bw_path,"-header -with-sum -fill=0",counts_name])
	os.system(summary_cmd)

# Stage 3: dump the coverage values of each intron in dm6_introns.bed #
# One <sample>.IntronCov.txt is written per <sample>.bw in the working directory.
for bw_path in glob.glob("*.bw"):
	cov_name=bw_path.rsplit('/',1)[-1].replace(".bw",".IntronCov.txt")
	# Command layout: bwtool extract bed <regions> <bigWig> <output>
	os.system("bwtool extract bed dm6_introns.bed %s %s" % (bw_path,cov_name))

# Get Fully Covered Introns #
def write_full_cov_introns(cov_path,out_path):
	"""Copy the introns of *cov_path* that have no zero-coverage value to *out_path*.

	Each input line is tab-separated: the first seven fields describe the
	intron and the eighth is a comma-separated list of coverage values
	(presumably one per base, as produced by ``bwtool extract bed`` --
	confirm against the bwtool version in use).  An intron is kept only when
	none of those values is the literal string "0.00"; a kept intron is
	written as its first seven fields followed by a newline.

	Returns the number of introns written.
	Raises IndexError when a line has fewer than eight fields.
	"""
	kept=0
	# Both handles are closed on exit, so the output is fully flushed to disk
	# before any later stage reads the .FullCovIntron.txt files back.
	with open(cov_path,"r") as cov_in, open(out_path,"w") as full_out:
		for record in cov_in:
			fields=record.strip().split("\t")
			if "0.00" not in fields[7].split(","):
				full_out.write("\t".join(fields[0:7])+"\n")
				kept+=1
	return kept

# One <sample>.FullCovIntron.txt per <sample>.IntronCov.txt in the working directory.
for cov_file in glob.glob("*.IntronCov.txt"):
	write_full_cov_introns(cov_file,cov_file.replace(".IntronCov.txt",".FullCovIntron.txt"))

# Collect the union of fully covered introns across the TR samples #
# Every line of <base DIR>/<TR key>*.FullCovIntron.txt contributes the key
# "<field1>_<field2>_<field3>"; the value 1 is only a placeholder.
union={}
tr_pattern=sys.argv[1]+"/"+sys.argv[2]+"*.FullCovIntron.txt"
for tr_file in glob.glob(tr_pattern):
	with open(tr_file,"r") as handle:
		for record in handle:
			intron_key="_".join(record.strip().split("\t")[0:3])
			union[intron_key]=1

# Merging Data #
# Builds <base DIR>/<Name>.ICounts.txt: one row per intron, one column per
# <sample>.IntronCounts.txt found in <base DIR> (samples in sorted file order).
# The original fixed indices (9, 19, ... 59) imply ten columns per sample with
# the wanted value in the last one; with "-with-sum" that is presumably the
# sum column -- confirm against bwtool's summary output.
COLS_PER_SAMPLE=10
VALUE_COL=9

count_files=sorted(glob.glob(sys.argv[1]+"/*.IntronCounts.txt"))
if not count_files:
	# With no operands `paste` would block reading stdin, so stop early.
	sys.exit("No *.IntronCounts.txt files found in "+sys.argv[1])
sample_names=[path.split("/")[-1].replace(".IntronCounts.txt","") for path in count_files]

# Put all per-sample tables side by side in a scratch file.
os.system('paste '+' '.join(count_files)+' -d "\t" > Temp101.txt')
with open("Temp101.txt","r") as pasted:
	rows=pasted.readlines()[1:]  # the first line is the pasted header row
os.remove("Temp101.txt")

# One value column per sample, however many samples there are, so the data
# rows always line up with the header.
value_columns=[VALUE_COL+COLS_PER_SAMPLE*i for i in range(len(count_files))]
with open(sys.argv[1]+"/"+sys.argv[3]+".ICounts.txt","w") as merged_out:
	merged_out.write("Intron\t"+"\t".join(sample_names)+"\n")
	for row in rows:
		fields=row.strip().split("\t")
		# The intron id comes from the first three fields of the first sample.
		merged_out.write("_".join(fields[:3])+"\t"+"\t".join(fields[c] for c in value_columns)+"\n")

# Keep only the introns present in the fully-covered union #
# Reads <base DIR>/<Name>.ICounts.txt, drops rows whose 'Intron' id is not a
# key of `union`, removes duplicate rows and writes <Name>.FC.ICounts.txt.
counts_prefix=sys.argv[1]+"/"+sys.argv[3]
all_counts=pd.read_csv(counts_prefix+".ICounts.txt",sep="\t",index_col=None)
fully_covered=all_counts[all_counts['Intron'].isin(union.keys())].drop_duplicates()
fully_covered.to_csv(counts_prefix+".FC.ICounts.txt",sep="\t",index=False)

# Run the differential test #
# Passes ColData.txt (resolved against the working directory), the filtered
# count table and a <Name>.DEI.txt path to DEIntron.r.  The last one is
# presumably the result file -- DEIntron.r is not part of this script.
result_prefix=sys.argv[1]+"/"+sys.argv[3]
os.system("Rscript DEIntron.r ColData.txt %s.FC.ICounts.txt %s.DEI.txt" % (result_prefix,result_prefix))

