# Report the 1st, 2nd, penultimate and ultimate introns of each gene #
# Usage: <script> <gtf> <isoform bed> <intron bed>
import os, sys, glob

# Read GTF to map Isoform to Gene  #
# ihash: transcript_id -> gene_id
# ghash: gene_id -> list of transcript_ids (one entry per GTF record, so an
#        isoform normally appears several times until it is de-duplicated)
ihash={}
ghash={}
with open(sys.argv[1],"r") as gtf:
	for line in gtf:
		cols=line.strip().split("\t")
		# Header/comment lines and truncated records have no attribute column.
		if line.startswith("#") or len(cols) < 9:
			continue
		# Look attributes up by key rather than by position: the position of
		# transcript_id differs between GTF flavours, and records without one
		# (e.g. gene-level records) must not register a bogus isoform.
		attrs={}
		for field in cols[8].split(";"):
			key,_,value=field.strip().partition(" ")
			if key and key not in attrs:
				attrs[key]=value.replace('"','').replace(' ','')
		gene=attrs.get("gene_id")
		isoform=attrs.get("transcript_id")
		if not gene or not isoform:
			continue
		ihash[isoform]=gene
		ghash.setdefault(gene,[]).append(isoform)

# Gene to Isoform Counts #
# giscount: gene_id -> number of distinct isoforms, stored as text so it can
# be concatenated straight into the report line.
giscount=dict(
	(gene_id,str(len(set(members))))
	for gene_id,members in ghash.items()
)


# Isoform to Intron counts #
# iihash: isoform ID (text before the first "." of BED column 4) -> value of
# BED column 10 minus one.
# NOTE(review): column 10 is presumably the BED12 blockCount (exon count),
# which makes the stored value the intron count - confirm against the input.
iihash={}
with open(sys.argv[2],"r") as isoform_bed:
	for line in isoform_bed:
		cols=line.rstrip("\r\n").split("\t")
		# Blank lines and track/browser headers have no column 10; skip them
		# instead of dying with an IndexError.
		if len(cols) < 10:
			continue
		isoform=cols[3].split(".")[0]
		iihash[isoform]=int(cols[9])-1

# Max Intron Isoform to Gene #
# gicount: gene_id -> largest intron count among the gene's isoforms, stored
# as text; "0" when none of its isoforms is present in iihash.
gicount={}
for k in ghash.keys():
	# De-duplicate in place: ghash holds one entry per GTF record.
	ghash[k]=list(set(ghash[k]))
	# Isoforms without an entry in iihash are skipped explicitly; only that
	# one expected case is tolerated, any other error now surfaces.
	counts=[int(iihash[e]) for e in ghash[k] if e in iihash]
	# Seeding with 0 keeps the previous floor and avoids rebinding the
	# builtin max() to an integer at module scope.
	gicount[k]=str(max(counts+[0]))

# Read introns and append to gene Hash #
# gintrons: gene_id -> set of (start, end) text pairs pooled over all of the
#           gene's isoforms (the set removes introns shared between isoforms)
# gstrand:  gene_id -> "+" or "-", taken from the first intron record of the
#           gene that carries one
# gip:      gene_id -> "1st<TAB>2nd<TAB>2nd last<TAB>last" intron columns
gip={}
gintrons={}
gstrand={}
with open(sys.argv[3],"r") as intron_bed:
	for line in intron_bed:
		cols=line.rstrip("\r\n").split("\t")
		# Blank lines and track/browser headers carry no name column.
		if len(cols) < 4:
			continue
		isoform=cols[3].split(".")[0]
		# An isoform missing from the GTF still raises KeyError here: the
		# inputs do not match and the report would be silently incomplete.
		gene=ihash[isoform]
		gintrons.setdefault(gene,set()).add((cols[1],cols[2]))
		# NOTE(review): assumes the strand sits in BED column 6 - confirm
		# against whatever produces the intron BED.
		if len(cols) > 5 and cols[5] in ("+","-"):
			gstrand.setdefault(gene,cols[5])

for k in gintrons.keys():
	# Order introns along the direction of transcription: ascending
	# coordinates on "+", descending on "-". Genes without a usable strand
	# fall back to ascending order so the output is at least deterministic.
	# Coordinates are compared as integers ("1000" must sort after "200"),
	# and sorted() is used because list.sort() returns None.
	ordered=sorted(
		gintrons[k],
		key=lambda iv:(int(iv[0]),int(iv[1])),
		reverse=(gstrand.get(k) == "-")
	)
	ordered=[start+"-"+end for start,end in ordered]
	if len(ordered) >= 2:
		gip[k]="\t".join([ordered[0],ordered[1],ordered[-2],ordered[-1]])
	else:
		# Every gene in gintrons has at least one intron by construction, so
		# this is the single-intron case: it is both the first and the last.
		gip[k]="\t".join([ordered[0],"NA","NA",ordered[-1]])

# Write the tab-separated report to stdout: a header row, then one row per
# gene that has at least one intron (i.e. every key of gip).
# print is called with a single parenthesised string so the same line is
# emitted under both Python 2 and Python 3.
print("\t".join([
	"Gene",
	"# of Isoforms",
	"# of Introns",
	"1st Intron",
	"2nd Intron",
	"2nd Last",
	"1st Last",
]))
for k in gip.keys():
	print("\t".join([k,giscount[k],gicount[k],gip[k]]))
