# Given a UTR branch length feel and the name of a transcript, segments the UTR # into smaller regions of distinct conservation and plots output # USAGE: plot_transitions.py INFILE TRANSCRIPT OUTFILE # INFILE is a file of base wise 3` UTR branch length scores, provided in the repo # TRANSCRIPT is a transcript name from a line in INFILE # OUTFIELD is the .pdf filename for the output segmentation plot import sys import matplotlib.pyplot as plt import matplotlib.colors as colors import numpy as np from scipy.stats import ttest_ind COLOR1 = '#1b9e77' COLOR2 = '#d95f02' #COLOR1 = colors.to_hex('m') #COLOR2 = colors.to_hex('y') COLOR1= 'm' COLOR2= 'y' INFILE, TRANSNAME, OUTFILE = sys.argv[1:] def find_transitions(obs, tol=-15, window=50, min_peak_space=100): logpvals = [] for i in range(2,len(obs)-2): # obtain right and left windows left = np.array(obs[max(0,i-window):i]) right = np.array(obs[i:min(i+window, len(obs))]) # calculate t-statistic and p val t, p = ttest_ind(left, right, equal_var=False) # record log p-value, scale by window size for truncated windows logpvals.append(np.log(p)*(min(len(left),len(right))/(window*1.0))) # obtain the highest peaks, as defined by the tolerance locs = [i+2 for (i,x) in enumerate(logpvals) if x <= tol] peaks = [x for x in logpvals if x <= tol] i = 0 while i < (len(locs)-1): if ((locs[i+1] - locs[i]) < min_peak_space): if peaks[i+1] < peaks[i]: peaks.pop(i) locs.pop(i) else: peaks.pop(i+1) locs.pop(i+1) else: i += 1 # return peak locations and probs return locs + [len(obs)], np.array(logpvals) found = False with open(INFILE, 'r') as infile: for line in infile: line = line.split(',') # extract the gene we want if line[0] == TRANSNAME: obs = [float(x) for x in line[1:]] peaks, logp = find_transitions(obs) # plot results #fig = plt.figure(figsize=(10,5)) #ax = fig.add_subplot(1, 1, 1) fig, (ax1, ax2) = plt.subplots(2,sharex=True,figsize=(10,5)) ax2.plot(np.arange(len(logp)) + 2, logp, color='grey') ax2.axhline(y=-15,ls='--',color='black') # plot segments, alternating between the two colors prev = 0 for i,peak in enumerate(peaks): if i % 2 == 0: color = COLOR1 else: color = COLOR2 ax1.scatter(np.arange(prev, peak), obs[prev: peak], color=color, alpha=0.5) prev = peak # Eliminate upper and right axes ax1.spines['right'].set_color('none') ax1.spines['top'].set_color('none') ax2.spines['right'].set_color('none') ax2.spines['bottom'].set_color('none') # Show ticks in the left and lower axes only ax1.xaxis.set_ticks_position('bottom') ax1.yaxis.set_ticks_position('left') ax2.xaxis.set_ticks_position('top') ax2.yaxis.set_ticks_position('left') ax1.set_ylabel('Branch length score') ax2.set_ylabel('log(transition p-value)') ax1.set_ylim(bottom=0) plt.xlim(-30, len(obs) + 30) plt.xlabel('Position along 3\'UTR') plt.savefig(OUTFILE) plt.close() found = True break if not found: print('{} not found'.format(TRANSNAME))