Notebook test
In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import seaborn
import numpy
from collections import deque
In [2]:
def len_consec_iiloci(data, n=2):
    """
    Compute combined lengths for each set of n consecutive iiLoci.

    Slide a window of n rows over `data` (in row order) and yield the sum of
    the `iiLocusLen` values of each full window. The window is emptied
    whenever a row's `Seqid` differs from that of the oldest row currently in
    the window, so no aggregate ever spans two sequences.

    `data` is a pandas.DataFrame object, each row corresponding to a single
    iiLocus; the `Seqid` and `iiLocusLen` columns are read.
    `n` is the number of consecutive iiLoci per window. For n < 1 the window
    can never reach length n, so nothing is yielded.

    Yields one aggregate length per window of n consecutive iiLoci.
    """
    window = deque()
    for _, locus in data.iterrows():
        # A new sequence starts: discard the partial window from the old one.
        if window and window[0]['Seqid'] != locus['Seqid']:
            window.clear()
        window.append(locus)
        if len(window) == n:
            yield sum(member['iiLocusLen'] for member in window)
            # Drop the oldest iiLocus so the window advances by one row.
            window.popleft()
In [17]:
# Load the iiLocus table; the cells below read its Species, Seqid and
# iiLocusLen columns.
df = pandas.read_csv('ilens.tsv', sep='\t')
In [20]:
# Histogram of log10(aggregate length + 1) over windows of k consecutive
# iiLoci in the 'Pdom' rows, for k = 2..12.
pdom = df.loc[df.Species == 'Pdom']
for k in range(2, 13):
    # The generator defined in this notebook is len_consec_iiloci; the old
    # name n_adj_iiloci is not defined and raises NameError on a fresh kernel.
    agglens = [numpy.log10(length + 1)
               for length in len_consec_iiloci(pdom, n=k)]
    plt.hist(agglens, bins=40)
    # NOTE(review): indentation was lost in the export; show() is assumed to
    # sit inside the loop so that each k gets its own figure -- confirm.
    plt.show()
In [21]:
# Histogram of log10(aggregate length + 1) over windows of k consecutive
# iiLoci in the 'Amel' rows, for k = 2..12.
amel = df.loc[df.Species == 'Amel']
for k in range(2, 13):
    # The generator defined in this notebook is len_consec_iiloci; the old
    # name n_adj_iiloci is not defined and raises NameError on a fresh kernel.
    agglens = [numpy.log10(length + 1)
               for length in len_consec_iiloci(amel, n=k)]
    plt.hist(agglens, bins=40)
    # NOTE(review): indentation was lost in the export; show() is assumed to
    # sit inside the loop so that each k gets its own figure -- confirm.
    plt.show()
In [22]:
# Histogram of log10(aggregate length + 1) over windows of k consecutive
# iiLoci in the 'Ador' rows, for k = 2..12.
ador = df.loc[df.Species == 'Ador']
for k in range(2, 13):
    # The generator defined in this notebook is len_consec_iiloci; the old
    # name n_adj_iiloci is not defined and raises NameError on a fresh kernel.
    agglens = [numpy.log10(length + 1)
               for length in len_consec_iiloci(ador, n=k)]
    plt.hist(agglens, bins=40)
    # NOTE(review): indentation was lost in the export; show() is assumed to
    # sit inside the loop so that each k gets its own figure -- confirm.
    plt.show()
In [ ]: