In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import seaborn
import numpy
from collections import deque
In [2]:
def len_consec_iiloci(data, n=2):
    """
    Compute combined lengths for each set of n consecutive iiLoci.

    Slide a window of n iiLoci over the rows of `data`, in row order, and
    yield the summed `iiLocusLen` of every full window. The window is emptied
    whenever the `Seqid` value changes from one row to the next, so no window
    ever spans two sequences.

    `data` is a pandas.DataFrame object, each row corresponding to a single
    iiLocus. Only its `Seqid` and `iiLocusLen` columns are read.

    `n` is the number of consecutive iiLoci per window and must be at least 1.
    Because this is a generator, the ValueError for an invalid `n` is raised
    when iteration starts, not when the function is called.
    """
    if n < 1:
        # Without this guard the window could never reach length n: every row
        # would be buffered and nothing would ever be yielded.
        raise ValueError('n must be at least 1, got %r' % (n,))

    window = deque()
    current_seqid = None
    # Walk the two needed columns directly instead of using iterrows(), which
    # constructs a full Series for every row.
    for seqid, length in zip(data['Seqid'], data['iiLocusLen']):
        if seqid != current_seqid:
            # New sequence: discard the partial window from the previous one.
            window.clear()
            current_seqid = seqid
        window.append(length)
        if len(window) == n:
            yield sum(window)
            # Drop the oldest length so the next row completes the next window.
            window.popleft()
In [17]:
# Load the tab-separated table of iiLocus lengths; the cells below read its
# Species column, and len_consec_iiloci reads Seqid and iiLocusLen.
df = pandas.read_csv('ilens.tsv', sep='\t')
In [20]:
# Histograms of log10(aggregate length + 1) for windows of 2..12 consecutive
# iiLoci in the rows whose Species is 'Pdom'.
pdom = df.loc[(df.Species == 'Pdom')]
for k in range(2, 13):
    # len_consec_iiloci is the generator defined in this notebook; the
    # previous name used here (n_adj_iiloci) is not defined anywhere.
    agglens = [numpy.log10(length+1) for length in len_consec_iiloci(pdom, n=k)]
    plt.hist(agglens, bins=40)
    plt.title('Pdom: %d consecutive iiLoci' % k)
    plt.xlabel('log10(aggregate iiLocus length + 1)')
    plt.ylabel('Count')
    plt.show()
In [21]:
# Histograms of log10(aggregate length + 1) for windows of 2..12 consecutive
# iiLoci in the rows whose Species is 'Amel'.
amel = df.loc[(df.Species == 'Amel')]
for k in range(2, 13):
    # len_consec_iiloci is the generator defined in this notebook; the
    # previous name used here (n_adj_iiloci) is not defined anywhere.
    agglens = [numpy.log10(length+1) for length in len_consec_iiloci(amel, n=k)]
    plt.hist(agglens, bins=40)
    plt.title('Amel: %d consecutive iiLoci' % k)
    plt.xlabel('log10(aggregate iiLocus length + 1)')
    plt.ylabel('Count')
    plt.show()
In [22]:
# Histograms of log10(aggregate length + 1) for windows of 2..12 consecutive
# iiLoci in the rows whose Species is 'Ador'.
ador = df.loc[(df.Species == 'Ador')]
for k in range(2, 13):
    # len_consec_iiloci is the generator defined in this notebook; the
    # previous name used here (n_adj_iiloci) is not defined anywhere.
    agglens = [numpy.log10(length+1) for length in len_consec_iiloci(ador, n=k)]
    plt.hist(agglens, bins=40)
    plt.title('Ador: %d consecutive iiLoci' % k)
    plt.xlabel('log10(aggregate iiLocus length + 1)')
    plt.ylabel('Count')
    plt.show()
In [ ]:
 

Comments