import pysam
import collections
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from intervalNode import IntervalNode
def meansd(frq):
    """
    Calculate the weighted mean and standard deviation from a frequency table.

    Arguments:
        frq: a dictionary of frequencies: key = insert size, value = frequency

    Returns:
        (mean, std) tuple of floats, where the mean and std are weighted by
        the frequency of each (absolute) insert size.
    """
    keys = sorted(frq.keys())  # sorted() works on both Py2 lists and Py3 views
    w = np.empty(len(keys), float)
    for i, k in enumerate(keys):
        w[i] = frq[k]
    x = np.abs(keys)
    xbar = np.average(x, weights=w)
    # Weighted standard deviation: sqrt of the weighted mean squared deviation.
    # (The previous formula averaged sqrt((x-xbar)^2) == |x-xbar|, i.e. a mean
    # absolute deviation, despite the docstring promising a std.)
    xsd = np.sqrt(np.average(np.power(x - xbar, 2), weights=w))
    return (xbar, xsd)
def MAD(frq):
    """
    Calculate the median and median absolute deviation (MAD) from a frequency table.

    Arguments:
        frq: a dictionary of frequencies: key = insert size, value = frequency

    Returns:
        (median, MAD) tuple. For an even number of observations the upper
        median (element at index n//2 of the sorted data) is used.
    """
    # expand the frequency table into the full list of observations
    all_lengths = []
    for k, v in frq.items():
        all_lengths.extend([k] * int(v))
    all_lengths = np.array(sorted(all_lengths))
    mid = len(all_lengths) // 2  # integer index of the (upper) median
    median = all_lengths[mid]
    # median of absolute deviations from the median
    residuals = sorted(abs(all_lengths - median))
    MAD = residuals[mid]
    return median, MAD
def find_intersections(tree, s, e):
    """
    Collect the mate-pair inserts that bridge a region of a contig.

    Arguments:
        tree: interval tree of start/end positions of mate pair inserts
        s: start position of the interval on the contig
        e: end position of the interval on the contig

    Returns a list of tree nodes whose start/end span the interval [s, e].
    """
    hits = []
    # IntervalNode.intersect invokes the callback once per overlapping node;
    # passing the bound append method collects them directly.
    tree.intersect(s, e, hits.append)
    return hits
def get_insertlengths(reads):
    """
    Compute interval sizes and strand flags for a set of mate pairs.

    Arguments:
        reads: a list of mate pairs as interval tree node objects

    Returns two numpy arrays: insert sizes (integers) and strand alignment
    flags (boolean; True when the pair aligned with correct orientation).
    """
    spans = [node.end - node.start for node in reads]   # insert lengths
    flags = [node.other[1] for node in reads]           # correct-orientation flags
    return np.array(spans), np.array(flags)
def probability_of_readlength(read_length, mu, sigma, pi1, L):
    """
    Calculate the probability that mate pair insert sizes are not anomalous.

    Arguments:
        read_length: a numpy array of insert sizes
        mu: mean insert size (float)
        sigma: insert size spread parameter (float)
        pi1: prior probability of being anomalous (float)
        L: length of contig to which the reads in read_length are aligned

    Returns:
        An array of posterior probabilities, one per insert size.

    Note: in the original source the `p_1` assignment had been displaced to
    after the return statement, leaving `p_1` unbound; the correct statement
    order is restored here.
    """
    # probability of an anomalous insert: uniform over the contig length
    p_0 = pi1 * (1 / float(L))
    # probability of drawing from a gaussian with mean mu and std sigma
    p_1 = (1 - pi1) * stats.norm.pdf(read_length, loc=mu, scale=sigma)
    # posterior probability of the non-anomalous component
    p_total = p_1 / (p_0 + p_1)
    return p_total
class aligned_assembly:
    """
    A set of mate pair or paired-end reads aligned to a scaffolded genome
    assembly, together with the insert-size and strand-orientation statistics
    used to flag putative misassemblies.
    """

    def __init__(self, bamfile, fastafile, min_size, threshold, step, window, minmapq, maxinsert, fraction, prior):
        """
        Initialiser function for the aligned assembly class.

        Arguments:
            bamfile: a sorted bam file of reads aligned to the scaffolded assembly
            fastafile: the scaffolded assembly in fasta format
            min_size: the minimum size contig to consider for analysis (integer)
            threshold: threshold in Z score below which a misassembly is called (float)
            step: step size to walk contigs (integer)
            window: width of window around each position from which mate pair insert sizes are fetched (integer)
            minmapq: the minimum mapq value for which paired reads are evaluated (float)
            maxinsert: the maximum insert size for which genome population statistics are calculated
            fraction: minimum fraction of read pairs with correct orientation to call support for the assembly
            prior: prior probability that an insert size is anomalous (float)
        """
        # user parameters
        self.minmapq = minmapq
        self.maxinsert = maxinsert
        self.threshold = threshold
        self.step = step
        self.window = window
        self.fraction = fraction
        self.prior = prior
        self.sam = pysam.Samfile(bamfile, "rb")
        self.fasta = pysam.Fastafile(fastafile)
        self.min_size = min_size

        # reads and reference metadata from the bam file
        self.all_reads = self.sam.fetch()
        self.references = self.sam.references
        self.lengths = self.sam.lengths

        # refdict: key=contig, val=contig length
        # read_stock: key=contig, val=aligned reads
        self.refdict = {}
        self.read_stock = {}
        for k, v in zip(self.references, self.lengths):
            self.refdict[k] = v
            self.read_stock[k] = self.get_reads(k, 0, v)

        # genome-wide insert size statistics
        self.sizes = self.get_read_size_distribution()
        self.isize_mean, self.isize_sd = meansd(self.sizes)
        self.isize_median, self.isize_MAD = MAD(self.sizes)
        print(self.isize_sd, self.isize_MAD)

    def get_read_size_distribution(self):
        """
        Calculate the global insert size distribution across the whole assembly.

        Returns a frequency table of insert sizes as a dictionary with
        key = insert size, value = frequency.
        """
        frq = collections.defaultdict(int)  # key=insert size, value=frequency
        found = {}
        for read in self.all_reads:
            # accept read based on mapq, contig alignment and insert size
            if (read.mapq > self.minmapq) and (read.rnext == read.tid) and (abs(read.tlen) < self.maxinsert):
                if read.qname in found and found[read.qname][0] == read.tid:
                    # second read of the pair seen: record the outer insert span
                    mate = found[read.qname]
                    isize = abs(max(mate[1] + mate[2] - read.pos, read.pos + read.rlen - mate[1]))
                    frq[isize] += 1
                else:
                    # first read of the pair: remember (tid, pos, rlen) until the mate arrives
                    found[read.qname] = (read.tid, read.pos, read.rlen)
        return frq

    def get_reads(self, ref, start, end):
        """
        Fetch reads aligned to contig `ref` between positions `start` and `end`.

        Returns a list of tuples, one per mate pair:
        (insert start position, insert end position, read name, correct_strands)
        where correct_strands is True when the two reads of the pair align to
        opposite strands. The insert is the gap between the end of one mate
        and the start of the other.

        Arguments:
            ref: the name of the contig from which aligned reads are to be fetched
            start: the position on the contig from which to start fetching aligned reads
            end: the position on the contig from which to end fetching aligned reads
        """
        reads = self.sam.fetch(ref, start, end)
        read_stock = []
        found = {}
        for read in reads:
            if (read.rnext == read.tid):
                if read.qname in found and found[read.qname][0] == read.tid:  # if mate maps to same contig
                    mate = found[read.qname]  # fetch mate
                    # correctly ordering mates: insert runs from the end of the
                    # leftmost read to the start of the rightmost read
                    if mate[1] > read.pos:
                        start_pos = read.pos + read.rlen
                        end_pos = mate[1]
                    else:
                        start_pos = mate[1] + mate[2]
                        end_pos = read.pos
                    # True when the two reads align to opposite strands
                    correct_strands = ((read.is_reverse) and not (read.mate_is_reverse)) or ((read.mate_is_reverse) and not (read.is_reverse))
                    read_stock.append((start_pos, end_pos, read.qname, correct_strands))
                else:
                    found[read.qname] = (read.tid, read.pos, read.rlen)  # haven't reached mate yet
        return read_stock

    def make_tree(self, ref):
        """
        Construct an interval tree from the mate pairs aligned to contig `ref`.

        Each node stores start (insert start), end (insert end) and other,
        a tuple of (mate pair name, correct-strand-orientation flag).
        Assumes at least one mate pair was collected for this contig.

        Arguments:
            ref: Reference ID of the contig for which the interval tree is built
        """
        bridges = self.read_stock[ref]
        # seed the tree with the first interval
        s1, e1, name, correct_strands = bridges[0]
        tree = IntervalNode(s1, e1, other=(name, correct_strands))
        # insert the rest of the intervals
        for (start, end, name, correct_strands) in bridges[1:]:
            tree = tree.insert(start, end, other=(name, correct_strands))
        return tree

    def get_read_mappings(self, ref):
        """
        Calculate, in bins of `self.step` reads, the fraction of read pairs
        within a contig that align correctly to opposite strands.

        Returns five arrays: the mean positions at which strand alignment was
        evaluated, the fraction correctly aligned, the fraction incorrectly
        aligned to the same strand, the unmapped fraction, and the fraction
        with some other alignment issue.

        Arguments:
            ref: the reference id of the contig to be evaluated
        """
        dump_val = self.step  # number of reads per bin
        positions = []
        # NOTE(review): despite its name, `same_strand` counts pairs whose
        # reads map to OPPOSITE strands (the correct orientation), and
        # `opp_strand` counts pairs on the same strand — see the conditions.
        same_strand = 0
        opp_strand = 0
        unmapped = 0
        other = 0
        # arrays of read mapping behaviour
        good_ratio = []
        unmapped_ratio = []
        bad_ratio = []
        other_ratio = []
        mini_pos = []
        reads = self.sam.fetch(reference=ref)
        # iterating in this manner works because the bam file is sorted
        for i, r in enumerate(reads):
            mini_pos.append(r.pos)
            if r.mate_is_unmapped:
                unmapped += 1
            elif ((r.is_reverse) and not (r.mate_is_reverse)) or ((r.mate_is_reverse) and not (r.is_reverse)):
                same_strand += 1
            elif ((r.is_reverse) and (r.mate_is_reverse)) or (not (r.mate_is_reverse) and not (r.is_reverse)):
                opp_strand += 1
            else:
                other += 1
            if (i + 1) % dump_val == 0:
                # flush this bin: convert counts to fractions and reset
                total = same_strand + opp_strand + unmapped + other
                good_ratio.append(float(same_strand) / total)
                bad_ratio.append(float(opp_strand) / total)
                unmapped_ratio.append(float(unmapped) / total)
                other_ratio.append(float(other) / total)
                same_strand = 0
                opp_strand = 0
                unmapped = 0
                other = 0
                positions.append(np.mean(mini_pos))
                mini_pos = []
        return np.array(positions), np.array(good_ratio), np.array(bad_ratio), np.array(unmapped_ratio), np.array(other_ratio)

    def get_mapping_anomalies(self):
        """
        Determine the frequency of strand mapping anomalies across the entire
        genome assembly.

        Calls get_read_mappings for each contig larger than min_size and returns:
        1) a dictionary with keys = contig reference IDs; values = list of
           positions and strand alignment ratios as described in get_read_mappings
        2) a dictionary of anomalies with keys = contig reference IDs; values =
           [positions where the correctly-aligned fraction < self.fraction,
            the corresponding fractions]
        """
        mapping_ratios = {}  # key=contig, val=list of arrays of mapping behaviours
        anomalies = {}
        for w, (ref, length) in enumerate(self.refdict.items()):
            if length > self.min_size:  # consider only big contigs
                positions, good_ratio, bad_ratio, unmapped_ratio, other_ratio = self.get_read_mappings(ref)
                map_criterion = good_ratio < self.fraction
                pos_anomalies = positions[map_criterion]
                map_anomalies = good_ratio[map_criterion]
                mapping_ratios[ref] = [positions, good_ratio, bad_ratio, unmapped_ratio, other_ratio]
                anomalies[ref] = [pos_anomalies, map_anomalies]
        return mapping_ratios, anomalies

    def get_size_anomalies(self):
        """
        Determine the frequency of insert size anomalies across the entire
        genome assembly.

        Calls probability_of_readlength for each contig larger than min_size
        and returns:
        1) a dictionary with keys = contig reference IDs; values = [positions,
           array of Z scores of the assembly-support statistic]
        2) a dictionary of anomalies with keys = contig reference IDs; values =
           [positions where the Z score < self.threshold, the corresponding
            Z scores]
        3) the interval tree of the last contig processed (kept for interface
           compatibility with existing callers)
        """
        anomalies = {}
        zscores = {}
        all_probabilities = []
        stock_probabilities = {}
        for w, (ref, length) in enumerate(self.refdict.items()):
            if length > self.min_size:
                tree = self.make_tree(ref)  # build tree from all reads aligning to a contig
                positions = np.arange(self.step, length - self.window, self.step)
                probabilities = []
                for pos in positions:
                    if pos % 10000 == 0:
                        print(pos)  # progress indicator
                    # fetch reads bridging a window around pos
                    bridges = np.array(find_intersections(tree, pos - self.window, pos + self.window))
                    bridge_lengths, strand_alignment = get_insertlengths(bridges)  # insert sizes and orientation
                    # NOTE(review): the MAD, not the sd, is passed as the spread parameter
                    prob_lengths = probability_of_readlength(bridge_lengths, self.isize_mean, self.isize_MAD, self.prior, length)
                    condition = strand_alignment == 1
                    D = np.sum(prob_lengths[condition])  # D is total assembly support at pos
                    probabilities.append(D)
                    all_probabilities.append(D)
                stock_probabilities[ref] = [positions, np.array(probabilities)]
        # genome-wide mean and spread of the support statistic
        p_mean = np.mean(np.array(all_probabilities))
        p_std = np.std(np.array(all_probabilities))
        for ref, [positions, probs] in stock_probabilities.items():
            zscore = (probs - p_mean) / p_std  # position Z score from genome-wide mean, std
            # anomalies have Z score < threshold (note: threshold should be negative)
            z_criterion = (zscore < self.threshold)
            z_anomalies = zscore[z_criterion]
            print(ref, z_anomalies)
            pos_anomalies = positions[z_criterion]
            zscores[ref] = [positions, zscore]
            anomalies[ref] = [pos_anomalies, z_anomalies]  # anomaly locations and scores
        return zscores, anomalies, tree

    def get_anomalies(self, outfile, trim, img_name=None):
        """
        Determine the frequency of anomalous mate pair behaviour across the
        entire genome assembly.

        Calls get_size_anomalies and get_mapping_anomalies for each contig
        larger than min_size; writes a file listing, for each contig, positions
        and their assembly-support scores; optionally plots support per contig.

        Arguments:
            outfile: open file handle to which Z scores are written
            trim: clustering distance — anomalies closer than this are merged
                  into a single break point (integer)

        Keyword Arguments:
            img_name: name (including filepath, excluding filetype) under which
                      to save plots of alignment quality; None disables plotting

        Returns a dictionary: key = contig reference ID, value = list of
        positions within that contig where the contig should be broken.
        """
        print("Anomaly detection")
        # get anomaly positions
        zscores, size_anomalies, tree = self.get_size_anomalies()
        map_ratios, map_anomalies = self.get_mapping_anomalies()
        break_points = {}

        for w, (ref, [positions, probs]) in enumerate(zscores.items()):
            # write all Z scores to a csv file
            print("Writing Z scores to file (%s)" % ref)
            for pos, prob in zip(positions, probs):
                outfile.write("%s %s %s\n" % (ref, pos, prob))

            z_pos, z_anomalies = size_anomalies[ref]
            print("z_anomalies:", z_anomalies)
            print(ref, z_pos)
            map_positions, good_ratio, bad_ratio, unmapped_ratio, other_ratio = map_ratios[ref]
            pos_anomalies, map_anom = map_anomalies[ref]
            anomaly_positions = sorted(z_pos.tolist())
            print(ref, anomaly_positions)

            # cluster nearby anomalies; each cluster yields one break position
            break_points[ref] = []
            if len(anomaly_positions) != 0:
                current = []
                for p in range(len(anomaly_positions)):
                    anom_pos = anomaly_positions[p]
                    if current == []:
                        current.append(anom_pos)
                    else:
                        if anom_pos - current[-1] <= trim:
                            # close to the previous anomaly: same cluster
                            current.append(anom_pos)
                        else:
                            # gap larger than trim: flush the cluster as one
                            # break point and start a new cluster
                            break_points[ref].append(np.mean(current))
                            current = [anom_pos]
                if current != []:
                    break_points[ref].append(np.mean(current))
            print(ref, break_points[ref])

            # plot z scores and anomalies for this contig
            fig, ax1 = plt.subplots()
            plt.subplots_adjust(bottom=0.15)
            ax1.set_xlim([0, max(map_positions)])
            ax1.set_xlabel("Position", size=24)
            ax1.set_ylabel("Assembly Support", size=24)
            plt.tick_params(axis='both', which='major', labelsize=20)
            lns1 = ax1.plot(positions, probs, c="k", label="Support")
            anomalies_to_plot = sorted(break_points[ref])
            anomalies_y = [-10] * len(anomalies_to_plot)
            ax1.scatter(anomalies_to_plot, anomalies_y, c="r", marker="o")
            print("anomalies", anomalies_to_plot)
            if img_name is not None:
                # if a name was given, save the image as .pdf and .png
                name = img_name + "_%s.pdf" % ref
                plt.savefig(name)
                name = img_name + "_%s.png" % ref
                plt.savefig(name)
            plt.cla()
        return break_points

    def breakContigs_double(self, outfile, breakpoints, trim):
        """
        Break contigs at positions identified as assembly errors and write a
        new fasta file containing all contigs (both altered and unaltered).

        Makes a two-point break at each identified misassembly position,
        splitting `trim` bases upstream and downstream of the misassembly and
        excluding the misassembled region.

        Arguments:
            outfile: open file handle for the new fasta file
            breakpoints: dictionary of misassemblies. key = contig reference ID,
                         value = list of misassembly positions within the contig
            trim: distance, in bases, to trim from each edge of a breakpoint
                  to remove the misassembly (integer)
        """
        for k, v in breakpoints.items():
            print(k, v)
        newcontigs = []
        for contig, length in self.refdict.items():
            # fetch() is the documented pysam FastaFile API; the original
            # indexed the Fastafile object directly, which is unsupported.
            dna = self.fasta.fetch(contig)  # sequence of contig
            if contig in breakpoints:
                splits = breakpoints[contig]
                splits.sort()
                prev = 0
                for s in splits:  # iterate through breakpoints
                    print(s)
                    print("breaking contig")
                    # only break when both resulting pieces exceed trim
                    if (s - prev > trim) and ((length - s) > trim):
                        print("breaking here:", s)
                        newcontigs.append(dna[int(prev):int(s - trim)])  # trim and append section before break
                        prev = s + trim  # trim other end of break
                    else:
                        print("Too small!")
                newcontigs.append(dna[int(prev):])  # remainder after the last accepted break
            else:
                newcontigs.append(dna)
        # write new contigs to file, longest first
        newcontigs.sort(key=len, reverse=True)
        print("Writing new fasta...")
        for count, dna in enumerate(newcontigs):
            name = ">CONTIG_%d_length_%d" % (count, len(dna))
            print(name)
            outfile.write(name)
            outfile.write("\n")
            outfile.write(dna)
            outfile.write("\n")
def main():
    """
    Command-line entry point: parse arguments, build an aligned_assembly from
    the given bam/fasta pair, detect misassemblies, and write a corrected
    fasta file with contigs broken at identified anomalies.
    """
    # read command line arguments
    import argparse
    parser = argparse.ArgumentParser(description='Routine to identify and correct large-scale misassemblies in de novo assemblies')
    parser.add_argument('bam', metavar='bam', type=str, help='bam')
    parser.add_argument('fasta', metavar='fasta', type=str, help='scaffold fasta')
    parser.add_argument('outfile', metavar='outfile', type=str, help='Output file name')
    parser.add_argument('newfasta', metavar='newfasta', type=str, help='Fasta file for new contigs, including filepath')
    parser.add_argument('-min_size', metavar='min_size', type=int, default=10000, help='Minimum contig size to analyse')
    parser.add_argument('-img_name', metavar='img_name', type=str, default=None, help='Name under which to save (optional) graphs of alignment quality. Default value: None (no graphs produced)')
    parser.add_argument('-trim', metavar='trim', type=int, default=5000, help='Number of bases to trim from each side of an identified misassembly. Default value: 5000')
    parser.add_argument('-T', metavar='T', type=float, default=-4.0, help='Threshold in Z score below which a misassembly is called. Default value: -4.0')
    parser.add_argument('-step_size', metavar='step_size', type=int, default=1000, help='Step-size in bases to traverse contigs. Default value: 1000')
    parser.add_argument('-window', metavar='window', type=int, default=200, help='Window size across which bridging mate pairs are evaluated. Default value: 200')
    parser.add_argument('-minmapq', metavar='minmapq', type=int, default=40, help='Minimum MapQ value, above which a read pair is included in calculating population statistics. Default value: 40')
    parser.add_argument('-maxinsert', metavar='maxinsert', type=int, default=30000, help='Maximum insert size, below which a read pair is included in calculating population statistics. Default value: 30000')
    # BUG FIX: fraction is a ratio in [0, 1] — the original declared type=int,
    # which rejects or truncates any user-supplied value such as 0.5
    parser.add_argument('-fraction', metavar='fraction', type=float, default=0.75, help='Minimum fraction of read pairs with correct orientation to call support for the assembly. Default value: 0.75')
    parser.add_argument('-prior', metavar='prior', type=float, default=0.01, help='Prior probablility that the insert size is anomalous. Default value: 0.01')
    args = parser.parse_args()

    # make assembly object
    f = aligned_assembly(args.bam, args.fasta, args.min_size, args.T, args.step_size, args.window, args.minmapq, args.maxinsert, args.fraction, args.prior)
    print("This is ok")

    # find anomalies
    with open(args.outfile, "w") as of:
        bps = f.get_anomalies(of, args.trim, args.img_name)

    # break contigs at identified anomalies
    with open(args.newfasta, "w") as outfasta:
        f.breakContigs_double(outfasta, bps, args.trim)


if __name__ == "__main__":
    main()