Source code for src.analysis.internalAnalysis

import pandas as pd
import numpy as np
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import minimize

__author__ =  "Gao, Yuqian <Yuqian.Gao@pnnl.gov>"
__maintainer__ = "Anubhav <anubhav@pnnl.gov>"

[docs]class downStreamAnalysis:

    def __init__(self, parent_folder,):
        self.parent_folder = parent_folder


[docs]    def findproteinname(s):
        '''Get Protein type
        :param s:
        :return:
        '''
        p1 = re.compile(r'^Contaminant_')
        p2 = re.compile(r'^XXX_Contaminant_')
        p3 = re.compile(r'^XXX_')
        if p1.search(s) is not None:
            return 'None'
        elif p2.search(s) is not None:
            return 'None'
        elif p3.search(s) is not None:
            return 'Reversed'
        else: # the rest
            return 'Forward'


[docs]    def cleansequence(s):
        '''
        clean peptide sequence is the sequence without prefix and postfix but with oxidation
        :param s:
        :return:
        '''
        p = re.compile(r'\.(?P<cleanseq>[A-Z\*@#]+)\.')
        m = p.search(s)
        return m.group('cleanseq')

[docs]    def process_data(self):
        '''

        :return:
        '''
        # import table
        data = pd.read_table(self.parent_folder +'resultants_df.txt')[[
                           'JobNum', 'Dataset_x', 'Dataset_y', 'Scan', 'Protein', 'Peptide', \
                           'NTT', 'DelM', 'DelM_PPM', 'StatMomentsArea', 'PeakMaxIntensity', \
                           'MSGFDB_SpecEValue', 'EValue', 'QValue', 'PepQValue']]

        # Add protein type, clean peptide sequence, specID
        data.rename(columns={'Dataset_x': 'Dataset', 'Dataset_y': 'Dataset_ID', 'JobNum': 'Job'}, inplace=True)
        data['Protein_Type'] = data['Protein'].apply(self.findproteinname)
        data['Clean Peptide Sequence'] = data['Peptide'].apply(self.cleansequence)
        data["SpecID"] = data.apply(lambda row: str(row["Dataset_ID"]) + "_" + str(row["Scan"]), axis=1)

        # Save protein to peptide mapping of forward peptide identification
        df_mapping = data[data['Protein_Type'] == 'Forward'] [['Protein', 'Peptide', 'Clean Peptide Sequence']].copy()
        df_mapping.drop_duplicates(inplace=True)

        # Calculate the redundancy by clean peptide sequence and protein_type
        df_new = df_mapping[['Protein', 'Clean Peptide Sequence']].copy()
        df_new.drop_duplicates(inplace=True)
        df_redundancy = df_new.groupby(['Clean Peptide Sequence']).count()
        df_redundancy.reset_index(inplace=True)
        df_redundancy.rename(columns={'Protein': 'Clean Peptide Sequence Redundancy'}, inplace=True)

        # merge redundancy to protein peptide map
        df_mapping = df_mapping.merge(df_redundancy, how='left', on=['Clean Peptide Sequence'])
        del df_redundancy

        # save to file
        df_mapping.to_csv("Results/protein_peptide_map.csv", index=False)
        del df_mapping

        # save the map of dataset id, dataset name and job id
        df_ids = data[['Dataset', 'Job', 'Dataset_ID']].copy()
        df_ids.drop_duplicates(inplace=True)
        df_ids.to_csv("Results/dataset_job_map.csv", index=False)
        del df_ids

        # Remove 'Protein', 'dataset_name', 'job_id'
        del data['Protein']
        del data['Dataset']
        del data['Job']
        data.drop_duplicates(inplace=True)  # this is so important to remove any duplicated rows

        # Make a table for just forward peptides
        data_cleaned_forward = data[data['Protein_Type'] == 'Forward'].copy()
        del data_cleaned_forward['Protein_Type']
        data_cleaned_forward.drop_duplicates(inplace=True)

        # Make a table for just reversed peptides
        data_cleaned_reversed = data[data['Protein_Type'] == 'Reversed'].copy()
        del data_cleaned_reversed['Protein_Type']
        data_cleaned_reversed.drop_duplicates(inplace=True)

        # remove data file
        del data

        # Dataset list
        dataset_list = data_cleaned_forward['Dataset_ID'].unique().tolist()

        # Export forward and reverse for individual dataset
        for i in range(len(dataset_list)):
            df_ff = data_cleaned_forward[data_cleaned_forward['Dataset_ID'] == dataset_list[i]].copy()
            df_ff.to_csv("Results/Data/" + str(dataset_list[i]) + "_forward_peptide_identification.csv", \
                         index=False)
            df_rr = data_cleaned_reversed[data_cleaned_reversed['Dataset_ID'] == dataset_list[i]].copy()
            df_rr.to_csv("Results/Data/" + str(dataset_list[i]) + "_reversed_peptide_identification.csv", \
                         index=False)
            del df_ff
            del df_rr

        del data_cleaned_forward
        del data_cleaned_reversed


# optimize the filtering criteria and filter the data
[docs]def parameter_optimization(dataset_ID):
    '''

    :param dataset_ID:
    :return:
    '''
    data_f = pd.read_csv("Results/Data/" + str(dataset_ID) + \
                         "_forward_peptide_identification.csv")
    data_r = pd.read_csv("Results/Data/" + str(dataset_ID) + \
                         "_reversed_peptide_identification.csv")

    # Fit a 1-D data of DelM_PPM and get the peak ppm_shift
    ppm_shift_df = data_f[(data_f["DelM_PPM"] < 10) & (data_f["DelM_PPM"] > -10)].copy()
    ax = sns.distplot(ppm_shift_df["DelM_PPM"])
    ppm_shift = ax.get_lines()[0].get_xdata()[np.argmax(ax.get_lines()[0].get_ydata())]

    def PepFDR(Params):
        '''

        :param Params:
        :return:
        '''
        # function to minimize
        delppm1, delppm2, log10_specprob = Params  # use log10 value so that it is managable for the computer
        ### The FDR function ###
        pep_total = data_r['Clean Peptide Sequence'].unique().size
        df_r = data_r[(data_r["DelM_PPM"] < ppm_shift + delppm1) & (data_r["DelM_PPM"] > ppm_shift - delppm2) & \
                      (data_r["MSGFDB_SpecEValue"] < 10 ** log10_specprob)].copy()
        df_f = data_f[(data_f["DelM_PPM"] < ppm_shift + delppm1) & (data_f["DelM_PPM"] > ppm_shift - delppm2) & \
                      (data_f["MSGFDB_SpecEValue"] < 10 ** log10_specprob)].copy()
        f_pep = df_f['Clean Peptide Sequence'].unique().size
        r_pep = df_r['Clean Peptide Sequence'].unique().size
        ### fdr_pep ###
        if (f_pep == 0) & (r_pep == 0):
            fdr_pep = 1
        else:
            fdr_pep = r_pep / (f_pep + r_pep)
        return 1 / (0.050001 - fdr_pep) * (-f_pep)
        # Note:
        # in this function, if fdr_pep>0.01, it returns +,
        # if fdr_pep < 0.01, it returns -, the closest to 0.01, the smaller the return

    # Add constraint
    def constraint1(Params):
        delppm1, delppm2, log10_specprob = Params  # use log10 value so that it is managable for the computer
        return 20 - delppm1 - delppm2

    def constraint2(Params):
        delppm1, delppm2, log10_specprob = Params  # use log10 value so that it is managable for the computer
        return delppm1 - 5

    def constraint3(Params):
        delppm1, delppm2, log10_specprob = Params  # use log10 value so that it is managable for the computer
        return delppm2 - 5

    con1 = {'type': 'ineq', 'fun': constraint1}
    con2 = {'type': 'ineq', 'fun': constraint2}
    con3 = {'type': 'ineq', 'fun': constraint3}

    # Miminize PepFDR
    initial_guess = [min(10, max(data_f["DelM_PPM"]) - ppm_shift), \
                     min(10, ppm_shift - min(data_f["DelM_PPM"])), \
                     -15]
    # print(dataset_ID, initial_guess)
    result = minimize(PepFDR, initial_guess, method='COBYLA', constraints=[con1, con2, con3])
    if result.success:
        fitted_params = result.x
    else:
        n = 0
        while (result.success == False) & (n < 40):
            initial_guess = [initial_guess[0] + 0.2, initial_guess[1] + 0.2, initial_guess[2]]
            result = minimize(PepFDR, initial_guess, method='COBYLA', constraints=[con1, con2, con3])
            n = n + 1
        if result.success:
            fitted_params = result.x
        else:
            print("Failed optimization @ Dataset ID: %s, Database: %s" % (dataset_ID, DB))
            print(initial_guess)
            raise ValueError(result.message)

    ### plots ###
    plt.close('all')
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 15))
    # plot the DelM_PPM
    sns.distplot(data_f["DelM_PPM"], label='forward', ax=axes[0], bins=200)
    axes[0].axvline(x=ppm_shift, color='r', linestyle='--')
    axes[0].axvline(x=ppm_shift - fitted_params[1], color='g', linestyle='-')
    axes[0].axvline(x=ppm_shift + fitted_params[0], color='g', linestyle='-')
    sns.distplot(data_r["DelM_PPM"], label='reversed', ax=axes[0], bins=200)
    axes[0].set_xlabel('DelM_PPM')
    axes[0].set_ylabel('Density')
    axes[0].set_title(r'left bound: %.2f; right bound: %.2f' % \
                      (ppm_shift - fitted_params[1], ppm_shift + fitted_params[0]))
    # plot MSGFDB_SpecEValue
    sns.distplot(data_f['MSGFDB_SpecEValue'], label='forward', kde=False, ax=axes[1], bins=5000)
    sns.distplot(data_r['MSGFDB_SpecEValue'], label='reversed', kde=False, ax=axes[1], bins=5000)
    axes[1].axvline(x=10 ** fitted_params[2], color='g', linestyle='-')
    axes[1].set_title('MSGFDB_SpecEValue: %.2e' % (10 ** fitted_params[2]))
    axes[1].set_xlabel("log(MSGFDB_SpecEValue)")
    axes[1].set_ylabel('Density')
    axes[1].set_xscale('log')
    # save
    plt.tight_layout()
    plt.savefig("Results/Plots/" + str(dataset_ID) + ".jpg")
    plt.close('all')

    # #Filter the data
    df_f = data_f[
        (data_f["DelM_PPM"] < ppm_shift + fitted_params[0]) & (data_f["DelM_PPM"] > ppm_shift - fitted_params[1]) & \
        (data_f["MSGFDB_SpecEValue"] < 10 ** fitted_params[2])].copy()
    del data_f

    df_r = data_r[
        (data_r["DelM_PPM"] < ppm_shift + fitted_params[0]) & (data_r["DelM_PPM"] > ppm_shift - fitted_params[1]) & \
        (data_r["MSGFDB_SpecEValue"] < 10 ** fitted_params[2])].copy()
    del data_r

    # Calculate FDR
    f_spec = df_f['SpecID'].unique().size  # Modified on Apr23 to only count for unique dataset-scan
    r_spec = df_r['SpecID'].unique().size
    if (f_spec == 0) & (r_spec == 0):
        fdr_spec = 1
    else:
        fdr_spec = r_spec / (f_spec + r_spec)

    # Modified on Aug 02, 2019 to use Clean Peptide Sequence instead of peptide
    f_pep = df_f['Clean Peptide Sequence'].unique().size
    r_pep = df_r['Clean Peptide Sequence'].unique().size
    if (f_pep == 0) & (r_pep == 0):
        fdr_pep = 1
    else:
        fdr_pep = r_pep / (f_pep + r_pep)

    ######################################Dataset FDR_table################################################
    df_Metadata = pd.DataFrame({'PPM center': ppm_shift, 'PPM cutoff left': ppm_shift - fitted_params[1], \
                                'PPM cutoff right': ppm_shift + fitted_params[0], \
                                'SpecProb cutoff': 10 ** fitted_params[2], \
                                'Spectra forward': f_spec, 'Spectra reverse': r_spec, \
                                'Peptide forward': f_pep, 'Peptide reverse': r_pep, \
                                'PeptideFDR': [fdr_pep], 'SepctraFDR': [fdr_spec], 'Dataset_ID': dataset_ID, \
                                'Initial ppm cutoff left': ppm_shift - initial_guess[1], \
                                'Initial ppm cutoff right': ppm_shift + initial_guess[0], \
                                'Initial SpecProb cutoff': 10 ** initial_guess[2]})

    ######################################SpecID table################################################
    df_SpecID_f = df_f[['SpecID', 'Dataset_ID', 'Scan', 'Peptide', 'Clean Peptide Sequence', 'MSGFDB_SpecEValue', \
                        'StatMomentsArea', 'DelM_PPM']].copy()
    df_SpecID_f.drop_duplicates(inplace=True)

    df_SpecID_r = df_r[['SpecID', 'Dataset_ID', 'Scan', 'Peptide', 'Clean Peptide Sequence', 'MSGFDB_SpecEValue', \
                        'StatMomentsArea', 'DelM_PPM']].copy()
    df_SpecID_r.drop_duplicates(inplace=True)

    ######################################Spetra Count table##########################################
    df_spec = df_f[['Clean Peptide Sequence', 'SpecID']].copy()
    df_spec.drop_duplicates(inplace=True)
    df_SpectraCount = df_spec.groupby(['Clean Peptide Sequence']).size().rename('Peptide Spectra Count').reset_index()
    del df_spec
    df_SpectraCount['Dataset_ID'] = dataset_ID

    ######################################Spetra Count table##########################################
    df_Intensity = df_f[['Clean Peptide Sequence', 'StatMomentsArea']].groupby(['Clean Peptide Sequence']).max()
    df_Intensity.rename(columns={'StatMomentsArea': 'Peptide Peak Area'}, inplace=True)
    df_Intensity.reset_index(inplace=True)
    df_Intensity['Dataset_ID'] = dataset_ID

    return df_Metadata, df_SpecID_f, df_SpecID_r, df_SpectraCount, df_Intensity