Source code for src.analysis.internalAnalysis

import pandas as pd
import numpy as np
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import minimize

__author__ =  "Gao, Yuqian <Yuqian.Gao@pnnl.gov>"
__maintainer__ = "Anubhav <anubhav@pnnl.gov>"

[docs]class downStreamAnalysis: def __init__(self, parent_folder,): self.parent_folder = parent_folder
[docs] def findproteinname(s): '''Get Protein type :param s: :return: ''' p1 = re.compile(r'^Contaminant_') p2 = re.compile(r'^XXX_Contaminant_') p3 = re.compile(r'^XXX_') if p1.search(s) is not None: return 'None' elif p2.search(s) is not None: return 'None' elif p3.search(s) is not None: return 'Reversed' else: # the rest return 'Forward'
[docs] def cleansequence(s): ''' clean peptide sequence is the sequence without prefix and postfix but with oxidation :param s: :return: ''' p = re.compile(r'\.(?P<cleanseq>[A-Z\*@#]+)\.') m = p.search(s) return m.group('cleanseq')
[docs] def process_data(self): ''' :return: ''' # import table data = pd.read_table(self.parent_folder +'resultants_df.txt')[[ 'JobNum', 'Dataset_x', 'Dataset_y', 'Scan', 'Protein', 'Peptide', \ 'NTT', 'DelM', 'DelM_PPM', 'StatMomentsArea', 'PeakMaxIntensity', \ 'MSGFDB_SpecEValue', 'EValue', 'QValue', 'PepQValue']] # Add protein type, clean peptide sequence, specID data.rename(columns={'Dataset_x': 'Dataset', 'Dataset_y': 'Dataset_ID', 'JobNum': 'Job'}, inplace=True) data['Protein_Type'] = data['Protein'].apply(self.findproteinname) data['Clean Peptide Sequence'] = data['Peptide'].apply(self.cleansequence) data["SpecID"] = data.apply(lambda row: str(row["Dataset_ID"]) + "_" + str(row["Scan"]), axis=1) # Save protein to peptide mapping of forward peptide identification df_mapping = data[data['Protein_Type'] == 'Forward'] [['Protein', 'Peptide', 'Clean Peptide Sequence']].copy() df_mapping.drop_duplicates(inplace=True) # Calculate the redundancy by clean peptide sequence and protein_type df_new = df_mapping[['Protein', 'Clean Peptide Sequence']].copy() df_new.drop_duplicates(inplace=True) df_redundancy = df_new.groupby(['Clean Peptide Sequence']).count() df_redundancy.reset_index(inplace=True) df_redundancy.rename(columns={'Protein': 'Clean Peptide Sequence Redundancy'}, inplace=True) # merge redundancy to protein peptide map df_mapping = df_mapping.merge(df_redundancy, how='left', on=['Clean Peptide Sequence']) del df_redundancy # save to file df_mapping.to_csv("Results/protein_peptide_map.csv", index=False) del df_mapping # save the map of dataset id, dataset name and job id df_ids = data[['Dataset', 'Job', 'Dataset_ID']].copy() df_ids.drop_duplicates(inplace=True) df_ids.to_csv("Results/dataset_job_map.csv", index=False) del df_ids # Remove 'Protein', 'dataset_name', 'job_id' del data['Protein'] del data['Dataset'] del data['Job'] data.drop_duplicates(inplace=True) # this is so important to remove any duplicated rows # Make a table for just forward peptides data_cleaned_forward = data[data['Protein_Type'] == 'Forward'].copy() del data_cleaned_forward['Protein_Type'] data_cleaned_forward.drop_duplicates(inplace=True) # Make a table for just reversed peptides data_cleaned_reversed = data[data['Protein_Type'] == 'Reversed'].copy() del data_cleaned_reversed['Protein_Type'] data_cleaned_reversed.drop_duplicates(inplace=True) # remove data file del data # Dataset list dataset_list = data_cleaned_forward['Dataset_ID'].unique().tolist() # Export forward and reverse for individual dataset for i in range(len(dataset_list)): df_ff = data_cleaned_forward[data_cleaned_forward['Dataset_ID'] == dataset_list[i]].copy() df_ff.to_csv("Results/Data/" + str(dataset_list[i]) + "_forward_peptide_identification.csv", \ index=False) df_rr = data_cleaned_reversed[data_cleaned_reversed['Dataset_ID'] == dataset_list[i]].copy() df_rr.to_csv("Results/Data/" + str(dataset_list[i]) + "_reversed_peptide_identification.csv", \ index=False) del df_ff del df_rr del data_cleaned_forward del data_cleaned_reversed
# optimize the filtering criteria and filter the data
[docs]def parameter_optimization(dataset_ID): ''' :param dataset_ID: :return: ''' data_f = pd.read_csv("Results/Data/" + str(dataset_ID) + \ "_forward_peptide_identification.csv") data_r = pd.read_csv("Results/Data/" + str(dataset_ID) + \ "_reversed_peptide_identification.csv") # Fit a 1-D data of DelM_PPM and get the peak ppm_shift ppm_shift_df = data_f[(data_f["DelM_PPM"] < 10) & (data_f["DelM_PPM"] > -10)].copy() ax = sns.distplot(ppm_shift_df["DelM_PPM"]) ppm_shift = ax.get_lines()[0].get_xdata()[np.argmax(ax.get_lines()[0].get_ydata())] def PepFDR(Params): ''' :param Params: :return: ''' # function to minimize delppm1, delppm2, log10_specprob = Params # use log10 value so that it is managable for the computer ### The FDR function ### pep_total = data_r['Clean Peptide Sequence'].unique().size df_r = data_r[(data_r["DelM_PPM"] < ppm_shift + delppm1) & (data_r["DelM_PPM"] > ppm_shift - delppm2) & \ (data_r["MSGFDB_SpecEValue"] < 10 ** log10_specprob)].copy() df_f = data_f[(data_f["DelM_PPM"] < ppm_shift + delppm1) & (data_f["DelM_PPM"] > ppm_shift - delppm2) & \ (data_f["MSGFDB_SpecEValue"] < 10 ** log10_specprob)].copy() f_pep = df_f['Clean Peptide Sequence'].unique().size r_pep = df_r['Clean Peptide Sequence'].unique().size ### fdr_pep ### if (f_pep == 0) & (r_pep == 0): fdr_pep = 1 else: fdr_pep = r_pep / (f_pep + r_pep) return 1 / (0.050001 - fdr_pep) * (-f_pep) # Note: # in this function, if fdr_pep>0.01, it returns +, # if fdr_pep < 0.01, it returns -, the closest to 0.01, the smaller the return # Add constraint def constraint1(Params): delppm1, delppm2, log10_specprob = Params # use log10 value so that it is managable for the computer return 20 - delppm1 - delppm2 def constraint2(Params): delppm1, delppm2, log10_specprob = Params # use log10 value so that it is managable for the computer return delppm1 - 5 def constraint3(Params): delppm1, delppm2, log10_specprob = Params # use log10 value so that it is managable for the computer return delppm2 - 5 con1 = {'type': 'ineq', 'fun': constraint1} con2 = {'type': 'ineq', 'fun': constraint2} con3 = {'type': 'ineq', 'fun': constraint3} # Miminize PepFDR initial_guess = [min(10, max(data_f["DelM_PPM"]) - ppm_shift), \ min(10, ppm_shift - min(data_f["DelM_PPM"])), \ -15] # print(dataset_ID, initial_guess) result = minimize(PepFDR, initial_guess, method='COBYLA', constraints=[con1, con2, con3]) if result.success: fitted_params = result.x else: n = 0 while (result.success == False) & (n < 40): initial_guess = [initial_guess[0] + 0.2, initial_guess[1] + 0.2, initial_guess[2]] result = minimize(PepFDR, initial_guess, method='COBYLA', constraints=[con1, con2, con3]) n = n + 1 if result.success: fitted_params = result.x else: print("Failed optimization @ Dataset ID: %s, Database: %s" % (dataset_ID, DB)) print(initial_guess) raise ValueError(result.message) ### plots ### plt.close('all') fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 15)) # plot the DelM_PPM sns.distplot(data_f["DelM_PPM"], label='forward', ax=axes[0], bins=200) axes[0].axvline(x=ppm_shift, color='r', linestyle='--') axes[0].axvline(x=ppm_shift - fitted_params[1], color='g', linestyle='-') axes[0].axvline(x=ppm_shift + fitted_params[0], color='g', linestyle='-') sns.distplot(data_r["DelM_PPM"], label='reversed', ax=axes[0], bins=200) axes[0].set_xlabel('DelM_PPM') axes[0].set_ylabel('Density') axes[0].set_title(r'left bound: %.2f; right bound: %.2f' % \ (ppm_shift - fitted_params[1], ppm_shift + fitted_params[0])) # plot MSGFDB_SpecEValue sns.distplot(data_f['MSGFDB_SpecEValue'], label='forward', kde=False, ax=axes[1], bins=5000) sns.distplot(data_r['MSGFDB_SpecEValue'], label='reversed', kde=False, ax=axes[1], bins=5000) axes[1].axvline(x=10 ** fitted_params[2], color='g', linestyle='-') axes[1].set_title('MSGFDB_SpecEValue: %.2e' % (10 ** fitted_params[2])) axes[1].set_xlabel("log(MSGFDB_SpecEValue)") axes[1].set_ylabel('Density') axes[1].set_xscale('log') # save plt.tight_layout() plt.savefig("Results/Plots/" + str(dataset_ID) + ".jpg") plt.close('all') # #Filter the data df_f = data_f[ (data_f["DelM_PPM"] < ppm_shift + fitted_params[0]) & (data_f["DelM_PPM"] > ppm_shift - fitted_params[1]) & \ (data_f["MSGFDB_SpecEValue"] < 10 ** fitted_params[2])].copy() del data_f df_r = data_r[ (data_r["DelM_PPM"] < ppm_shift + fitted_params[0]) & (data_r["DelM_PPM"] > ppm_shift - fitted_params[1]) & \ (data_r["MSGFDB_SpecEValue"] < 10 ** fitted_params[2])].copy() del data_r # Calculate FDR f_spec = df_f['SpecID'].unique().size # Modified on Apr23 to only count for unique dataset-scan r_spec = df_r['SpecID'].unique().size if (f_spec == 0) & (r_spec == 0): fdr_spec = 1 else: fdr_spec = r_spec / (f_spec + r_spec) # Modified on Aug 02, 2019 to use Clean Peptide Sequence instead of peptide f_pep = df_f['Clean Peptide Sequence'].unique().size r_pep = df_r['Clean Peptide Sequence'].unique().size if (f_pep == 0) & (r_pep == 0): fdr_pep = 1 else: fdr_pep = r_pep / (f_pep + r_pep) ######################################Dataset FDR_table################################################ df_Metadata = pd.DataFrame({'PPM center': ppm_shift, 'PPM cutoff left': ppm_shift - fitted_params[1], \ 'PPM cutoff right': ppm_shift + fitted_params[0], \ 'SpecProb cutoff': 10 ** fitted_params[2], \ 'Spectra forward': f_spec, 'Spectra reverse': r_spec, \ 'Peptide forward': f_pep, 'Peptide reverse': r_pep, \ 'PeptideFDR': [fdr_pep], 'SepctraFDR': [fdr_spec], 'Dataset_ID': dataset_ID, \ 'Initial ppm cutoff left': ppm_shift - initial_guess[1], \ 'Initial ppm cutoff right': ppm_shift + initial_guess[0], \ 'Initial SpecProb cutoff': 10 ** initial_guess[2]}) ######################################SpecID table################################################ df_SpecID_f = df_f[['SpecID', 'Dataset_ID', 'Scan', 'Peptide', 'Clean Peptide Sequence', 'MSGFDB_SpecEValue', \ 'StatMomentsArea', 'DelM_PPM']].copy() df_SpecID_f.drop_duplicates(inplace=True) df_SpecID_r = df_r[['SpecID', 'Dataset_ID', 'Scan', 'Peptide', 'Clean Peptide Sequence', 'MSGFDB_SpecEValue', \ 'StatMomentsArea', 'DelM_PPM']].copy() df_SpecID_r.drop_duplicates(inplace=True) ######################################Spetra Count table########################################## df_spec = df_f[['Clean Peptide Sequence', 'SpecID']].copy() df_spec.drop_duplicates(inplace=True) df_SpectraCount = df_spec.groupby(['Clean Peptide Sequence']).size().rename('Peptide Spectra Count').reset_index() del df_spec df_SpectraCount['Dataset_ID'] = dataset_ID ######################################Spetra Count table########################################## df_Intensity = df_f[['Clean Peptide Sequence', 'StatMomentsArea']].groupby(['Clean Peptide Sequence']).max() df_Intensity.rename(columns={'StatMomentsArea': 'Peptide Peak Area'}, inplace=True) df_Intensity.reset_index(inplace=True) df_Intensity['Dataset_ID'] = dataset_ID return df_Metadata, df_SpecID_f, df_SpecID_r, df_SpectraCount, df_Intensity