Source code for src.processing.DatasetsMerger
from src.processing.MSGFplusMerger import MSGFplusMerger
from src.processing.MASICmerger import MASICmerger
import os
import pandas as pd
import fnmatch
from pathlib import Path
class DatasetsMerger(MSGFplusMerger):
    '''Aggregate the per-dataset MSGF+ / MASIC results found under one folder.

    Intended usage (from the original notes):

    1. Run for a user input, i.e. a datapackage, a set of datasets or a set
       of MSGFJobNums.
    2. Create a crossTab object.

    NOTE(review): ``MSGFplusMerger.__init__`` is deliberately not invoked
    here (the original never called it); ``write_to_disk`` is presumably
    inherited from ``MSGFplusMerger`` -- confirm against that class.
    '''

    def __init__(self, folder=None, combineDatasets=None):
        '''
        :param folder: parent directory whose immediate sub-directories are
            the datasets to process.
        :param combineDatasets: when truthy, the per-dataset resultants are
            collected, concatenated and written out as one table.
        '''
        # Per-dataset MSGFjobs_MASIC_resultant objects (only filled when
        # combineDatasets is truthy).
        self.resultants = []
        self.parent_folder = folder
        # Concatenation of self.resultants; set by merge_all_jobs_in_UserInput.
        self.resultants_df = None
        self.crossTab = None
        # The results location mirrors the data location by textual
        # substitution. ``folder`` defaults to None, so guard the call rather
        # than failing with an AttributeError inside the constructor.
        # NOTE(review): str.replace swaps *every* "data" substring in the
        # path; kept unchanged because other mergers presumably derive their
        # result paths the same way.
        self.dataset_result_folder = (
            None if folder is None else folder.replace("data", "results")
        )
        self.combineDatasets = combineDatasets

    def merge_all_jobs_in_UserInput(self):
        '''
        1. Run the MSGF+ and MASIC mergers for each dataset sub-directory.
        2. Optionally merge all MSGFjobs_MASIC_resultant objects into one
           table and write it to the results folder.

        Nothing is recomputed when the results folder already exists; a
        notice is printed instead.

        :return: path of the results folder.
        :raises ValueError: if the merger was created without a folder.
        '''
        if self.parent_folder is None:
            raise ValueError("DatasetsMerger requires a data folder to merge.")

        # Guard clause: the "already ran" notice belongs only to the case
        # where results are present, not to a fresh run.
        if os.path.exists(self.dataset_result_folder):
            print("Already ran Pipeline, Merged jobs exists at @:{}! please delete them & rerun the pipeline!".format(self.dataset_result_folder))
            return self.dataset_result_folder

        # Dataset paths are built by string concatenation, so make sure the
        # base ends with exactly the separator the original code relied on.
        base_folder = self.parent_folder
        if not base_folder.endswith('/'):
            base_folder += '/'

        # Only the immediate sub-directories of the parent folder are datasets.
        for dataset in next(os.walk(self.parent_folder))[1]:
            if dataset == "DMS_fasta_param":
                # Not a dataset directory; skipped by name.
                continue
            dataset_loc = base_folder + dataset + '/'
            msfg_obj = MSGFplusMerger(dataset_loc)
            msfg_obj.consolidate_syn_files()
            masic = MASICmerger(dataset_loc)
            masic.merge_msgfplus_msaic(msfg_obj.MSGFjobs_Merged)
            if self.combineDatasets:
                self.resultants.append(masic.MSGFjobs_MASIC_resultant)

        print("`"*5)
        print("Finished aggregating analysis tools results at loc:{}".format(self.dataset_result_folder))
        print("`"*5)

        if self.combineDatasets:
            if self.resultants:
                # Concatenate all datasets into a single table.
                self.resultants_df = pd.concat(self.resultants)
                self.write_to_disk(self.resultants_df, self.dataset_result_folder, "resultants_df.tsv")
            else:
                # pd.concat raises ValueError on an empty list; there is
                # simply nothing to combine in that case.
                print("No dataset results to combine under:{}".format(self.parent_folder))

        return self.dataset_result_folder
# def manual_merge_datasets(self):
#
# group_files=[]
# for cur_path, directories, files in os.walk(str(Path(__file__).parents[2])+'/'+self.parent_folder):
# # print(cur_path)
# for file in files:
# if fnmatch.fnmatch(file, "MSGFjobs_MASIC_resultant.xlsx"):
# group_files.append(os.path.join(cur_path, file))
# print(group_files)
# df = pd.DataFrame()
# for f in group_files:
# data = pd.read_excel(f, 'Sheet1')
# df = df.append(data)
# self.write_to_disk(df,str(Path(__file__).parents[2])+'/'+self.parent_folder, "resultants_df_1.csv" )