import os import yaml import pandas as pd def extract_rmsf_labels(file_path): with open(file_path, 'r') as f: lines = f.readlines() protein_id = file_path.split('/')[-2].split('_')[:-1] protein_id = '.'.join(protein_id) rmsf_values = [] for line in lines[1:]: parts = line.strip().split('\t') rmsf_r1 = float(parts[1]) rmsf_r2 = float(parts[2]) rmsf_r3 = float(parts[3]) avg_rmsf = (rmsf_r1 + rmsf_r2 + rmsf_r3) / 3 rmsf_values.append(avg_rmsf) return protein_id, rmsf_values def extract_bfactor_labels(file_path): bfactor = pd.read_csv(file_path, delimiter='\t')['Bfactor'] protein_id = file_path.split('/')[-2].split('_')[:-1] protein_id = '.'.join(protein_id) return protein_id, bfactor def extract_plddt_labels(file_path): plddt = pd.read_csv(file_path, delimiter='\t')['pLDDT'] protein_id = file_path.split('/')[-2].split('_')[:-1] protein_id = '.'.join(protein_id) return protein_id, plddt if __name__ == "__main__": config = yaml.load(open('configs/data_config.yaml'), Loader=yaml.FullLoader) in_path = config['atlas_out_dir'] out_path = config['atlas_labels_path'] rmsf_data = {} for folder in os.listdir(in_path): folder_path = os.path.join(in_path, folder) if os.path.isdir(folder_path): for file in os.listdir(folder_path): if file.endswith("_RMSF.tsv"): file_path = os.path.join(folder_path, file) protein_id, rmsf_labels = extract_rmsf_labels(file_path) rmsf_data[protein_id] = rmsf_labels with open(out_path, 'w') as out_file: for protein_id, values in rmsf_data.items(): out_file.write(f"{protein_id}: {', '.join(map(str, values))}\n")