#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Title : maotao_predict.py.py
project : minimind_RiboUTR
Created by: julse
Created on: 2025/10/23 16:02
des: TODO
"""
import os
import time
import pandas as pd
from inference import inference
from model.assemble_fragment import assemble_fragments
from model.codon_attr import Codon
from model.sliding_windows import process_nucleotide_sequences
from model.tools import get_pretraining_args
def check_path(dirout, file=False):
    """Make sure a directory exists, creating missing parents as needed.

    Args:
        dirout: Directory to create, or a file path when ``file`` is true.
        file: When true, ``dirout`` is treated as a file path and only its
            parent directory is created.

    Nothing happens when the target already exists or when there is no
    directory component to create (for example a bare file name with
    ``file=True``).

    Raises:
        OSError: If the directory cannot be created (e.g. permissions).
    """
    if file:
        # dirname() yields '' for a bare file name; rsplit('/', 1)[0] used to
        # hand back the file name itself, which then got created as a directory.
        dirout = os.path.dirname(dirout)
    if not dirout or os.path.exists(dirout):
        return
    print('make dir -p ' + dirout)
    # exist_ok covers the race where another process creates the directory
    # between the check above and this call. Genuine failures now propagate
    # instead of being reported as that race by a bare ``except``.
    os.makedirs(dirout, exist_ok=True)
def translate(nucleotide_seq):
    """Translate a nucleotide string into an amino-acid string.

    ``T`` is replaced by ``U`` first, then the sequence is read in
    consecutive, non-overlapping 3-character steps starting at index 0 and
    each chunk is looked up in ``Codon.CODON_TO_AA``. Any chunk that is not a
    key of that table yields ``'_'``.

    Args:
        nucleotide_seq: Sequence to translate.

    Returns:
        One character per 3-character chunk of the input.
    """
    rna = nucleotide_seq.replace('T', 'U')
    table = Codon.CODON_TO_AA
    residues = []
    for start in range(0, len(rna), 3):
        residues.append(table.get(rna[start:start + 3], '_'))
    return ''.join(residues)
def process_inputs(fin=None, dirout=None, codon_table=None):
    """Build the windowed model-input table ``<dirout>/TS.csv``.

    For every row of the input workbook and every target species, the
    protein in ``RefSeq_aa`` is turned into a nucleotide sequence with that
    species' ``Codon.cai_opt_codon``, split into fragments by
    ``process_nucleotide_sequences`` and written out as one CSV row per
    fragment. Rows are ordered species-major (all workbook rows for the first
    species, then the next species, ...).

    Args:
        fin: Excel workbook with at least the columns ``id`` and
            ``RefSeq_aa``.
        dirout: Directory that receives ``TS.csv``; created if missing.
        codon_table: Path template containing ``{species}``; it is formatted
            once per species and handed to ``Codon``.

    The written table holds whatever fields ``process_nucleotide_sequences``
    emits (its ``truncated_nn`` field is renamed to ``cai_best_nn``) plus
    ``truncated_aa``, the translation of each fragment.
    """
    check_path(dirout)
    # Only read from this frame. The previous version added columns to this
    # column-subset slice (pandas chained-assignment warning) and assembled the
    # per-species rows by appending to TS.csv and re-reading it.
    proteins = pd.read_excel(fin)[['id', 'RefSeq_aa']]
    species_list = ['mouse', 'Ec', 'Sac', 'Pic', 'Human']
    print(f'loading {len(proteins)} AA from {fin}\nprepare inputs for generating to CDS for expression in {species_list}')
    # Built up front so a missing or bad codon table fails before any work is done.
    codon_instance = {species: Codon(codon_table.format(species=species), rna=False) for species in species_list}

    fragments = []
    for species in species_list:
        to_cds = codon_instance[species].cai_opt_codon
        for seq_id, protein in zip(proteins['id'], proteins['RefSeq_aa']):
            fragments.extend(
                process_nucleotide_sequences(
                    to_cds(protein), max_nn_length=1200, step=300, pad_char='_',
                    meta_dict={'_id': seq_id, 'species': species}))

    expanded_data = pd.DataFrame(fragments)
    expanded_data['truncated_aa'] = expanded_data['truncated_nn'].apply(translate)
    expanded_data = expanded_data.rename(columns={'truncated_nn': 'cai_best_nn'})
    expanded_data.to_csv(dirout + '/TS.csv', mode='w', index=False, header=True)
    print(f'process {len(expanded_data)} data and saving to {dirout}/TS.csv')
def process_result(fin=None, fpred=None, ftruncated=None, fout=None):
    """Assemble predicted fragments per id/species and export them to Excel.

    Args:
        fin: Input Excel workbook; its ``id`` and ``RefSeq_aa`` columns are
            carried into the output.
        fpred: CSV of predictions; merged with ``ftruncated`` on their shared
            columns.
        ftruncated: CSV of the fragment table.
        fout: Destination ``.xlsx`` path (written with ``openpyxl``).

    The merged rows are grouped by ``(_id, species)`` and each group is
    passed to ``assemble_fragments``; the results are pivoted to one row per
    id with one column per species (full species names). The output columns
    are ``id``, ``RefSeq_aa`` and the five species columns.

    As a sanity check the human sequence is translated back to amino acids
    and compared with ``RefSeq_aa``. Differences are printed as a warning and
    do not stop the export. The earlier version computed that translation
    with strict codon lookup and then discarded it, so one unknown codon (or
    an id without a human sequence) aborted the run for no benefit.
    """
    # Short species code -> display name, in output column order.
    species_names = {
        'Human': 'Homo sapiens (Human)',
        'mouse': 'Mus musculus (Mouse)',
        'Ec': 'Escherichia coli',
        'Sac': 'Saccharomyces cerevisiae',
        'Pic': 'Pichia angusta',
    }
    full_name = list(species_names.values())
    codon_to_aa = Codon.CODON_TO_AA

    def back_translate(cds):
        # Non-string cells (NaN from the pivot) have nothing to translate.
        if not isinstance(cds, str):
            return None
        rna = cds.replace('T', 'U')
        return ''.join(codon_to_aa.get(rna[i:i + 3], '_') for i in range(0, len(rna), 3))

    df = pd.read_excel(fin)
    df_pred = pd.read_csv(fpred)
    df_trun = pd.read_csv(ftruncated)
    df_info = df_pred.merge(df_trun)

    assembled = [
        (_id, species, assemble_fragments(group))
        for (_id, species), group in df_info.groupby(by=['_id', 'species'])
    ]
    df_tmp = pd.DataFrame(assembled, columns=['_id', 'species', 'seq'])
    df_tmp['species'] = df_tmp['species'].replace(species_names)

    df_wide = df_tmp.pivot(index=['_id'], columns='species', values='seq')
    df_wide = df_wide.reset_index()  # turn the index back into a column
    df_wide['RefSeq_aa_translate'] = df_wide[species_names['Human']].apply(back_translate)
    df_wide = df_wide.rename(columns={'_id': 'id'})

    merged = df[['id', 'RefSeq_aa']].merge(df_wide, on=['id'])
    mismatched = merged.loc[merged['RefSeq_aa'] != merged['RefSeq_aa_translate'], 'id']
    if len(mismatched):
        print(f'warning: human CDS does not translate back to RefSeq_aa for '
              f'{len(mismatched)} id(s): {mismatched.tolist()}')

    merged[['id', 'RefSeq_aa'] + full_name].to_excel(fout, index=False, engine='openpyxl')
def predict(fin, dirout):
    """Run input preparation, model inference and result assembly for ``fin``.

    Args:
        fin: Input Excel workbook, passed to ``process_inputs`` and
            ``process_result``.
        dirout: Output directory. Intermediate files go under
            ``<dirout>/tmp/`` and the final workbook is ``<dirout>/Tests.xlsx``.

    The argument namespace comes from ``get_pretraining_args().parse_args()``
    (so command-line options are still parsed); the data path, output path,
    task, predict flag and checkpoint path are then overridden here.
    """
    args = get_pretraining_args().parse_args()

    # Scratch area for everything the model reads and writes.
    scratch = dirout + '/tmp/'
    check_path(scratch)

    # Fixed settings for this prediction run.
    args.downstream_data_path = scratch
    args.predict = True
    args.out_dir = scratch
    args.task = 'AA2CDS_data/'
    args.mlm_pretrained_model_path = 'checkpoint/AA2CDS.pth'

    task_dir = os.path.join(args.downstream_data_path, args.task)
    check_path(task_dir)

    pred_csv = f'{task_dir}/TS_pred.csv'
    truncated_csv = f'{task_dir}/TS.csv'
    result_xlsx = f'{dirout}/Tests.xlsx'

    # 1) Build the fragment table next to where the predictions are expected.
    process_inputs(fin=fin, dirout=os.path.dirname(pred_csv), codon_table=args.codon_table_path)

    # 2) Run the model.
    # NOTE(review): presumably this writes TS_pred.csv into task_dir - confirm.
    inference(args)

    # 3) Assemble fragments and export the final workbook.
    process_result(
        fin=fin,
        fpred=pred_csv,
        ftruncated=truncated_csv,
        fout=result_xlsx,
    )
if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    import shutil

    print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()

    # Round 1 (kept for reference):
    #   fin = 'example/Tests.xlsx'
    #   dirout = os.path.abspath('example/out')
    #   # --limit=320 --batch_size=12 --epoch=2 --out_dir=maotao_exp/test --learning_rate=0.000001 --predict --debug
    #   predict(fin, dirout)
    #   then copy {dirout}/Tests.xlsx to ./Tests.xlsx

    # Round 3, for experiment.
    fin = 'example/Tests_round3.xlsx'
    dirout = os.path.abspath('example/out_round3')

    # Start from a clean output directory. shutil.rmtree replaces the former
    # `rm -rf` shell call, which interpolated the path unquoted into a shell
    # command (breaks on spaces/metacharacters) and only worked on POSIX.
    # ignore_errors=True keeps the "fine if it does not exist" behaviour.
    shutil.rmtree(dirout, ignore_errors=True)

    predict(fin, dirout)

    print('stop', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    print('time', time.time() - start)
|