|
|
|
|
|
|
|
|
""" |
|
|
Title : maotao_predict.py.py |
|
|
project : minimind_RiboUTR |
|
|
Created by: julse |
|
|
Created on: 2025/10/23 16:02 |
|
|
des: TODO |
|
|
""" |
|
|
import os |
|
|
|
|
|
import time |
|
|
import pandas as pd |
|
|
|
|
|
from inference import inference |
|
|
from model.assemble_fragment import assemble_fragments |
|
|
from model.codon_attr import Codon |
|
|
from model.sliding_windows import process_nucleotide_sequences |
|
|
from model.tools import get_pretraining_args |
|
|
|
|
|
def check_path(dirout, file=False):
    """Make sure a directory exists, creating it (and its parents) if needed.

    Args:
        dirout: Directory path, or a file path when ``file`` is true.
        file: When true, ``dirout`` is treated as a file path and its parent
            directory is the one that gets created.

    Returns:
        None. Nothing is done when the target is empty (for example a bare
        file name with ``file=True``) or when the path already exists.

    Raises:
        OSError: If the directory cannot be created for a reason other than
            it already existing (e.g. insufficient permissions).
    """
    if file:
        # dirname() yields '' for a path without a separator, whereas
        # rsplit('/', 1)[0] would hand back the file name itself and a
        # directory named after the file would be created.
        dirout = os.path.dirname(dirout)
    if not dirout or os.path.exists(dirout):
        return
    print('make dir -p ' + dirout)
    try:
        os.makedirs(dirout)
    except FileExistsError:
        # Another process created the path between the check and makedirs().
        print(f'{dirout} have been made by other process')
|
|
def translate(nucleotide_seq):
    """Translate a nucleotide string into a one-letter-per-codon string.

    Every ``T`` is first rewritten to ``U``; the sequence is then read in
    consecutive, non-overlapping triplets starting at index 0. Each triplet
    is looked up in ``Codon.CODON_TO_AA``; a triplet that is not a key of
    that table (including a short trailing chunk) is emitted as ``'_'``.

    Args:
        nucleotide_seq: Nucleotide sequence as a string.

    Returns:
        The concatenated lookup results, one entry per triplet.
    """
    rna = nucleotide_seq.replace('T', 'U')
    lookup = Codon.CODON_TO_AA.get
    residues = []
    for start in range(0, len(rna), 3):
        residues.append(lookup(rna[start:start + 3], '_'))
    return ''.join(residues)
|
|
def process_inputs(fin=None, dirout=None, codon_table=None):
    """Build the fragment-level input table ``<dirout>/TS.csv``.

    Steps:
      1. Read the ``id`` and ``RefSeq_aa`` columns from the Excel file ``fin``.
      2. For each target species, derive a nucleotide sequence per protein
         with ``Codon.cai_opt_codon`` (presumably the CAI-optimal coding
         sequence -- confirm against ``model.codon_attr``).
      3. Cut every sequence into fragments with
         ``process_nucleotide_sequences`` (``max_nn_length=1200``,
         ``step=300``, ``pad_char='_'``), tagging each fragment with the
         protein ``_id`` and ``species``.
      4. Add the translation of every fragment (``truncated_aa``), rename
         ``truncated_nn`` to ``cai_best_nn`` and write the table to CSV.

    Args:
        fin: Path of the Excel workbook; it must provide the columns ``id``
            and ``RefSeq_aa``.
        dirout: Output directory; created if it does not exist.
        codon_table: Template string containing a ``{species}`` placeholder;
            after formatting it is passed to ``Codon(..., rna=False)``.

    Returns:
        None. The result is written to ``<dirout>/TS.csv``.
    """
    check_path(dirout)
    out_csv = dirout + '/TS.csv'

    # Explicit copy: this frame is only a column subset of the sheet, and
    # deriving new columns from a subset view triggers pandas'
    # SettingWithCopyWarning.
    proteins = pd.read_excel(fin)[['id', 'RefSeq_aa']].copy()
    species_list = ['mouse', 'Ec', 'Sac', 'Pic', 'Human']
    print(f'loading {len(proteins)} AA from {fin}\nprepare inputs for generating to CDS for expression in {species_list}')

    # Load every codon table up front so a missing table fails before any
    # sequence is processed.
    codon_instance = {species: Codon(codon_table.format(species=species), rna=False) for species in species_list}

    # One copy of the protein table per species, stacked in memory (no
    # intermediate CSV append / re-read round trip).
    per_species = [
        proteins.assign(
            species=species,
            cai_best_nn=proteins['RefSeq_aa'].apply(codon.cai_opt_codon),
        )
        for species, codon in codon_instance.items()
    ]
    data = pd.concat(per_species, ignore_index=True)

    # Each call returns the list of fragment records for one sequence;
    # flatten them into a single list of rows.
    fragments = []
    for seq, seq_id, species in zip(data['cai_best_nn'], data['id'], data['species']):
        fragments.extend(
            process_nucleotide_sequences(seq, max_nn_length=1200, step=300, pad_char='_',
                                         meta_dict={'_id': seq_id, 'species': species}))

    expanded_data = pd.DataFrame(fragments)
    expanded_data['truncated_aa'] = expanded_data['truncated_nn'].apply(translate)
    expanded_data = expanded_data.rename(columns={'truncated_nn': 'cai_best_nn'})
    expanded_data.to_csv(out_csv, mode='w', index=False, header=True)
    print(f'process {len(expanded_data)} data and saving to {dirout}/TS.csv')
|
|
def process_result(fin=None, fpred=None, ftruncated=None, fout=None):
    """Assemble fragment predictions into full sequences and export them.

    The prediction table ``fpred`` and the fragment table ``ftruncated`` are
    merged on the columns they share. Rows are grouped by
    ``(_id, species)`` and each group is passed to ``assemble_fragments`` to
    obtain one sequence per protein and species. The result is reshaped to
    one row per protein with one column per species (full species names),
    joined with ``id`` / ``RefSeq_aa`` from ``fin`` and written to ``fout``.

    Args:
        fin: Path of the original Excel workbook (columns ``id`` and
            ``RefSeq_aa`` are used).
        fpred: Path of the CSV with per-fragment predictions.
        ftruncated: Path of the CSV with the fragment metadata.
        fout: Path of the Excel file to write (``openpyxl`` engine).

    Returns:
        None. The wide table is written to ``fout``.

    Raises:
        KeyError: If one of the five expected species is absent from the
            predictions, because the output columns are selected by name.
    """
    source = pd.read_excel(fin)
    df_info = pd.read_csv(fpred).merge(pd.read_csv(ftruncated))

    assembled = [
        [_id, species, assemble_fragments(group)]
        for (_id, species), group in df_info.groupby(by=['_id', 'species'])
    ]
    df_tmp = pd.DataFrame(assembled, columns=['_id', 'species', 'seq'])

    # Single source of truth for both the short-code -> full-name mapping
    # and the column order of the exported sheet.
    species_names = {
        'Human': 'Homo sapiens (Human)',
        'mouse': 'Mus musculus (Mouse)',
        'Ec': 'Escherichia coli',
        'Sac': 'Saccharomyces cerevisiae',
        'Pic': 'Pichia angusta',
    }
    df_tmp['species'] = df_tmp['species'].replace(species_names)
    full_name = list(species_names.values())

    # Long -> wide: one row per protein, one sequence column per species.
    df_wide = df_tmp.pivot(index=['_id'], columns='species', values='seq').reset_index()
    df_wide = df_wide.rename(columns={'_id': 'id'})
    df_wide = source[['id', 'RefSeq_aa']].merge(df_wide, on=['id'])[['id', 'RefSeq_aa'] + full_name]

    df_wide.to_excel(fout, index=False, engine='openpyxl')
|
|
|
|
|
def predict(fin, dirout):
    """Run the full pipeline for ``fin`` and write the results under ``dirout``.

    The command-line arguments are parsed with the parser returned by
    ``get_pretraining_args`` and then overridden so that ``inference`` works
    inside ``<dirout>/tmp/``. The three stages are, in order:
    ``process_inputs`` (writes ``TS.csv``), ``inference(args)`` and
    ``process_result`` (reads ``TS_pred.csv`` and ``TS.csv`` and writes
    ``<dirout>/Tests.xlsx``).

    Args:
        fin: Path of the input Excel workbook.
        dirout: Directory that receives ``Tests.xlsx`` and the ``tmp/``
            working tree.
    """
    args = get_pretraining_args().parse_args()

    tmp_dir = dirout+'/tmp/'
    check_path(tmp_dir)

    # Point the parsed arguments at the working directory and the AA2CDS
    # checkpoint; the values (including trailing slashes) are passed on to
    # inference() unchanged.
    overrides = {
        'downstream_data_path': tmp_dir,
        'predict': True,
        'out_dir': tmp_dir,
        'task': 'AA2CDS_data/',
        'mlm_pretrained_model_path': 'checkpoint/AA2CDS.pth',
    }
    for option, value in overrides.items():
        setattr(args, option, value)

    WDIR = os.path.join(args.downstream_data_path, args.task)
    check_path(WDIR)

    fpred = f'{WDIR}/TS_pred.csv'
    ftruncated = f'{WDIR}/TS.csv'
    fout = f'{dirout}/Tests.xlsx'

    # Stage 1: build the fragment table next to where predictions will land.
    process_inputs(
        fin=fin,
        dirout=os.path.dirname(fpred),
        codon_table=args.codon_table_path,
    )

    # Stage 2: run the model.
    inference(args)

    # Stage 3: stitch the fragments back together and export.
    process_result(fin=fin, fpred=fpred, ftruncated=ftruncated, fout=fout)
|
|
|
|
|
if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    import shutil

    print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()

    # round 1: no active code.

    # round2 for experiment (the paths below refer to round3).
    fin = 'example/Tests_round3.xlsx'
    dirout = os.path.abspath('example/out_round3')

    # Start from a clean output directory. rmtree() is used instead of
    # shelling out to `rm -rf` so the path is never interpolated into a
    # shell command and the script is not tied to a POSIX shell; a missing
    # directory is ignored, as `rm -rf` would.
    shutil.rmtree(dirout, ignore_errors=True)
    predict(fin, dirout)

    print('stop', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    print('time', time.time() - start)
|
|
|