julse committed on
Commit
77d01f2
·
verified ·
1 Parent(s): c4ae01e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -126
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import html
2
  import re
 
3
  import uuid
4
  from typing import Tuple, List
5
  from datetime import datetime
@@ -170,13 +171,12 @@ def plot_optimization_metrics(display_df, method="AA2CDS", figsize=(18, 12)):
170
  """
171
 
172
  # 定义要绘制的5个指标及其显示名称
173
- metrics = ['GC', 'GC_head', 'CAI', 'CAI_head', 'ENC']
174
  metric_titles = {
175
  'GC': 'GC Content',
176
  'GC_head': "5' GC Content",
177
  'CAI': 'Codon Adaptation Index (CAI)',
178
  'CAI_head': "5' CAI",
179
- 'ENC': 'Effective Number of Codons (ENC)'
180
  }
181
 
182
  # 验证数据列是否存在
@@ -185,7 +185,7 @@ def plot_optimization_metrics(display_df, method="AA2CDS", figsize=(18, 12)):
185
  raise ValueError(f"DataFrame missing required columns: {missing_cols}")
186
 
187
  # 创建2x3的子图布局(最后一个位置留空)
188
- fig, axes = plt.subplots(2, 3, figsize=figsize)
189
  axes = axes.flatten() # 展平为1D数组
190
 
191
  # 设置全局样式
@@ -392,7 +392,7 @@ def optimize_cds(protein_seq, species, codon_usage_table, method, status_msg,opt
392
  status_msg = log(f" • Protein length: {len(protein_seq)} aa")
393
 
394
  timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
395
- dirout = f'tmp/{timestamp}_{species}/'
396
  os.makedirs(dirout, exist_ok=True)
397
 
398
  task = 'predict_web'
@@ -402,119 +402,131 @@ def optimize_cds(protein_seq, species, codon_usage_table, method, status_msg,opt
402
  codon_usage_path = f'{dirout}/codon_usage.csv'
403
  codon_usage_table.to_csv(codon_usage_path, index=False)
404
 
405
- status_msg = log("🔹 Step 2/5: Initial CAI-optimal CDS generation")
406
-
407
- df = pd.DataFrame({'id': [_id], 'RefSeq_aa': [protein_seq]})
408
- df.to_csv(dirout + f'{task}/input.csv', index=False)
409
-
410
- reverse_mapping = {
411
- "Mus_musculus": "mouse",
412
- "Escherichia_coli": "Ec",
413
- "Saccharomyces_cerevisiae": "Sac",
414
- "Pichia": "Pic",
415
- "Homo_sapiens": "Human"
416
- }
417
- species = reverse_mapping[species]
418
- df['species'] = species
 
 
 
 
 
 
419
 
420
- codon_instance = {species: Codon(codon_usage_path, rna=False)}
421
- df['cai_best_nn'] = df.apply(
422
- lambda x: codon_instance[x['species']].cai_opt_codon(x['RefSeq_aa']), axis=1
423
- )
 
 
 
 
 
 
 
 
424
 
425
- status_msg = log("🔹 Step 3/5: Fragmentation & translation consistency check")
426
-
427
- fragments_list = df.apply(
428
- lambda x: process_nucleotide_sequences(
429
- x['cai_best_nn'],
430
- max_nn_length=1200,
431
- step=300,
432
- pad_char='_',
433
- meta_dict={'_id': x['id'], 'species': x['species']}
434
- ),
435
- axis=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  )
437
 
438
- expanded_data = pd.DataFrame([item for sublist in fragments_list for item in sublist])
439
- expanded_data['truncated_aa'] = expanded_data['truncated_nn'].apply(translate)
440
- expanded_data = expanded_data.rename(columns={'truncated_nn': 'cai_best_nn'})
441
- expanded_data.to_csv(dirout + f'{task}/TS.csv', index=False)
442
-
443
- status_msg = log("🔹 Step 4/5: Multi-seed neural optimization")
444
- # seeds = ['1337', '42', '2022', '2023', '2024', '2025']
445
- seeds = optimize_seed.split(',')
446
- status_msg = log(f" • Seeds: {', '.join(seeds)}")
447
-
448
- parser = get_pretraining_args()
449
- args = parser.parse_args()
450
- args.downstream_data_path = dirout
451
- args.task = task
452
- args.predict = True
453
- args.mlm_pretrained_model_path = 'checkpoint/AA2CDS.pth'
454
-
455
- tmps = []
456
- df_trun = pd.read_csv(dirout + f'{task}/TS.csv')
457
-
458
- for seed in seeds:
459
- status_msg = log(f" ⏳ Running inference (seed={seed})")
460
- args.seed = seed
461
- args.out_dir = f'{dirout}/{seed}'
462
- os.makedirs(args.out_dir, exist_ok=True)
463
-
464
- inference(args)
465
-
466
- fpred = f'{args.out_dir}/{task}/TS_pred.csv'
467
- df_pred = pd.read_csv(fpred)
468
- df_info = df_pred.merge(df_trun)
469
- print(len(df_info),df_info.columns)
470
- seq = assemble_fragments(df_info)
471
- analyzer = CodonUsageAnalyzer(codon_usage_path)
472
-
473
- result = single_seq_analysis(seq, _id, codon_usage_path)
474
- result.update({
475
- 'GC': round((seq.count("G") + seq.count("C")) / len(seq), 4),
476
- 'GC_head': round((seq[:60].count("G") + seq[:60].count("C")) / len(seq[:60]), 4),
477
- 'CAI': round(analyzer.calculate_CAI(seq), 4),
478
- 'CAI_head': round(analyzer.calculate_CAI(seq[:60]), 4),
479
- '_id': f'seed_{seed}',
480
- 'CDS_Full': seq,
481
- 'CDS': seq[:30] + "..." if len(seq) > 30 else seq,
482
- 'species': species,
483
- })
484
-
485
- tmps.append(pd.DataFrame({k: [v] for k, v in result.items()}))
486
-
487
- tmp_df = pd.concat(tmps, ignore_index=True)
488
- tmp_df = tmp_df.sort_values(by='CAI', ascending=False)
489
- tmp_df.to_csv(f'{dirout}/results.csv', index=False)
490
-
491
- status_msg = log("🔹 Step 5/5: Ranking & visualization")
492
-
493
- display_df = tmp_df[['_id', 'GC', 'GC_head', 'CAI', 'CAI_head', 'ENC','species','CDS']]
494
-
495
- # 使用示例
496
- fig_df = tmp_df[['GC', 'GC_head', 'CAI', 'CAI_head', 'ENC']] # 只选需要的列
497
- fig, axes = plot_optimization_metrics(fig_df, method=method)
498
- plt.savefig(f'{dirout}/optimization_metrics.png', dpi=300, bbox_inches='tight')
499
- plt.show()
500
-
501
- # fig, ax = plt.subplots(figsize=(10, 6))
502
- # scores = display_df["GC"].astype(float).tolist()
503
- # bars = ax.bar(range(1, len(scores) + 1), scores, alpha=0.7)
504
- # ax.set_xlabel("Sequence Rank")
505
- # ax.set_ylabel("GC Content")
506
- # ax.set_title(f"CDS Optimization Results ({method})")
507
- # ax.grid(True, alpha=0.3)
508
-
509
-
510
-
511
- # for i in range(min(5, len(bars))):
512
- # bars[i].set_color('orange')
513
-
514
- status_msg = log(f"✅ Successfully generated {len(display_df)} optimized CDS sequences")
515
- status_msg = log("🎉 Optimization complete")
516
-
517
- return display_df, fig,status_msg
518
 
519
  def download_cds_results(results_df):
520
  if results_df is None or len(results_df) == 0:
@@ -1115,41 +1127,43 @@ class MaoTaoWeb:
1115
  with gr.Row():
1116
  results_table = gr.Dataframe(
1117
  label="Optimization Results",
1118
- headers=["Rank", "Sequence", "GC%", "tRNA", "Usage", "MFE", "Score"],
1119
- datatype=["number", "str", "str", "str", "str", "str", "str"],
1120
- col_count=(7, "fixed"),
1121
  wrap=True
1122
  )
1123
 
1124
  optimization_plot = gr.Plot(label="Score Distribution")
1125
 
1126
  with gr.Row():
1127
- download_cds_btn = gr.Button("📥 Download CDS Results", variant="secondary")
1128
- cds_download_file = gr.File(label="Download File", visible=False)
 
 
 
 
 
1129
 
1130
  def optimize_and_update(protein_seq, species, codon_usage_table,method,optimize_seed):
1131
  status_msg = f"🔄 Optimizing CDS sequence using {method} method..."
1132
  # 执行优化
1133
- df, plot,status_msg = optimize_cds(protein_seq, species,codon_usage_table, method,status_msg,optimize_seed)
1134
  # 最终状态
1135
 
1136
  # final_status = f"✅ Optimization complete! Generated {len(df)} sequences with {variants:,} potential variants"
1137
  # self.status_display.update(final_status)
1138
- return df, plot,status_msg
1139
 
1140
  optimize_btn.click(
1141
  optimize_and_update, # protein_seq, species, codon_usage_table,method
1142
  inputs=[protein_seq, species,codon_usage_table,method,optimize_seed],
1143
- outputs=[results_table, optimization_plot, optimize_log]
1144
  )
1145
 
1146
  cds_example_btn.click(lambda: EXAMPLE_PROTEIN, outputs=protein_seq)
1147
 
1148
- download_cds_btn.click(
1149
- download_cds_results,
1150
- inputs=results_table,
1151
- outputs=cds_download_file
1152
- )
1153
 
1154
  def resources_tab(self):
1155
  with gr.Tab("📚 Resources"):
 
1
  import html
2
  import re
3
+ import shutil
4
  import uuid
5
  from typing import Tuple, List
6
  from datetime import datetime
 
171
  """
172
 
173
  # 定义要绘制的5个指标及其显示名称
174
+ metrics = ['GC', 'GC_head', 'CAI', 'CAI_head']
175
  metric_titles = {
176
  'GC': 'GC Content',
177
  'GC_head': "5' GC Content",
178
  'CAI': 'Codon Adaptation Index (CAI)',
179
  'CAI_head': "5' CAI",
 
180
  }
181
 
182
  # 验证数据列是否存在
 
185
  raise ValueError(f"DataFrame missing required columns: {missing_cols}")
186
 
187
  # 创建2x3的子图布局(最后一个位置留空)
188
+ fig, axes = plt.subplots(2, 2, figsize=figsize)
189
  axes = axes.flatten() # 展平为1D数组
190
 
191
  # 设置全局样式
 
392
  status_msg = log(f" • Protein length: {len(protein_seq)} aa")
393
 
394
  timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
395
+ dirout = f'tmp1/{timestamp}_{species}/'
396
  os.makedirs(dirout, exist_ok=True)
397
 
398
  task = 'predict_web'
 
402
  codon_usage_path = f'{dirout}/codon_usage.csv'
403
  codon_usage_table.to_csv(codon_usage_path, index=False)
404
 
405
+ try:
406
+ status_msg = log("🔹 Step 2/5: Initial CAI-optimal CDS generation")
407
+
408
+ df = pd.DataFrame({'id': [_id], 'RefSeq_aa': [protein_seq]})
409
+ df.to_csv(dirout + f'{task}/input.csv', index=False)
410
+
411
+ reverse_mapping = {
412
+ "Mus_musculus": "mouse",
413
+ "Escherichia_coli": "Ec",
414
+ "Saccharomyces_cerevisiae": "Sac",
415
+ "Pichia": "Pic",
416
+ "Homo_sapiens": "Human"
417
+ }
418
+ species = reverse_mapping[species]
419
+ df['species'] = species
420
+
421
+ codon_instance = {species: Codon(codon_usage_path, rna=False)}
422
+ df['cai_best_nn'] = df.apply(
423
+ lambda x: codon_instance[x['species']].cai_opt_codon(x['RefSeq_aa']), axis=1
424
+ )
425
 
426
+ status_msg = log("🔹 Step 3/5: Fragmentation & translation consistency check")
427
+
428
+ fragments_list = df.apply(
429
+ lambda x: process_nucleotide_sequences(
430
+ x['cai_best_nn'],
431
+ max_nn_length=1200,
432
+ step=300,
433
+ pad_char='_',
434
+ meta_dict={'_id': x['id'], 'species': x['species']}
435
+ ),
436
+ axis=1
437
+ )
438
 
439
+ expanded_data = pd.DataFrame([item for sublist in fragments_list for item in sublist])
440
+ expanded_data['truncated_aa'] = expanded_data['truncated_nn'].apply(translate)
441
+ expanded_data = expanded_data.rename(columns={'truncated_nn': 'cai_best_nn'})
442
+ expanded_data.to_csv(dirout + f'{task}/TS.csv', index=False)
443
+
444
+ status_msg = log("🔹 Step 4/5: Multi-seed neural optimization")
445
+ # seeds = ['1337', '42', '2022', '2023', '2024', '2025']
446
+ seeds = optimize_seed.split(',')
447
+ status_msg = log(f" • Seeds: {', '.join(seeds)}")
448
+
449
+ parser = get_pretraining_args()
450
+ args = parser.parse_args()
451
+ args.downstream_data_path = dirout
452
+ args.task = task
453
+ args.predict = True
454
+ args.mlm_pretrained_model_path = 'checkpoint/AA2CDS.pth'
455
+
456
+ tmps = []
457
+ df_trun = pd.read_csv(dirout + f'{task}/TS.csv')
458
+
459
+ for seed in seeds:
460
+ status_msg = log(f" ⏳ Running inference (seed={seed})")
461
+ args.seed = seed
462
+ args.out_dir = f'{dirout}/{seed}'
463
+ os.makedirs(args.out_dir, exist_ok=True)
464
+
465
+ inference(args)
466
+
467
+ fpred = f'{args.out_dir}/{task}/TS_pred.csv'
468
+ os.system(f'cat {fpred}')
469
+ df_pred = pd.read_csv(fpred)
470
+ df_info = df_pred.merge(df_trun)
471
+ print(len(df_info), df_info.columns)
472
+ seq = assemble_fragments(df_info)
473
+ analyzer = CodonUsageAnalyzer(codon_usage_path)
474
+
475
+ result = single_seq_analysis(seq, _id, codon_usage_path)
476
+ result.update({
477
+ 'GC': round((seq.count("G") + seq.count("C")) / len(seq), 4),
478
+ 'GC_head': round((seq[:60].count("G") + seq[:60].count("C")) / len(seq[:60]), 4),
479
+ 'CAI': round(analyzer.calculate_CAI(seq), 4),
480
+ 'CAI_head': round(analyzer.calculate_CAI(seq[:60]), 4),
481
+ '_id': f'seed_{seed}',
482
+ 'CDS_Full': seq,
483
+ 'CDS': seq[:30] + "..." if len(seq) > 30 else seq,
484
+ 'species': species,
485
+ })
486
+
487
+ tmps.append(pd.DataFrame({k: [v] for k, v in result.items()}))
488
+
489
+ tmp_df = pd.concat(tmps, ignore_index=True)
490
+ tmp_df = tmp_df.sort_values(by='CAI', ascending=False)
491
+ tmp_df.to_csv(f'{dirout}/results.csv', index=False)
492
+
493
+ status_msg = log("🔹 Step 5/5: Ranking & visualization")
494
+
495
+ display_df = tmp_df[['_id', 'GC', 'GC_head', 'CAI', 'CAI_head', 'ENC', 'species', 'CDS']]
496
+
497
+ # 使用示例
498
+ fig_df = tmp_df[['GC', 'GC_head', 'CAI', 'CAI_head']] # 只选需要的列
499
+ fig, axes = plot_optimization_metrics(fig_df, method=method)
500
+ plt.savefig(f'{dirout}/optimization_metrics.png', dpi=300, bbox_inches='tight')
501
+ plt.show()
502
+
503
+ # fig, ax = plt.subplots(figsize=(10, 6))
504
+ # scores = display_df["GC"].astype(float).tolist()
505
+ # bars = ax.bar(range(1, len(scores) + 1), scores, alpha=0.7)
506
+ # ax.set_xlabel("Sequence Rank")
507
+ # ax.set_ylabel("GC Content")
508
+ # ax.set_title(f"CDS Optimization Results ({method})")
509
+ # ax.grid(True, alpha=0.3)
510
+
511
+ # for i in range(min(5, len(bars))):
512
+ # bars[i].set_color('orange')
513
+
514
+ status_msg = log(f"✅ Successfully generated {len(display_df)} optimized CDS sequences")
515
+ status_msg = log("🎉 Optimization complete")
516
+ except Exception as e:
517
+ status_msg = log(f"❌ Error: {e}")
518
+ None, None,None, status_msg
519
+
520
+ src_dir = "/app/tmp/20251220153157_Mus_musculus/42/predict_web"
521
+ zip_base = "/app/tmp/predict_web_results" # 不要加 .zip
522
+
523
+ zip_path = shutil.make_archive(
524
+ base_name=zip_base,
525
+ format="zip",
526
+ root_dir=src_dir
527
  )
528
 
529
+ return display_df, fig,zip_path,status_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
 
531
  def download_cds_results(results_df):
532
  if results_df is None or len(results_df) == 0:
 
1127
  with gr.Row():
1128
  results_table = gr.Dataframe(
1129
  label="Optimization Results",
1130
+ headers=['_id', 'GC', 'GC_head', 'CAI', 'CAI_head', 'ENC','species','CDS'],
1131
+ datatype=["str","number", "number", "number", "number", "number", "str", "str"],
1132
+ col_count=(8, "fixed"),
1133
  wrap=True
1134
  )
1135
 
1136
  optimization_plot = gr.Plot(label="Score Distribution")
1137
 
1138
  with gr.Row():
1139
+ # download_cds_btn = gr.Button("📥 Download CDS Results", variant="secondary")
1140
+ # cds_download_file = gr.File(label="Download File", visible=False)
1141
+
1142
+ download_btn = gr.DownloadButton(
1143
+ label="⬇ Download all results (ZIP)",
1144
+ value='predict_web_results.zip',
1145
+ )
1146
 
1147
  def optimize_and_update(protein_seq, species, codon_usage_table,method,optimize_seed):
1148
  status_msg = f"🔄 Optimizing CDS sequence using {method} method..."
1149
  # 执行优化
1150
+ df, plot,zip_path,status_msg = optimize_cds(protein_seq, species,codon_usage_table, method,status_msg,optimize_seed)
1151
  # 最终状态
1152
 
1153
  # final_status = f"✅ Optimization complete! Generated {len(df)} sequences with {variants:,} potential variants"
1154
  # self.status_display.update(final_status)
1155
+ return df, plot,zip_path,status_msg
1156
 
1157
  optimize_btn.click(
1158
  optimize_and_update, # protein_seq, species, codon_usage_table,method
1159
  inputs=[protein_seq, species,codon_usage_table,method,optimize_seed],
1160
+ outputs=[results_table, optimization_plot,download_btn, optimize_log]
1161
  )
1162
 
1163
  cds_example_btn.click(lambda: EXAMPLE_PROTEIN, outputs=protein_seq)
1164
 
1165
+
1166
+
 
 
 
1167
 
1168
  def resources_tab(self):
1169
  with gr.Tab("📚 Resources"):