re-type committed on
Commit
740aa59
·
verified ·
1 Parent(s): 62b42ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +277 -39
app.py CHANGED
@@ -11,16 +11,18 @@ import numpy as np
11
  from predictor import GenePredictor
12
  from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
 
 
14
 
15
  # --- Global Variables ---
16
- MAFFT_PATH = "mafft/mafftdir/bin/mafft"
 
17
 
18
  # --- Logging ---
19
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
20
 
21
  # --- Paths ---
22
  from huggingface_hub import hf_hub_download
23
- import tempfile
24
 
25
  # Model repository and file paths
26
  model_repo = "GGproject10/best_boundary_aware_model"
@@ -95,6 +97,212 @@ except Exception as e:
95
  logging.error(f"Failed to initialize tree analyzer: {e}")
96
  analyzer = None
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  # --- Tree Analysis Function (Based on old Gradio API) ---
99
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
100
  """
@@ -205,23 +413,23 @@ def read_fasta_file(file_obj):
205
  return ""
206
 
207
  # --- Full Pipeline ---
208
- def run_pipeline_from_file(fasta_file_obj, similarity_score):
209
  try:
210
  dna_input = read_fasta_file(fasta_file_obj)
211
  if not dna_input:
212
- return "Failed to read FASTA file", "", "", "", None, None, None, "No input sequence"
213
- return run_pipeline(dna_input, similarity_score)
214
  except Exception as e:
215
  error_msg = f"Pipeline error: {str(e)}"
216
  logging.error(error_msg)
217
- return error_msg, "", "", "", None, None, None, error_msg
218
 
219
- def run_pipeline(dna_input, similarity_score=95.0):
220
  try:
221
  # Clean input
222
  dna_input = dna_input.upper().strip()
223
  if not dna_input:
224
- return "Empty input", "", "", "", None, None, None, "No input provided"
225
 
226
  # Sanitize DNA sequence
227
  if not re.match('^[ACTGN]+$', dna_input):
@@ -267,29 +475,47 @@ def run_pipeline(dna_input, similarity_score=95.0):
267
  else:
268
  keras_output = "Skipped: sequence too short for F gene validation"
269
 
270
- # Step 3: MAFFT and IQ-TREE (skip due to configuration issues)
271
  aligned_file = None
272
  phy_file = None
 
273
 
274
- # Skip MAFFT due to configuration issues in the container
275
- logging.info("Skipping MAFFT/IQ-TREE due to container configuration issues")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
- # Step 4: ML Simplified Tree (using the new approach)
278
  html_file = None
279
  tree_html_content = "No tree generated"
280
- ml_output = ""
281
 
282
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
283
  try:
284
- logging.info(f"Starting ML tree analysis with F gene sequence length: {len(processed_sequence)}")
285
 
286
- # Use the new tree analysis function with user-specified similarity
287
  tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
288
 
289
  if tree_result and not tree_result.startswith("Error:"):
290
  # Success - we have HTML content
291
  tree_html_content = tree_result
292
- ml_output = "✅ Phylogenetic tree generated successfully!"
293
 
294
  # Check if HTML file was created
295
  output_dir = "output"
@@ -297,35 +523,36 @@ def run_pipeline(dna_input, similarity_score=95.0):
297
  html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
298
  if html_files:
299
  html_file = os.path.join(output_dir, html_files[-1]) # Get the latest
300
- ml_output += f"\n- Tree file: {html_files[-1]}"
301
 
302
  # Count sequences analyzed
303
  if analyzer.find_query_sequence(processed_sequence):
304
  matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
305
- ml_output += f"\n- {len(matched_ids)} sequences analyzed"
306
- ml_output += f"\n- Similarity threshold: {perc:.1f}%"
307
  else:
308
  # Error occurred
309
- ml_output = f"❌ Tree analysis failed: {tree_result}"
310
- logging.error(f"Tree analysis failed: {tree_result}")
311
 
312
  except Exception as e:
313
- ml_output = f"❌ ML Tree analysis failed: {str(e)}"
314
- logging.error(f"ML Tree failed: {e}")
315
  import traceback
316
  logging.error(f"Full traceback: {traceback.format_exc()}")
317
  elif not analyzer:
318
- ml_output = "❌ Tree analyzer not initialized"
319
  elif not processed_sequence or len(processed_sequence) < 10:
320
- ml_output = f"❌ F gene sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
321
  else:
322
- ml_output = "❌ Skipped due to previous step errors"
323
 
324
  return (
325
  boundary_output,
326
  keras_output[:500] + "..." if len(keras_output) > 500 else keras_output,
327
  csv_path if os.path.exists(csv_path) else "CSV file not found",
328
- ml_output,
 
329
  html_file,
330
  aligned_file if aligned_file and os.path.exists(aligned_file) else None,
331
  phy_file if phy_file and os.path.exists(phy_file) else None,
@@ -337,16 +564,16 @@ def run_pipeline(dna_input, similarity_score=95.0):
337
  logging.error(error_msg)
338
  import traceback
339
  logging.error(f"Full traceback: {traceback.format_exc()}")
340
- return error_msg, "", "", "", None, None, None, error_msg
341
 
342
  # --- Gradio UI ---
343
  with gr.Blocks(title="Viral Gene Phylogenetic Pipeline", theme=gr.themes.Soft()) as demo:
344
  gr.Markdown("# 🧬 Viral Gene Phylogenetic Inference Pipeline")
345
- gr.Markdown("This pipeline processes DNA sequences through boundary detection, k-mer analysis, and phylogenetic tree construction.")
346
 
347
  with gr.Tab("📝 Paste DNA Sequence"):
348
  with gr.Row():
349
- with gr.Column(scale=3):
350
  inp = gr.Textbox(
351
  label="DNA Input",
352
  placeholder="Paste your DNA sequence here (ACTG format)",
@@ -361,11 +588,16 @@ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline", theme=gr.themes.Soft())
361
  label="Similarity Threshold (%)",
362
  info="Higher values = more similar sequences"
363
  )
 
 
 
 
 
364
  btn1 = gr.Button("🚀 Run Pipeline", variant="primary", size="lg")
365
 
366
  with gr.Tab("📁 Upload FASTA File"):
367
  with gr.Row():
368
- with gr.Column(scale=3):
369
  file_input = gr.File(
370
  label="FASTA File",
371
  file_types=['.fasta', '.fa', '.txt']
@@ -379,6 +611,11 @@ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline", theme=gr.themes.Soft())
379
  label="Similarity Threshold (%)",
380
  info="Higher values = more similar sequences"
381
  )
 
 
 
 
 
382
  btn2 = gr.Button("🚀 Run on FASTA", variant="primary", size="lg")
383
 
384
  # Outputs
@@ -388,14 +625,15 @@ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline", theme=gr.themes.Soft())
388
  with gr.Column():
389
  out1 = gr.Textbox(label="🎯 Step 1: Extracted F Gene Sequence", lines=8)
390
  out2 = gr.Textbox(label="🔍 Step 2: F Gene Validation (Keras)", lines=3)
391
- with gr.Column():
392
  out3 = gr.Textbox(label="📋 Dataset Used")
393
- out4 = gr.Textbox(label="🌳 Step 3: Phylogenetic Tree Status", lines=5)
 
 
394
 
395
  with gr.Row():
396
- html = gr.File(label="📥 Download Tree (HTML)")
397
  fasta = gr.File(label="📥 Download Aligned FASTA")
398
- phy = gr.File(label="📥 Download IQ-TREE .phy File")
399
 
400
  with gr.Row():
401
  tree_html = gr.HTML(label="🌳 Interactive Tree Preview")
@@ -403,13 +641,13 @@ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline", theme=gr.themes.Soft())
403
  # Event handlers
404
  btn1.click(
405
  fn=run_pipeline,
406
- inputs=[inp, similarity_input],
407
- outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html]
408
  )
409
  btn2.click(
410
  fn=run_pipeline_from_file,
411
- inputs=[file_input, similarity_input_file],
412
- outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html]
413
  )
414
 
415
  if __name__ == '__main__':
 
11
  from predictor import GenePredictor
12
  from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
14
+ import tempfile
15
+ import shutil
16
 
17
  # --- Global Variables ---
18
+ MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
19
+ IQTREE_PATH = "iqtree/bin/iqtree2" # Update this path as needed
20
 
21
  # --- Logging ---
22
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
 
24
  # --- Paths ---
25
  from huggingface_hub import hf_hub_download
 
26
 
27
  # Model repository and file paths
28
  model_repo = "GGproject10/best_boundary_aware_model"
 
97
  logging.error(f"Failed to initialize tree analyzer: {e}")
98
  analyzer = None
99
 
100
+ # --- Helper Functions ---
101
def check_tool_availability():
    """Report whether the MAFFT and IQ-TREE executables can be located.

    A tool counts as available when its configured path (``MAFFT_PATH`` /
    ``IQTREE_PATH``) exists on disk, or when an executable with the expected
    name is found on ``PATH``.

    Returns:
        tuple: ``(mafft_available, iqtree_available)`` as two booleans.
    """
    # MAFFT: bundled binary first, then whatever is on PATH.
    mafft_found = os.path.exists(MAFFT_PATH)
    if not mafft_found:
        mafft_found = shutil.which('mafft') is not None

    # IQ-TREE: bundled binary first, then either binary name on PATH.
    iqtree_found = os.path.exists(IQTREE_PATH) or any(
        shutil.which(binary) is not None for binary in ('iqtree2', 'iqtree')
    )

    return mafft_found, iqtree_found
107
+
108
def run_mafft_alignment(input_fasta, output_fasta):
    """Align the sequences in ``input_fasta`` with MAFFT.

    MAFFT is invoked with ``--auto`` and a five-minute timeout. On success,
    the alignment it printed to stdout is written to ``output_fasta``.

    Args:
        input_fasta: Path of the unaligned multi-FASTA file.
        output_fasta: Path the aligned FASTA is written to.

    Returns:
        tuple: ``(True, output_fasta)`` on success, otherwise
        ``(False, message)`` where ``message`` describes the failure.
    """
    try:
        # Use the configured binary when present, else fall back to PATH.
        if os.path.exists(MAFFT_PATH):
            executable = MAFFT_PATH
        else:
            executable = 'mafft'

        command = [executable, '--auto', input_fasta]
        logging.info(f"Running MAFFT: {' '.join(command)}")

        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=300,  # seconds (5 minutes)
        )

        if completed.returncode != 0:
            logging.error(f"MAFFT failed: {completed.stderr}")
            return False, f"MAFFT error: {completed.stderr}"

        # MAFFT emits the alignment on stdout; persist it ourselves.
        with open(output_fasta, 'w') as handle:
            handle.write(completed.stdout)
        logging.info(f"MAFFT alignment completed: {output_fasta}")
        return True, output_fasta

    except subprocess.TimeoutExpired:
        logging.error("MAFFT timeout")
        return False, "MAFFT timeout (>5 minutes)"
    except Exception as err:
        logging.error(f"MAFFT execution failed: {err}")
        return False, f"MAFFT execution failed: {str(err)}"
143
+
144
def run_iqtree_analysis(aligned_fasta, output_prefix):
    """Run an IQ-TREE maximum-likelihood analysis on an alignment.

    The executable is resolved in this order: the configured ``IQTREE_PATH``,
    ``iqtree2`` on ``PATH``, then ``iqtree`` on ``PATH``. The run uses model
    testing (``-m TEST``), 1000 ultrafast bootstrap replicates, 1000 SH-aLRT
    replicates, automatic thread detection and a ten-minute timeout.

    Args:
        aligned_fasta: Path of the aligned FASTA file to analyse.
        output_prefix: Prefix IQ-TREE uses for every output file; the tree is
            expected at ``<output_prefix>.treefile``.

    Returns:
        tuple: ``(True, tree_file_path)`` on success, otherwise
        ``(False, message)`` where ``message`` describes the failure.
    """
    try:
        # Resolve the executable: bundled binary, then iqtree2, then iqtree.
        if os.path.exists(IQTREE_PATH):
            iqtree_cmd = IQTREE_PATH
        elif shutil.which('iqtree2') is not None:
            iqtree_cmd = 'iqtree2'
        elif shutil.which('iqtree') is not None:
            iqtree_cmd = 'iqtree'
        else:
            return False, "IQ-TREE not found"

        cmd = [
            iqtree_cmd,
            '-s', aligned_fasta,
            '-m', 'TEST',      # Auto model selection
            '-bb', '1000',     # Bootstrap replicates
            '-alrt', '1000',   # SH-aLRT test
            '-nt', 'AUTO',     # Auto detect threads
            # Use the short '-pre' spelling rather than '--prefix': the
            # fallback above may select the legacy 'iqtree' (1.x) binary,
            # and '-pre' is the spelling understood by both generations.
            '-pre', output_prefix,
            '-redo',           # Overwrite existing files
        ]

        logging.info(f"Running IQ-TREE: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=600,  # seconds (10 minutes)
        )

        if result.returncode != 0:
            logging.error(f"IQ-TREE failed: {result.stderr}")
            return False, f"IQ-TREE error: {result.stderr}"

        # A zero exit status alone is not trusted; confirm the tree exists.
        tree_file = f"{output_prefix}.treefile"
        if not os.path.exists(tree_file):
            logging.error("IQ-TREE completed but tree file not found")
            return False, "Tree file not generated"

        logging.info(f"IQ-TREE analysis completed: {tree_file}")
        return True, tree_file

    except subprocess.TimeoutExpired:
        logging.error("IQ-TREE timeout")
        return False, "IQ-TREE timeout (>10 minutes)"
    except Exception as e:
        logging.error(f"IQ-TREE execution failed: {e}")
        return False, f"IQ-TREE execution failed: {str(e)}"
197
+
198
def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
    """Write a temporary multi-FASTA holding the query plus reference records.

    The query is written first. References are then taken from
    ``f_gene_sequences_aligned.fasta`` in the working directory when that
    file exists; otherwise up to 20 rows of ``analyzer.data`` whose
    ``sequence`` is longer than 50 characters are used.

    Reference records whose ID equals ``query_id`` are skipped. The reference
    file can be a previously produced alignment that already contains the
    query, and writing the same ID twice would give the downstream tools
    duplicate sequence names.

    Args:
        query_sequence: Nucleotide sequence of the query.
        query_id: FASTA identifier used for the query record.

    Returns:
        str | None: Path of the temporary FASTA file (the caller is
        responsible for deleting it), or ``None`` if it could not be built.
    """
    temp_path = None
    try:
        # delete=False: the path is handed to external tools after closing.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as temp_fasta:
            temp_path = temp_fasta.name

            # Add query sequence
            temp_fasta.write(f">{query_id}\n{query_sequence}\n")

            ref_fasta_path = "f_gene_sequences_aligned.fasta"
            if os.path.exists(ref_fasta_path):
                skip_record = False
                with open(ref_fasta_path, 'r') as ref_file:
                    for line in ref_file:
                        if line.startswith('>'):
                            header_tokens = line[1:].split()
                            skip_record = bool(header_tokens) and header_tokens[0] == query_id
                        if skip_record:
                            continue
                        # Guarantee a newline so a file lacking a trailing
                        # one cannot glue its last line to later content.
                        temp_fasta.write(line if line.endswith('\n') else line + '\n')
                logging.info(f"Added reference sequences from {ref_fasta_path}")
            elif analyzer and hasattr(analyzer, 'data'):
                # No reference file: fall back to the analyzer's table.
                count = 0
                for idx, row in analyzer.data.iterrows():
                    if 'sequence' in row and len(str(row['sequence'])) > 50:
                        seq_id = row.get('id', f"Ref_{count}")
                        sequence = str(row['sequence']).upper()
                        temp_fasta.write(f">{seq_id}\n{sequence}\n")
                        count += 1
                        if count >= 20:  # Limit to prevent too large datasets
                            break
                logging.info(f"Added {count} reference sequences from CSV")

        return temp_path

    except Exception as e:
        logging.error(f"Failed to create multi-FASTA: {e}")
        # Do not leave a half-written temporary file behind.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
        return None
233
+
234
def build_maximum_likelihood_tree(f_gene_sequence):
    """Build a maximum-likelihood tree for a sequence via MAFFT and IQ-TREE.

    Steps: verify both tools are available, write a multi-FASTA containing
    the query and reference sequences, align it with MAFFT, run IQ-TREE on
    the alignment, then copy the alignment and tree to their standard file
    names in the working directory.

    Args:
        f_gene_sequence: Query nucleotide sequence to place in the tree.

    Returns:
        tuple: ``(success, message, aligned_fasta, tree_file)``. On failure
        ``success`` is ``False`` and ``message`` explains why; the alignment
        path is still returned when only the IQ-TREE step failed.
    """
    try:
        has_mafft, has_iqtree = check_tool_availability()
        if not has_mafft:
            return False, "MAFFT not available", None, None
        if not has_iqtree:
            return False, "IQ-TREE not available", None, None

        work_dir = "ml_tree_output"
        os.makedirs(work_dir, exist_ok=True)

        # Step 1: query + references in one unaligned FASTA
        logging.info("Creating multi-FASTA file...")
        unaligned_path = create_multi_fasta_with_query(f_gene_sequence)
        if not unaligned_path:
            return False, "Failed to create input FASTA", None, None

        # Step 2: alignment
        logging.info("Running MAFFT alignment...")
        aligned_fasta = os.path.join(work_dir, "aligned_sequences.fasta")
        aligned_ok, mafft_detail = run_mafft_alignment(unaligned_path, aligned_fasta)

        # The temporary input is no longer needed, whatever the outcome.
        os.unlink(unaligned_path)

        if not aligned_ok:
            return False, f"MAFFT failed: {mafft_detail}", None, None

        # Step 3: tree inference
        logging.info("Running IQ-TREE analysis...")
        tree_prefix = os.path.join(work_dir, "ml_tree")
        tree_ok, iqtree_detail = run_iqtree_analysis(aligned_fasta, tree_prefix)
        if not tree_ok:
            return False, f"IQ-TREE failed: {iqtree_detail}", aligned_fasta, None

        # Step 4: publish outputs under the standard names
        tree_file = iqtree_detail
        log_file = f"{tree_prefix}.log"
        for produced, standard_name in (
            (aligned_fasta, "f_gene_sequences_aligned.fasta"),
            (tree_file, "f_gene_sequences.phy.treefile"),
        ):
            if os.path.exists(produced):
                shutil.copy2(produced, standard_name)

        report = [
            "✅ Maximum likelihood tree built successfully!\n",
            f"- Alignment: {os.path.basename(aligned_fasta)}\n",
            f"- Tree: {os.path.basename(tree_file)}\n",
        ]

        if os.path.exists(log_file):
            with open(log_file, 'r') as log_handle:
                log_content = log_handle.read()
            # Surface the selected substitution model when the log names it.
            if "Best-fit model:" in log_content:
                model_line = next(
                    entry for entry in log_content.split('\n')
                    if "Best-fit model:" in entry
                )
                report.append(f"- {model_line.strip()}\n")

        success_msg = "".join(report)
        logging.info("Maximum likelihood tree construction completed")
        return True, success_msg, aligned_fasta, tree_file

    except Exception as err:
        logging.error(f"ML tree construction failed: {err}")
        return False, f"ML tree construction failed: {str(err)}", None, None
305
+
306
  # --- Tree Analysis Function (Based on old Gradio API) ---
307
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
308
  """
 
413
  return ""
414
 
415
  # --- Full Pipeline ---
416
def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree=False):
    """Run the full pipeline on the sequence read from an uploaded FASTA file.

    Args:
        fasta_file_obj: Uploaded file object passed to ``read_fasta_file``.
        similarity_score: Similarity threshold forwarded to ``run_pipeline``.
        build_ml_tree: Whether to also build the MAFFT + IQ-TREE tree.
            Defaults to ``False``, matching ``run_pipeline``, so callers that
            pass only two arguments keep working.

    Returns:
        tuple: The nine-element result of ``run_pipeline``. If the file
        cannot be read or an exception occurs, a nine-element tuple carrying
        the error text in the first and last positions.
    """
    try:
        dna_input = read_fasta_file(fasta_file_obj)
        if not dna_input:
            return "Failed to read FASTA file", "", "", "", "", None, None, None, "No input sequence"
        return run_pipeline(dna_input, similarity_score, build_ml_tree)
    except Exception as e:
        # Top-level UI boundary: report the failure instead of raising.
        error_msg = f"Pipeline error: {str(e)}"
        logging.error(error_msg)
        return error_msg, "", "", "", "", None, None, None, error_msg
426
 
427
+ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
428
  try:
429
  # Clean input
430
  dna_input = dna_input.upper().strip()
431
  if not dna_input:
432
+ return "Empty input", "", "", "", "", None, None, None, "No input provided"
433
 
434
  # Sanitize DNA sequence
435
  if not re.match('^[ACTGN]+$', dna_input):
 
475
  else:
476
  keras_output = "Skipped: sequence too short for F gene validation"
477
 
478
+ # Step 3: Maximum Likelihood Tree (MAFFT + IQ-TREE)
479
  aligned_file = None
480
  phy_file = None
481
+ ml_tree_output = ""
482
 
483
+ if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
484
+ try:
485
+ logging.info("Starting maximum likelihood tree construction...")
486
+ ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
487
+
488
+ if ml_success:
489
+ ml_tree_output = ml_message
490
+ aligned_file = ml_aligned
491
+ phy_file = ml_tree
492
+ else:
493
+ ml_tree_output = f"❌ ML Tree failed: {ml_message}"
494
+
495
+ except Exception as e:
496
+ ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
497
+ logging.error(f"ML Tree failed: {e}")
498
+ elif build_ml_tree:
499
+ ml_tree_output = "❌ F gene sequence too short for ML tree construction (minimum 50 bp)"
500
+ else:
501
+ ml_tree_output = "ML tree construction skipped (not requested)"
502
 
503
+ # Step 4: ML Simplified Tree (using the existing approach)
504
  html_file = None
505
  tree_html_content = "No tree generated"
506
+ simplified_ml_output = ""
507
 
508
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
509
  try:
510
+ logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
511
 
512
+ # Use the existing tree analysis function with user-specified similarity
513
  tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
514
 
515
  if tree_result and not tree_result.startswith("Error:"):
516
  # Success - we have HTML content
517
  tree_html_content = tree_result
518
+ simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
519
 
520
  # Check if HTML file was created
521
  output_dir = "output"
 
523
  html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
524
  if html_files:
525
  html_file = os.path.join(output_dir, html_files[-1]) # Get the latest
526
+ simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
527
 
528
  # Count sequences analyzed
529
  if analyzer.find_query_sequence(processed_sequence):
530
  matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
531
+ simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
532
+ simplified_ml_output += f"\n- Similarity threshold: {perc:.1f}%"
533
  else:
534
  # Error occurred
535
+ simplified_ml_output = f"❌ Simplified tree analysis failed: {tree_result}"
536
+ logging.error(f"Simplified tree analysis failed: {tree_result}")
537
 
538
  except Exception as e:
539
+ simplified_ml_output = f"❌ Simplified ML Tree analysis failed: {str(e)}"
540
+ logging.error(f"Simplified ML Tree failed: {e}")
541
  import traceback
542
  logging.error(f"Full traceback: {traceback.format_exc()}")
543
  elif not analyzer:
544
+ simplified_ml_output = "❌ Tree analyzer not initialized"
545
  elif not processed_sequence or len(processed_sequence) < 10:
546
+ simplified_ml_output = f"❌ F gene sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
547
  else:
548
+ simplified_ml_output = "❌ Skipped due to previous step errors"
549
 
550
  return (
551
  boundary_output,
552
  keras_output[:500] + "..." if len(keras_output) > 500 else keras_output,
553
  csv_path if os.path.exists(csv_path) else "CSV file not found",
554
+ ml_tree_output,
555
+ simplified_ml_output,
556
  html_file,
557
  aligned_file if aligned_file and os.path.exists(aligned_file) else None,
558
  phy_file if phy_file and os.path.exists(phy_file) else None,
 
564
  logging.error(error_msg)
565
  import traceback
566
  logging.error(f"Full traceback: {traceback.format_exc()}")
567
+ return error_msg, "", "", "", "", None, None, None, error_msg
568
 
569
  # --- Gradio UI ---
570
  with gr.Blocks(title="Viral Gene Phylogenetic Pipeline", theme=gr.themes.Soft()) as demo:
571
  gr.Markdown("# 🧬 Viral Gene Phylogenetic Inference Pipeline")
572
+ gr.Markdown("This pipeline processes DNA sequences through boundary detection, k-mer analysis, and phylogenetic tree construction using both simplified ML and full maximum likelihood approaches.")
573
 
574
  with gr.Tab("📝 Paste DNA Sequence"):
575
  with gr.Row():
576
+ with gr.Column(scale=2):
577
  inp = gr.Textbox(
578
  label="DNA Input",
579
  placeholder="Paste your DNA sequence here (ACTG format)",
 
588
  label="Similarity Threshold (%)",
589
  info="Higher values = more similar sequences"
590
  )
591
+ ml_tree_checkbox = gr.Checkbox(
592
+ label="Build Maximum Likelihood Tree",
593
+ value=False,
594
+ info="Use MAFFT + IQ-TREE (slower but more accurate)"
595
+ )
596
  btn1 = gr.Button("🚀 Run Pipeline", variant="primary", size="lg")
597
 
598
  with gr.Tab("📁 Upload FASTA File"):
599
  with gr.Row():
600
+ with gr.Column(scale=2):
601
  file_input = gr.File(
602
  label="FASTA File",
603
  file_types=['.fasta', '.fa', '.txt']
 
611
  label="Similarity Threshold (%)",
612
  info="Higher values = more similar sequences"
613
  )
614
+ ml_tree_checkbox_file = gr.Checkbox(
615
+ label="Build Maximum Likelihood Tree",
616
+ value=False,
617
+ info="Use MAFFT + IQ-TREE (slower but more accurate)"
618
+ )
619
  btn2 = gr.Button("🚀 Run on FASTA", variant="primary", size="lg")
620
 
621
  # Outputs
 
625
  with gr.Column():
626
  out1 = gr.Textbox(label="🎯 Step 1: Extracted F Gene Sequence", lines=8)
627
  out2 = gr.Textbox(label="🔍 Step 2: F Gene Validation (Keras)", lines=3)
 
628
  out3 = gr.Textbox(label="📋 Dataset Used")
629
+ with gr.Column():
630
+ out4 = gr.Textbox(label="🌳 Step 3: Maximum Likelihood Tree (MAFFT+IQ-TREE)", lines=5)
631
+ out5 = gr.Textbox(label="🌿 Step 4: Simplified ML Tree Status", lines=5)
632
 
633
  with gr.Row():
634
+ html = gr.File(label="📥 Download Interactive Tree (HTML)")
635
  fasta = gr.File(label="📥 Download Aligned FASTA")
636
+ phy = gr.File(label="📥 Download ML Tree File")
637
 
638
  with gr.Row():
639
  tree_html = gr.HTML(label="🌳 Interactive Tree Preview")
 
641
  # Event handlers
642
  btn1.click(
643
  fn=run_pipeline,
644
+ inputs=[inp, similarity_input, ml_tree_checkbox],
645
+ outputs=[out1, out2, out3, out4, out5, html, fasta, phy, tree_html]
646
  )
647
  btn2.click(
648
  fn=run_pipeline_from_file,
649
+ inputs=[file_input, similarity_input_file, ml_tree_checkbox_file],
650
+ outputs=[out1, out2, out3, out4, out5, html, fasta, phy, tree_html]
651
  )
652
 
653
  if __name__ == '__main__':