re-type committed on
Commit
6a65f2a
·
verified ·
1 Parent(s): 740aa59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +536 -166
app.py CHANGED
@@ -13,6 +13,8 @@ from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
14
  import tempfile
15
  import shutil
 
 
16
 
17
  # --- Global Variables ---
18
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
@@ -97,31 +99,102 @@ except Exception as e:
97
  logging.error(f"Failed to initialize tree analyzer: {e}")
98
  analyzer = None
99
 
100
- # --- Helper Functions ---
101
def check_tool_availability():
    """Report whether MAFFT and IQ-TREE can be located.

    A tool counts as available when its configured path (``MAFFT_PATH`` or
    ``IQTREE_PATH``) exists on disk, or when one of its executable names is
    found on ``PATH``.

    Returns:
        tuple[bool, bool]: ``(mafft_available, iqtree_available)``.
    """
    # Configured location first, PATH lookup only as a fallback.
    if os.path.exists(MAFFT_PATH):
        mafft_available = True
    else:
        mafft_available = shutil.which('mafft') is not None

    if os.path.exists(IQTREE_PATH):
        iqtree_available = True
    else:
        # IQ-TREE may be installed under either binary name.
        iqtree_available = any(
            shutil.which(binary_name) is not None
            for binary_name in ('iqtree2', 'iqtree')
        )

    return mafft_available, iqtree_available
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- def run_mafft_alignment(input_fasta, output_fasta):
109
- """Run MAFFT alignment on input FASTA file"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  try:
111
- # Check if MAFFT is available
112
- mafft_cmd = MAFFT_PATH if os.path.exists(MAFFT_PATH) else 'mafft'
113
-
114
- # MAFFT command
115
- cmd = [mafft_cmd, '--auto', input_fasta]
 
 
116
 
117
  logging.info(f"Running MAFFT: {' '.join(cmd)}")
118
 
119
- # Run MAFFT
120
  result = subprocess.run(
121
  cmd,
122
  capture_output=True,
123
  text=True,
124
- timeout=300 # 5 minute timeout
 
125
  )
126
 
127
  if result.returncode == 0:
@@ -129,72 +202,105 @@ def run_mafft_alignment(input_fasta, output_fasta):
129
  with open(output_fasta, 'w') as f:
130
  f.write(result.stdout)
131
  logging.info(f"MAFFT alignment completed: {output_fasta}")
132
- return True, output_fasta
 
 
 
 
 
133
  else:
134
- logging.error(f"MAFFT failed: {result.stderr}")
135
- return False, f"MAFFT error: {result.stderr}"
 
136
 
137
  except subprocess.TimeoutExpired:
138
  logging.error("MAFFT timeout")
139
- return False, "MAFFT timeout (>5 minutes)"
 
 
140
  except Exception as e:
141
  logging.error(f"MAFFT execution failed: {e}")
142
  return False, f"MAFFT execution failed: {str(e)}"
143
 
144
def run_iqtree_analysis(aligned_fasta, output_prefix):
    """Build a maximum likelihood tree from an alignment using IQ-TREE.

    The executable is ``IQTREE_PATH`` when that path exists, otherwise the
    first of ``iqtree2`` / ``iqtree`` found on ``PATH``.

    Args:
        aligned_fasta: Path of the aligned FASTA file handed to ``-s``.
        output_prefix: Prefix for IQ-TREE output files; the tree is expected
            at ``<output_prefix>.treefile``.

    Returns:
        tuple[bool, str]: ``(True, tree_file)`` on success, otherwise
        ``(False, message)`` describing the failure. Exceptions are caught
        and reported through the message.
    """
    try:
        # Pick the executable: configured path first, then PATH lookups.
        iqtree_cmd = None
        if os.path.exists(IQTREE_PATH):
            iqtree_cmd = IQTREE_PATH
        else:
            for binary_name in ('iqtree2', 'iqtree'):
                if shutil.which(binary_name) is not None:
                    iqtree_cmd = binary_name
                    break
        if iqtree_cmd is None:
            return False, "IQ-TREE not found"

        cmd = [iqtree_cmd, '-s', aligned_fasta]
        cmd += ['-m', 'TEST']             # Auto model selection
        cmd += ['-bb', '1000']            # Bootstrap replicates
        cmd += ['-alrt', '1000']          # SH-aLRT test
        cmd += ['-nt', 'AUTO']            # Auto detect threads
        cmd += ['--prefix', output_prefix]
        cmd.append('-redo')               # Overwrite existing files

        logging.info(f"Running IQ-TREE: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout
        )

        if result.returncode != 0:
            logging.error(f"IQ-TREE failed: {result.stderr}")
            return False, f"IQ-TREE error: {result.stderr}"

        # A zero exit status is not trusted on its own; the tree must exist.
        tree_file = f"{output_prefix}.treefile"
        if not os.path.exists(tree_file):
            logging.error("IQ-TREE completed but tree file not found")
            return False, "Tree file not generated"

        logging.info(f"IQ-TREE analysis completed: {tree_file}")
        return True, tree_file

    except subprocess.TimeoutExpired:
        logging.error("IQ-TREE timeout")
        return False, "IQ-TREE timeout (>10 minutes)"
    except Exception as e:
        logging.error(f"IQ-TREE execution failed: {e}")
        return False, f"IQ-TREE execution failed: {e!s}"
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
199
  """Create a multi-FASTA file with query sequence and reference sequences"""
200
  try:
@@ -232,16 +338,68 @@ def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
232
  return None
233
 
234
  def build_maximum_likelihood_tree(f_gene_sequence):
235
- """Build maximum likelihood phylogenetic tree using MAFFT + IQ-TREE"""
236
  try:
237
- # Check tool availability
238
- mafft_available, iqtree_available = check_tool_availability()
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  if not mafft_available:
241
- return False, "MAFFT not available", None, None
 
242
  if not iqtree_available:
243
- return False, "IQ-TREE not available", None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
 
245
  # Create output directory
246
  output_dir = "ml_tree_output"
247
  os.makedirs(output_dir, exist_ok=True)
@@ -250,26 +408,26 @@ def build_maximum_likelihood_tree(f_gene_sequence):
250
  logging.info("Creating multi-FASTA file...")
251
  multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
252
  if not multi_fasta:
253
- return False, "Failed to create input FASTA", None, None
254
 
255
  # Step 2: Run MAFFT alignment
256
  logging.info("Running MAFFT alignment...")
257
  aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
258
- mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta)
259
 
260
  # Clean up temporary file
261
  os.unlink(multi_fasta)
262
 
263
  if not mafft_success:
264
- return False, f"MAFFT failed: {mafft_result}", None, None
265
 
266
  # Step 3: Run IQ-TREE analysis
267
  logging.info("Running IQ-TREE analysis...")
268
  tree_prefix = os.path.join(output_dir, "ml_tree")
269
- iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix)
270
 
271
  if not iqtree_success:
272
- return False, f"IQ-TREE failed: {iqtree_result}", aligned_fasta, None
273
 
274
  # Step 4: Prepare output files
275
  tree_file = iqtree_result
@@ -284,17 +442,21 @@ def build_maximum_likelihood_tree(f_gene_sequence):
284
  if os.path.exists(tree_file):
285
  shutil.copy2(tree_file, standard_tree)
286
 
287
- success_msg = f"✅ Maximum likelihood tree built successfully!\n"
288
  success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
289
  success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
290
 
291
  if os.path.exists(log_file):
292
- with open(log_file, 'r') as f:
293
- log_content = f.read()
294
- # Extract model information
295
- if "Best-fit model:" in log_content:
296
- model_line = [line for line in log_content.split('\n') if "Best-fit model:" in line][0]
297
- success_msg += f"- {model_line.strip()}\n"
 
 
 
 
298
 
299
  logging.info("Maximum likelihood tree construction completed")
300
  return True, success_msg, aligned_fasta, tree_file
@@ -310,7 +472,7 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
310
  """
311
  try:
312
  if not analyzer:
313
- return "Error: Tree analyzer not initialized."
314
 
315
  if not sequence:
316
  return "Error: Please provide a sequence."
@@ -329,7 +491,7 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
329
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
330
 
331
  if not matched_ids:
332
- return f"No similar sequences found at {matching_percentage}% similarity."
333
 
334
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
335
 
@@ -490,7 +652,7 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
490
  aligned_file = ml_aligned
491
  phy_file = ml_tree
492
  else:
493
- ml_tree_output = f"❌ ML Tree failed: {ml_message}"
494
 
495
  except Exception as e:
496
  ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
@@ -531,124 +693,332 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
531
  simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
532
  simplified_ml_output += f"\n- Similarity threshold: {perc:.1f}%"
533
  else:
534
- # Error occurred
535
- simplified_ml_output = f"❌ Simplified tree analysis failed: {tree_result}"
536
- logging.error(f"Simplified tree analysis failed: {tree_result}")
537
-
538
  except Exception as e:
539
- simplified_ml_output = f"Simplified ML Tree analysis failed: {str(e)}"
540
- logging.error(f"Simplified ML Tree failed: {e}")
541
- import traceback
542
- logging.error(f"Full traceback: {traceback.format_exc()}")
543
- elif not analyzer:
544
- simplified_ml_output = "❌ Tree analyzer not initialized"
545
- elif not processed_sequence or len(processed_sequence) < 10:
546
- simplified_ml_output = f"❌ F gene sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
547
  else:
548
- simplified_ml_output = "❌ Skipped due to previous step errors"
 
 
 
549
 
 
550
  return (
551
- boundary_output,
552
- keras_output[:500] + "..." if len(keras_output) > 500 else keras_output,
553
- csv_path if os.path.exists(csv_path) else "CSV file not found",
554
- ml_tree_output,
555
- simplified_ml_output,
556
- html_file,
557
- aligned_file if aligned_file and os.path.exists(aligned_file) else None,
558
- phy_file if phy_file and os.path.exists(phy_file) else None,
559
- tree_html_content
560
  )
561
 
562
  except Exception as e:
563
- error_msg = f"Pipeline failed: {str(e)}"
564
  logging.error(error_msg)
565
  import traceback
566
  logging.error(f"Full traceback: {traceback.format_exc()}")
567
- return error_msg, "", "", "", "", None, None, None, error_msg
 
 
 
568
 
569
- # --- Gradio UI ---
570
- with gr.Blocks(title="Viral Gene Phylogenetic Pipeline", theme=gr.themes.Soft()) as demo:
571
- gr.Markdown("# 🧬 Viral Gene Phylogenetic Inference Pipeline")
572
- gr.Markdown("This pipeline processes DNA sequences through boundary detection, k-mer analysis, and phylogenetic tree construction using both simplified ML and full maximum likelihood approaches.")
573
-
574
- with gr.Tab("📝 Paste DNA Sequence"):
575
- with gr.Row():
576
- with gr.Column(scale=2):
577
- inp = gr.Textbox(
578
- label="DNA Input",
579
- placeholder="Paste your DNA sequence here (ACTG format)",
580
- lines=5
581
- )
582
- with gr.Column(scale=1):
583
- similarity_input = gr.Slider(
584
- minimum=50,
585
- maximum=99,
586
- step=1,
587
- value=95,
588
- label="Similarity Threshold (%)",
589
- info="Higher values = more similar sequences"
590
- )
591
- ml_tree_checkbox = gr.Checkbox(
592
- label="Build Maximum Likelihood Tree",
593
- value=False,
594
- info="Use MAFFT + IQ-TREE (slower but more accurate)"
595
- )
596
- btn1 = gr.Button("🚀 Run Pipeline", variant="primary", size="lg")
597
-
598
- with gr.Tab("📁 Upload FASTA File"):
599
- with gr.Row():
600
- with gr.Column(scale=2):
601
- file_input = gr.File(
602
- label="FASTA File",
603
- file_types=['.fasta', '.fa', '.txt']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
  )
605
- with gr.Column(scale=1):
606
- similarity_input_file = gr.Slider(
607
- minimum=50,
608
- maximum=99,
609
- step=1,
610
- value=95,
611
- label="Similarity Threshold (%)",
612
- info="Higher values = more similar sequences"
613
  )
614
- ml_tree_checkbox_file = gr.Checkbox(
615
- label="Build Maximum Likelihood Tree",
616
- value=False,
617
- info="Use MAFFT + IQ-TREE (slower but more accurate)"
618
  )
619
- btn2 = gr.Button("🚀 Run on FASTA", variant="primary", size="lg")
620
-
621
- # Outputs
622
- gr.Markdown("## 📊 Pipeline Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
 
624
- with gr.Row():
625
- with gr.Column():
626
- out1 = gr.Textbox(label="🎯 Step 1: Extracted F Gene Sequence", lines=8)
627
- out2 = gr.Textbox(label="🔍 Step 2: F Gene Validation (Keras)", lines=3)
628
- out3 = gr.Textbox(label="📋 Dataset Used")
629
- with gr.Column():
630
- out4 = gr.Textbox(label="🌳 Step 3: Maximum Likelihood Tree (MAFFT+IQ-TREE)", lines=5)
631
- out5 = gr.Textbox(label="🌿 Step 4: Simplified ML Tree Status", lines=5)
632
-
633
- with gr.Row():
634
- html = gr.File(label="📥 Download Interactive Tree (HTML)")
635
- fasta = gr.File(label="📥 Download Aligned FASTA")
636
- phy = gr.File(label="📥 Download ML Tree File")
637
-
638
- with gr.Row():
639
- tree_html = gr.HTML(label="🌳 Interactive Tree Preview")
640
-
641
- # Event handlers
642
- btn1.click(
643
- fn=run_pipeline,
644
- inputs=[inp, similarity_input, ml_tree_checkbox],
645
- outputs=[out1, out2, out3, out4, out5, html, fasta, phy, tree_html]
646
- )
647
- btn2.click(
648
- fn=run_pipeline_from_file,
649
- inputs=[file_input, similarity_input_file, ml_tree_checkbox_file],
650
- outputs=[out1, out2, out3, out4, out5, html, fasta, phy, tree_html]
651
- )
652
 
653
- if __name__ == '__main__':
654
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  import ml_simplified_tree
14
  import tempfile
15
  import shutil
16
+ import sys
17
+ from pathlib import Path
18
 
19
  # --- Global Variables ---
20
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
 
99
  logging.error(f"Failed to initialize tree analyzer: {e}")
100
  analyzer = None
101
 
102
+ # --- Enhanced Tool Detection ---
103
def _resolve_executable(candidates, tool_name):
    """Return the first candidate that resolves to a runnable executable, else None."""
    for candidate in candidates:
        if not candidate:
            continue
        # shutil.which() searches PATH for bare names and, for explicit paths,
        # requires an existing, executable, non-directory file. A plain
        # os.path.exists() check would also accept directories (for example a
        # local "mafft/" folder matching the bare name "mafft") and files that
        # cannot be executed.
        resolved = shutil.which(candidate)
        if resolved is not None:
            logging.info(f"Found {tool_name} at: {resolved}")
            return resolved
    return None


def check_tool_availability():
    """Locate runnable MAFFT and IQ-TREE executables.

    For each tool the configured path (``MAFFT_PATH`` / ``IQTREE_PATH``) is
    tried first, followed by common command names and install locations.
    The first candidate that resolves to an executable file is used.

    Returns:
        tuple: ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)``
        where the ``*_available`` values are booleans and the ``*_cmd``
        values are the resolved executable path, or ``None`` when the tool
        was not found.
    """
    mafft_candidates = [
        MAFFT_PATH,
        'mafft',
        '/usr/bin/mafft',
        '/usr/local/bin/mafft',
        'mafft.bat',  # Windows
    ]
    iqtree_candidates = [
        IQTREE_PATH,
        'iqtree2',
        'iqtree',
        '/usr/bin/iqtree2',
        '/usr/local/bin/iqtree2',
        '/usr/bin/iqtree',
        '/usr/local/bin/iqtree',
        'iqtree2.exe',  # Windows
        'iqtree.exe',   # Windows
    ]

    mafft_cmd = _resolve_executable(mafft_candidates, "MAFFT")
    iqtree_cmd = _resolve_executable(iqtree_candidates, "IQ-TREE")

    return mafft_cmd is not None, iqtree_cmd is not None, mafft_cmd, iqtree_cmd
151
+
152
def install_dependencies_guide():
    """Return plain-text installation instructions for MAFFT and IQ-TREE.

    The text covers system package managers, conda and Docker, and is meant
    to be shown to the user when one of the external tools is missing.
    """
    guide_lines = (
        "",
        "🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:",
        "",
        "For MAFFT:",
        "- Ubuntu/Debian: sudo apt-get install mafft",
        "- CentOS/RHEL: sudo yum install mafft",
        "- macOS: brew install mafft",
        "- Windows: Download from https://mafft.cbrc.jp/alignment/software/",
        "",
        "For IQ-TREE:",
        "- Ubuntu/Debian: sudo apt-get install iqtree",
        "- CentOS/RHEL: sudo yum install iqtree",
        "- macOS: brew install iqtree",
        "- Windows: Download from http://www.iqtree.org/",
        "",
        "Alternative: Use conda/mamba:",
        "- conda install -c bioconda mafft iqtree",
        "",
        "Docker option:",
        "- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0",
        "- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0",
        "",
    )
    # The empty first and last entries reproduce the leading and trailing
    # newline of the original triple-quoted literal.
    return "\n".join(guide_lines)
177
+
178
+ def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
179
+ """Run MAFFT alignment with enhanced error handling"""
180
  try:
181
+ # MAFFT command with more robust options
182
+ cmd = [
183
+ mafft_cmd,
184
+ '--auto', # Automatic strategy selection
185
+ '--quiet', # Reduce output verbosity
186
+ input_fasta
187
+ ]
188
 
189
  logging.info(f"Running MAFFT: {' '.join(cmd)}")
190
 
191
+ # Run MAFFT with enhanced error handling
192
  result = subprocess.run(
193
  cmd,
194
  capture_output=True,
195
  text=True,
196
+ timeout=600, # Increased timeout to 10 minutes
197
+ cwd=os.getcwd() # Ensure working directory is set
198
  )
199
 
200
  if result.returncode == 0:
 
202
  with open(output_fasta, 'w') as f:
203
  f.write(result.stdout)
204
  logging.info(f"MAFFT alignment completed: {output_fasta}")
205
+
206
+ # Verify output file
207
+ if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
208
+ return True, output_fasta
209
+ else:
210
+ return False, "MAFFT completed but output file is empty"
211
  else:
212
+ error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
213
+ logging.error(f"MAFFT failed: {error_msg}")
214
+ return False, f"MAFFT error: {error_msg}"
215
 
216
  except subprocess.TimeoutExpired:
217
  logging.error("MAFFT timeout")
218
+ return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
219
+ except FileNotFoundError:
220
+ return False, f"MAFFT executable not found: {mafft_cmd}"
221
  except Exception as e:
222
  logging.error(f"MAFFT execution failed: {e}")
223
  return False, f"MAFFT execution failed: {str(e)}"
224
 
225
def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
    """Infer a maximum-likelihood tree from an alignment with IQ-TREE.

    Args:
        aligned_fasta: Path of the aligned FASTA file handed to ``-s``.
        output_prefix: Prefix for IQ-TREE output files (``--prefix``); the
            tree is expected at ``<output_prefix>.treefile``.
        iqtree_cmd: Name or path of the IQ-TREE executable to run.

    Returns:
        tuple[bool, str]: ``(True, tree_file)`` on success, otherwise
        ``(False, message)`` describing the failure. Failures (non-zero
        exit, missing/empty tree file, timeout, missing executable, any
        other exception) are logged and reported through the message
        rather than raised.
    """
    try:
        cmd = [
            iqtree_cmd,
            '-s', aligned_fasta,
            '-m', 'MFP',        # ModelFinder Plus for automatic model selection
            '-bb', '1000',      # Bootstrap replicates
            '-alrt', '1000',    # SH-aLRT test
            '-nt', 'AUTO',      # Auto detect threads
            '--prefix', output_prefix,
            '-redo',            # Overwrite existing files
            '--quiet',          # Reduce verbosity
        ]

        logging.info(f"Running IQ-TREE: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=1200,  # 20 minute timeout for larger datasets
            cwd=os.getcwd(),
        )

        if result.returncode != 0:
            # Prefer stderr, but fall back to stdout so the diagnostic is not
            # lost when the tool reports its error there instead.
            error_msg = (
                (result.stderr or "").strip()
                or (result.stdout or "").strip()
                or "Unknown IQ-TREE error"
            )
            logging.error(f"IQ-TREE failed: {error_msg}")
            return False, f"IQ-TREE error: {error_msg}"

        # A zero exit status is not trusted on its own: the tree file must
        # exist and be non-empty.
        tree_file = f"{output_prefix}.treefile"
        if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
            logging.info(f"IQ-TREE analysis completed: {tree_file}")
            return True, tree_file

        logging.error("IQ-TREE completed but tree file not found or empty")
        return False, "Tree file not generated or empty"

    except subprocess.TimeoutExpired:
        logging.error("IQ-TREE timeout")
        return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
    except FileNotFoundError:
        # Logged like every other failure path so the cause shows up in the
        # application log, not only in the returned message.
        logging.error(f"IQ-TREE executable not found: {iqtree_cmd}")
        return False, f"IQ-TREE executable not found: {iqtree_cmd}"
    except Exception as e:
        logging.error(f"IQ-TREE execution failed: {e}")
        return False, f"IQ-TREE execution failed: {str(e)}"
273
 
274
def _newick_label(name):
    """Return *name* as a Newick leaf label, single-quoted when it needs it."""
    text = str(name)
    needs_quotes = (
        not text
        or any(ch.isspace() or ch in "()[]':;," for ch in text)
    )
    if needs_quotes:
        # Inside a quoted Newick label a single quote is escaped by doubling.
        return "'" + text.replace("'", "''") + "'"
    return text


def create_simple_neighbor_joining_tree(sequences_dict, max_leaves=5):
    """Write a placeholder star tree for use when ML tools are unavailable.

    This is a simplified stand-in, not a real neighbor-joining
    implementation: no distances are computed. Every included sequence
    becomes a leaf attached to a single root with a fixed branch length of
    0.1. The Newick string is written to ``simple_tree.nwk`` in the current
    working directory, overwriting any existing file.

    Args:
        sequences_dict: Mapping of sequence name to sequence. Only the names
            (in iteration order) are used.
        max_leaves: Maximum number of leading names placed in the tree
            (default 5, the cap previously hard-coded). ``None`` includes
            every sequence; values below 2 are treated as 2.

    Returns:
        tuple: ``(tree_file, message)`` on success, or ``(None, message)``
        when fewer than two sequences are given or writing fails.
    """
    try:
        seq_names = list(sequences_dict.keys())

        if len(seq_names) < 2:
            return None, "Need at least 2 sequences for tree construction"

        if max_leaves is not None:
            seq_names = seq_names[:max(2, max_leaves)]

        # Labels are quoted where necessary so FASTA headers containing
        # spaces, commas, colons or parentheses still yield valid Newick.
        leaves = ",".join(f"{_newick_label(name)}:0.1" for name in seq_names)
        tree_str = f"({leaves});"

        tree_file = "simple_tree.nwk"
        with open(tree_file, 'w') as f:
            f.write(tree_str)

        return tree_file, "Simple distance-based tree created"

    except Exception as e:
        return None, f"Simple tree creation failed: {str(e)}"
303
+
304
  def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
305
  """Create a multi-FASTA file with query sequence and reference sequences"""
306
  try:
 
338
  return None
339
 
340
  def build_maximum_likelihood_tree(f_gene_sequence):
341
+ """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
342
  try:
343
+ # Check tool availability with enhanced detection
344
+ mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
345
 
346
+ # Prepare status message
347
+ status_msg = "🔍 Checking dependencies...\n"
348
+
349
+ if not mafft_available:
350
+ status_msg += "❌ MAFFT not found\n"
351
+ else:
352
+ status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
353
+
354
+ if not iqtree_available:
355
+ status_msg += "❌ IQ-TREE not found\n"
356
+ else:
357
+ status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
358
+
359
+ # If neither tool is available, provide installation guide
360
+ if not mafft_available and not iqtree_available:
361
+ guide = install_dependencies_guide()
362
+ return False, f"{status_msg}\n{guide}", None, None
363
+
364
+ # If only one tool is missing, provide specific guidance
365
  if not mafft_available:
366
+ return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None
367
+
368
  if not iqtree_available:
369
+ status_msg += "\n⚠️ IQ-TREE not available. Attempting simple tree construction...\n"
370
+
371
+ # Try to create a simple tree as fallback
372
+ multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
373
+ if multi_fasta:
374
+ # Read sequences
375
+ sequences = {}
376
+ current_seq = ""
377
+ current_name = ""
378
+
379
+ with open(multi_fasta, 'r') as f:
380
+ for line in f:
381
+ line = line.strip()
382
+ if line.startswith('>'):
383
+ if current_name and current_seq:
384
+ sequences[current_name] = current_seq
385
+ current_name = line[1:]
386
+ current_seq = ""
387
+ else:
388
+ current_seq += line
389
+ if current_name and current_seq:
390
+ sequences[current_name] = current_seq
391
+
392
+ simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
393
+ os.unlink(multi_fasta)
394
+
395
+ if simple_tree:
396
+ return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
397
+ else:
398
+ return False, f"{status_msg}❌ {simple_msg}", None, None
399
+ else:
400
+ return False, f"{status_msg}❌ Failed to create input sequences", None, None
401
 
402
+ # Both tools available - proceed with full ML analysis
403
  # Create output directory
404
  output_dir = "ml_tree_output"
405
  os.makedirs(output_dir, exist_ok=True)
 
408
  logging.info("Creating multi-FASTA file...")
409
  multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
410
  if not multi_fasta:
411
+ return False, f"{status_msg}❌ Failed to create input FASTA", None, None
412
 
413
  # Step 2: Run MAFFT alignment
414
  logging.info("Running MAFFT alignment...")
415
  aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
416
+ mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
417
 
418
  # Clean up temporary file
419
  os.unlink(multi_fasta)
420
 
421
  if not mafft_success:
422
+ return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
423
 
424
  # Step 3: Run IQ-TREE analysis
425
  logging.info("Running IQ-TREE analysis...")
426
  tree_prefix = os.path.join(output_dir, "ml_tree")
427
+ iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
428
 
429
  if not iqtree_success:
430
+ return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
431
 
432
  # Step 4: Prepare output files
433
  tree_file = iqtree_result
 
442
  if os.path.exists(tree_file):
443
  shutil.copy2(tree_file, standard_tree)
444
 
445
+ success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
446
  success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
447
  success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
448
 
449
  if os.path.exists(log_file):
450
+ try:
451
+ with open(log_file, 'r') as f:
452
+ log_content = f.read()
453
+ # Extract model information
454
+ if "Best-fit model:" in log_content:
455
+ model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
456
+ if model_lines:
457
+ success_msg += f"- {model_lines[0].strip()}\n"
458
+ except Exception as e:
459
+ logging.warning(f"Could not read log file: {e}")
460
 
461
  logging.info("Maximum likelihood tree construction completed")
462
  return True, success_msg, aligned_fasta, tree_file
 
472
  """
473
  try:
474
  if not analyzer:
475
+ return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
476
 
477
  if not sequence:
478
  return "Error: Please provide a sequence."
 
491
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
492
 
493
  if not matched_ids:
494
+ return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
495
 
496
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
497
 
 
652
  aligned_file = ml_aligned
653
  phy_file = ml_tree
654
  else:
655
+ ml_tree_output = ml_message # This now includes detailed error information
656
 
657
  except Exception as e:
658
  ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
 
693
  simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
694
  simplified_ml_output += f"\n- Similarity threshold: {perc:.1f}%"
695
  else:
696
+ simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
697
+ tree_html_content = f"<p>Error: {tree_result}</p>"
698
+
 
699
  except Exception as e:
700
+ logging.error(f"Simplified ML tree analysis failed: {e}")
701
+ simplified_ml_output = f"Simplified ML tree analysis failed: {str(e)}"
702
+ tree_html_content = f"<p>Error: {str(e)}</p>"
 
 
 
 
 
703
  else:
704
+ if not analyzer:
705
+ simplified_ml_output = "❌ Tree analyzer not available"
706
+ else:
707
+ simplified_ml_output = "❌ F gene sequence too short for tree analysis (minimum 10 bp)"
708
 
709
+ # Return all results
710
  return (
711
+ boundary_output, # F gene extraction result
712
+ keras_output, # F gene validation result
713
+ ml_tree_output, # ML tree construction status
714
+ simplified_ml_output, # Simplified tree analysis status
715
+ tree_html_content, # HTML content for tree display
716
+ aligned_file, # Path to aligned FASTA file
717
+ phy_file, # Path to phylogenetic tree file
718
+ html_file, # Path to HTML tree file
719
+ f"Pipeline completed. F gene length: {len(processed_sequence)} bp" # Summary
720
  )
721
 
722
  except Exception as e:
723
+ error_msg = f"Pipeline execution failed: {str(e)}"
724
  logging.error(error_msg)
725
  import traceback
726
  logging.error(f"Full traceback: {traceback.format_exc()}")
727
+ return (
728
+ error_msg, "", "", "", f"<p>Error: {error_msg}</p>",
729
+ None, None, None, error_msg
730
+ )
731
 
732
+ # --- Gradio Interface ---
733
+ def create_interface():
734
+ """Create the Gradio interface with enhanced layout and features"""
735
+
736
+ # Custom CSS for better styling
737
+ custom_css = """
738
+ .gradio-container {
739
+ max-width: 1200px !important;
740
+ }
741
+ .tab-nav button {
742
+ font-size: 16px !important;
743
+ }
744
+ .output-html {
745
+ height: 600px !important;
746
+ overflow: auto;
747
+ }
748
+ """
749
+
750
+ with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
751
+ gr.Markdown("""
752
+ # 🧬 F Gene Analysis Pipeline
753
+
754
+ This tool provides comprehensive analysis of F genes including:
755
+ - **Gene Boundary Detection**: Extract F gene sequences from larger genomic sequences
756
+ - **Gene Validation**: Validate extracted sequences using machine learning
757
+ - **Phylogenetic Analysis**: Build maximum likelihood trees and simplified phylogenetic trees
758
+
759
+ **Instructions:**
760
+ 1. Enter your sequence directly or upload a FASTA file
761
+ 2. Adjust similarity threshold for phylogenetic analysis (1-99%)
762
+ 3. Choose whether to build maximum likelihood trees (requires MAFFT & IQ-TREE)
763
+ 4. Click "Run Analysis" to start the pipeline
764
+ """)
765
+
766
+ with gr.Tab("🔬 Analysis Pipeline"):
767
+ with gr.Row():
768
+ with gr.Column(scale=2):
769
+ # Input section
770
+ gr.Markdown("### Input Sequence")
771
+ dna_input = gr.Textbox(
772
+ label="DNA Sequence",
773
+ placeholder="Enter your DNA sequence here (ATCG format)...",
774
+ lines=5,
775
+ max_lines=10
776
+ )
777
+
778
+ fasta_file = gr.File(
779
+ label="Or Upload FASTA File",
780
+ file_types=[".fasta", ".fa", ".fas", ".txt"]
781
+ )
782
+
783
+ with gr.Row():
784
+ similarity_score = gr.Slider(
785
+ minimum=1,
786
+ maximum=99,
787
+ value=95.0,
788
+ step=1.0,
789
+ label="Similarity Threshold (%)",
790
+ info="Minimum similarity for phylogenetic analysis"
791
+ )
792
+
793
+ build_ml_tree = gr.Checkbox(
794
+ label="Build ML Tree",
795
+ value=False,
796
+ info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)"
797
+ )
798
+
799
+ # Action buttons
800
+ with gr.Row():
801
+ run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
802
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
803
+
804
+ with gr.Column(scale=1):
805
+ # Status and info
806
+ gr.Markdown("### Analysis Status")
807
+ status_display = gr.Textbox(
808
+ label="Status",
809
+ value="Ready to analyze",
810
+ interactive=False,
811
+ lines=3
812
+ )
813
+
814
+ # Model status
815
+ gr.Markdown("### Available Models")
816
+ model_status = []
817
+ if boundary_model:
818
+ model_status.append("✅ Boundary Detection Model")
819
+ else:
820
+ model_status.append("❌ Boundary Detection Model")
821
+
822
+ if keras_model:
823
+ model_status.append("✅ Gene Validation Model")
824
+ else:
825
+ model_status.append("❌ Gene Validation Model")
826
+
827
+ if analyzer:
828
+ model_status.append("✅ Tree Analysis Module")
829
+ else:
830
+ model_status.append("❌ Tree Analysis Module")
831
+
832
+ gr.Markdown("\n".join(model_status))
833
+
834
+ with gr.Tab("📊 Results"):
835
+ with gr.Row():
836
+ with gr.Column():
837
+ # Text outputs
838
+ boundary_output = gr.Textbox(
839
+ label="🎯 F Gene Extraction",
840
+ lines=5,
841
+ interactive=False
842
+ )
843
+
844
+ keras_output = gr.Textbox(
845
+ label="🔍 Gene Validation",
846
+ lines=3,
847
+ interactive=False
848
+ )
849
+
850
+ with gr.Column():
851
+ ml_tree_output = gr.Textbox(
852
+ label="🌳 Maximum Likelihood Tree",
853
+ lines=5,
854
+ interactive=False
855
+ )
856
+
857
+ simplified_ml_output = gr.Textbox(
858
+ label="📈 Simplified Phylogenetic Analysis",
859
+ lines=3,
860
+ interactive=False
861
+ )
862
+
863
+ # Tree visualization
864
+ gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
865
+ tree_html = gr.HTML(
866
+ label="Interactive Tree",
867
+ value="<p>No tree generated yet. Run analysis to see results.</p>"
868
+ )
869
+
870
+ # File downloads
871
+ gr.Markdown("### 📁 Download Results")
872
+ with gr.Row():
873
+ aligned_file = gr.File(
874
+ label="Aligned Sequences (FASTA)",
875
+ interactive=False
876
  )
877
+
878
+ phy_file = gr.File(
879
+ label="Phylogenetic Tree File",
880
+ interactive=False
 
 
 
 
881
  )
882
+
883
+ html_file = gr.File(
884
+ label="Interactive Tree (HTML)",
885
+ interactive=False
886
  )
887
+
888
+ with gr.Tab("ℹ️ Help & Info"):
889
+ gr.Markdown("""
890
+ ## About This Tool
891
+
892
+ ### F Gene Analysis Pipeline
893
+ This comprehensive pipeline analyzes F genes through multiple computational approaches:
894
+
895
+ #### 🎯 Gene Boundary Detection
896
+ - Uses deep learning to identify and extract F gene sequences from larger genomic sequences
897
+ - Provides confidence scores for detected boundaries
898
+ - Automatically trims sequences to focus on the F gene region
899
+
900
+ #### 🔍 Gene Validation
901
+ - Employs k-mer based machine learning models to validate extracted sequences
902
+ - Provides probability scores indicating likelihood of being a genuine F gene
903
+ - Uses 6-mer frequency patterns for classification
904
+
905
+ #### 🌳 Phylogenetic Analysis
906
+
907
+ **Maximum Likelihood Trees:**
908
+ - Requires MAFFT (sequence alignment) and IQ-TREE (phylogenetic reconstruction)
909
+ - Performs model selection and bootstrap analysis
910
+ - Generates publication-quality phylogenetic trees
911
+ - Provides detailed evolutionary analysis
912
+
913
+ **Simplified Trees:**
914
+ - Uses built-in algorithms for quick phylogenetic analysis
915
+ - Interactive visualization with similarity-based clustering
916
+ - Faster alternative when external tools are not available
917
+
918
+ ### Input Requirements
919
+ - **DNA Sequences**: ATCG format, minimum 50 bp for meaningful analysis
920
+ - **FASTA Files**: Standard FASTA format with single or multiple sequences
921
+ - **Similarity Threshold**: 1-99% for controlling phylogenetic analysis sensitivity
922
+
923
+ ### Dependencies
924
+
925
+ **Required for ML Trees:**
926
+ ```bash
927
+ # Ubuntu/Debian
928
+ sudo apt-get install mafft iqtree
929
+
930
+ # macOS
931
+ brew install mafft iqtree
932
+
933
+ # Conda
934
+ conda install -c bioconda mafft iqtree
935
+ ```
936
+
937
+ ### Output Files
938
+ - **Aligned FASTA**: Multiple sequence alignment in FASTA format
939
+ - **Tree File**: Newick format phylogenetic tree
940
+ - **HTML Tree**: Interactive visualization for web browsers
941
+
942
+ ### Troubleshooting
943
+
944
+ **Common Issues:**
945
+ - *"No similar sequences found"*: Lower the similarity threshold
946
+ - *"Sequence too short"*: Provide sequences longer than 50 bp
947
+ - *"MAFFT/IQ-TREE not found"*: Install required dependencies
948
+ - *"Model not available"*: Check model files are properly downloaded
949
+
950
+ **Performance Tips:**
951
+ - Use sequences between 100-2000 bp for optimal performance
952
+ - Limit to <50 sequences for faster tree construction
953
+ - Lower similarity thresholds find more distant relatives
954
+ - Higher thresholds focus on closely related sequences
955
+
956
+ ### Citation
957
+ If you use this tool in your research, please cite the appropriate methods and tools used.
958
+ """)
959
+
960
+ # Event handlers
961
def run_analysis_text(dna_seq, sim_score, build_tree):
    """Run the pipeline on a pasted DNA sequence.

    Forwards all three arguments unchanged to ``run_pipeline`` and
    returns whatever it returns.
    """
    pipeline_result = run_pipeline(dna_seq, sim_score, build_tree)
    return pipeline_result
963
+
964
def run_analysis_file(file_obj, sim_score, build_tree):
    """Run the pipeline on an uploaded file.

    Forwards all three arguments unchanged to ``run_pipeline_from_file``
    and returns whatever it returns.
    """
    pipeline_result = run_pipeline_from_file(file_obj, sim_score, build_tree)
    return pipeline_result
966
+
967
def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
    """Dispatch an analysis request to the file or text pipeline.

    An uploaded file takes priority: when ``file_obj`` is not ``None`` the
    pasted ``dna_seq`` is ignored and ``run_pipeline_from_file`` handles the
    request. Otherwise ``run_pipeline`` is called with ``dna_seq``.
    """
    # Guard clause: no upload means we fall back to the text box contents.
    if file_obj is None:
        return run_pipeline(dna_seq, sim_score, build_tree)
    return run_pipeline_from_file(file_obj, sim_score, build_tree)
973
+
974
def clear_inputs():
    """Return the reset values applied when the Clear button is pressed.

    The tuple order matches the outputs wired to ``clear_btn.click``:
    DNA text, uploaded file, similarity score, ML-tree flag, status text.
    """
    reset_values = ("", None, 95.0, False, "Ready to analyze")
    return reset_values
976
+
977
+ # Connect events
978
+ run_btn.click(
979
+ fn=run_analysis_combined,
980
+ inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
981
+ outputs=[
982
+ boundary_output, keras_output, ml_tree_output,
983
+ simplified_ml_output, tree_html, aligned_file,
984
+ phy_file, html_file, status_display
985
+ ]
986
+ )
987
+
988
+ clear_btn.click(
989
+ fn=clear_inputs,
990
+ outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
991
+ )
992
+
993
+ # Example data loading
994
+ gr.Markdown("### 🧪 Example Data")
995
+ example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
996
+
997
def load_example():
    """Return the built-in example sequence and a matching status message.

    The two values map, in order, onto the DNA input box and the status
    display wired to ``example_btn.click``.
    """
    status_message = "Example F gene sequence loaded"
    return (
        "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA",
        status_message,
    )
1000
+
1001
+ example_btn.click(
1002
+ fn=load_example,
1003
+ outputs=[dna_input, status_display]
1004
+ )
1005
 
1006
+ return iface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1007
 
1008
# --- Main Execution ---
if __name__ == "__main__":
    # Build the UI first; serving options are collected separately below so
    # they can be read (and adjusted) in one place.
    interface = create_interface()

    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Default Gradio port
        "share": False,            # Set to True for public sharing
        "debug": True,             # Enable debug mode
        "show_error": True,        # Show detailed errors
        "max_threads": 4,          # Limit concurrent threads
        "auth": None,              # Add authentication if needed: ("username", "password")
        "ssl_verify": False,       # For development environments
        "quiet": False,            # Show startup messages
    }
    interface.launch(**launch_options)