re-type committed on
Commit
8c08884
·
verified ·
1 Parent(s): 9ab9398

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +826 -275
app.py CHANGED
@@ -25,7 +25,7 @@ import time
25
 
26
  # --- Global Variables ---
27
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
28
- MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
29
  IQTREE_PATH = os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
30
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
31
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
@@ -35,17 +35,20 @@ os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
35
  # --- Logging ---
36
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
37
 
38
- # --- Load Models (same as your original code) ---
 
39
  model_repo = "GGproject10/best_boundary_aware_model"
40
  csv_path = "f cleaned.csv"
 
 
41
  hf_token = os.getenv("HF_TOKEN")
42
 
 
43
  boundary_model = None
44
  keras_model = None
45
  kmer_to_index = None
46
- analyzer = None
47
 
48
- # [Include all your model loading code here - same as original]
49
  try:
50
  boundary_path = hf_hub_download(
51
  repo_id=model_repo,
@@ -55,9 +58,12 @@ try:
55
  if os.path.exists(boundary_path):
56
  boundary_model = GenePredictor(boundary_path)
57
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
 
 
58
  except Exception as e:
59
  logging.error(f"Failed to load boundary model from HF Hub: {e}")
60
 
 
61
  try:
62
  keras_path = hf_hub_download(
63
  repo_id=model_repo,
@@ -74,11 +80,56 @@ try:
74
  keras_model = load_model(keras_path)
75
  with open(kmer_path, "rb") as f:
76
  kmer_to_index = pickle.load(f)
77
- logging.info("Keras model and k-mer index loaded successfully.")
 
 
78
  except Exception as e:
79
  logging.error(f"Failed to load Keras model from HF Hub: {e}")
80
 
81
- # [Include all your helper functions - same as original]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def setup_binary_permissions():
83
  """Set executable permissions on MAFFT and IQ-TREE binaries"""
84
  binaries = [MAFFT_PATH, IQTREE_PATH]
@@ -86,92 +137,428 @@ def setup_binary_permissions():
86
  for binary in binaries:
87
  if os.path.exists(binary):
88
  try:
 
89
  current_mode = os.stat(binary).st_mode
90
  os.chmod(binary, current_mode | stat.S_IEXEC)
91
  logging.info(f"Set executable permission on {binary}")
92
  except Exception as e:
93
  logging.warning(f"Failed to set executable permission on {binary}: {e}")
 
 
94
 
95
  def check_tool_availability():
96
- """Enhanced check for MAFFT and IQ-TREE availability"""
 
 
97
  setup_binary_permissions()
98
 
99
  # Check MAFFT
100
  mafft_available = False
101
  mafft_cmd = None
102
 
 
103
  mafft_candidates = [
104
- MAFFT_PATH,
 
 
105
  'mafft',
106
  '/usr/bin/mafft',
107
  '/usr/local/bin/mafft',
 
 
 
 
 
 
108
  ]
109
 
110
  for candidate in mafft_candidates:
111
  if not candidate:
112
  continue
 
 
113
  if os.path.exists(candidate) or shutil.which(candidate):
 
114
  try:
115
- result = subprocess.run([candidate, "--help"], capture_output=True, text=True, timeout=10)
 
 
 
 
 
 
116
  if result.returncode == 0 or "mafft" in result.stderr.lower():
117
  mafft_available = True
118
  mafft_cmd = candidate
 
119
  break
120
- except:
 
121
  continue
122
 
123
- # Check IQ-TREE
124
  iqtree_available = False
125
  iqtree_cmd = None
126
 
 
127
  iqtree_candidates = [
128
- IQTREE_PATH,
129
  'iqtree2',
130
  'iqtree',
 
131
  '/usr/bin/iqtree2',
132
  '/usr/local/bin/iqtree2',
 
 
 
 
 
 
 
 
 
 
 
133
  ]
134
 
135
  for candidate in iqtree_candidates:
136
  if not candidate:
137
  continue
 
138
  if os.path.exists(candidate) or shutil.which(candidate):
139
  try:
140
- result = subprocess.run([candidate, "--help"], capture_output=True, text=True, timeout=10)
 
 
 
 
 
 
141
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
142
  iqtree_available = True
143
  iqtree_cmd = candidate
 
144
  break
145
- except:
 
146
  continue
147
 
148
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  def predict_with_keras(sequence):
151
  try:
152
  if not keras_model or not kmer_to_index:
153
- return f"Keras model not available."
154
 
155
  if len(sequence) < 6:
156
- return "Sequence too short for F gene validation."
157
 
 
158
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
159
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
160
 
 
161
  input_arr = np.array([indices])
162
  prediction = keras_model.predict(input_arr, verbose=0)[0]
163
- f_gene_prob = prediction[-1]
164
- percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
 
 
 
 
165
 
166
  return f"{percentage}% F gene"
167
  except Exception as e:
 
168
  return f"Keras prediction failed: {str(e)}"
169
 
 
170
  def read_fasta_file(file_obj):
171
  try:
172
  if file_obj is None:
173
  return ""
174
 
 
175
  if hasattr(file_obj, 'name'):
176
  with open(file_obj.name, "r") as f:
177
  content = f.read()
@@ -185,58 +572,7 @@ def read_fasta_file(file_obj):
185
  logging.error(f"Failed to read FASTA file: {e}")
186
  return ""
187
 
188
- # API-friendly wrapper functions
189
- def api_analyze_sequence(sequence: str, similarity_threshold: float = 95.0, enable_phylogeny: bool = False):
190
- """
191
- API endpoint for analyzing a DNA sequence
192
- Returns structured data suitable for API consumption
193
- """
194
- try:
195
- results = run_pipeline(sequence, similarity_threshold, enable_phylogeny)
196
-
197
- return {
198
- "status": "success",
199
- "input_length": len(sequence),
200
- "f_gene_sequence": results[0] if results[0] else "",
201
- "f_gene_validation": results[1] if results[1] else "",
202
- "phylogenetic_placement": results[2] if results[2] else "",
203
- "tree_analysis": results[3] if results[3] else "",
204
- "summary": results[4] if results[4] else "",
205
- "has_alignment_file": results[5] is not None,
206
- "has_tree_file": results[6] is not None,
207
- "has_html_tree": results[7] is not None
208
- }
209
- except Exception as e:
210
- return {
211
- "status": "error",
212
- "error_message": str(e),
213
- "input_length": len(sequence) if sequence else 0
214
- }
215
-
216
- def api_analyze_fasta(file_content: str, similarity_threshold: float = 95.0, enable_phylogeny: bool = False):
217
- """
218
- API endpoint for analyzing a FASTA file content
219
- """
220
- try:
221
- # Parse FASTA content
222
- lines = file_content.strip().split("\n")
223
- seq_lines = [line.strip() for line in lines if not line.startswith(">")]
224
- sequence = ''.join(seq_lines)
225
-
226
- if not sequence:
227
- return {
228
- "status": "error",
229
- "error_message": "No valid sequence found in FASTA content"
230
- }
231
-
232
- return api_analyze_sequence(sequence, similarity_threshold, enable_phylogeny)
233
- except Exception as e:
234
- return {
235
- "status": "error",
236
- "error_message": f"FASTA parsing error: {str(e)}"
237
- }
238
-
239
- # Main pipeline function (simplified version of your original)
240
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
241
  try:
242
  # Clean input
@@ -247,9 +583,10 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
247
  # Sanitize DNA sequence
248
  if not re.match('^[ACTGN]+$', dna_input):
249
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
 
250
 
251
- # Step 1: Boundary Prediction
252
- processed_sequence = dna_input
253
  boundary_output = ""
254
 
255
  if boundary_model:
@@ -257,257 +594,471 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
257
  predictions, probs, confidence = boundary_model.predict(dna_input)
258
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
259
  if regions:
260
- processed_sequence = regions[0]["sequence"]
261
- boundary_output = processed_sequence
 
262
  else:
263
- boundary_output = "No F gene regions found"
 
 
 
264
  except Exception as e:
 
265
  boundary_output = f"Boundary model error: {str(e)}"
 
266
  else:
267
- boundary_output = f"Boundary model not available. Using input: {len(dna_input)} bp"
 
268
 
269
- # Step 2: Keras Prediction
270
  keras_output = ""
271
  if processed_sequence and len(processed_sequence) >= 6:
272
- keras_output = predict_with_keras(processed_sequence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  else:
274
- keras_output = "Sequence too short for validation"
275
-
276
- # Step 3: ML Tree (simplified)
277
- ml_tree_output = "Phylogenetic analysis skipped"
278
- if build_ml_tree:
279
- mafft_available, iqtree_available, _, _ = check_tool_availability()
280
- if mafft_available and iqtree_available:
281
- ml_tree_output = "Phylogenetic tools available - analysis would run here"
282
  else:
283
- ml_tree_output = "Phylogenetic tools not available"
284
-
285
- # Step 4: Tree Analysis (simplified)
286
- tree_analysis_output = "Tree analysis not implemented in this version"
287
-
288
- # Summary
289
- summary_output = f"""
290
- ANALYSIS SUMMARY:
291
- Input: {len(dna_input)} bp
292
- F Gene: {len(processed_sequence)} bp
293
- Validation: {keras_output}
294
- Phylogeny: {ml_tree_output}
295
  """
296
 
297
  return (
298
  boundary_output,
299
- keras_output,
300
  ml_tree_output,
301
- tree_analysis_output,
302
- summary_output,
303
- None, # alignment_file
304
- None, # tree_file
305
- None, # html_file
306
- "No tree visualization available"
307
  )
308
 
309
  except Exception as e:
310
- error_msg = f"Pipeline error: {str(e)}"
311
- return error_msg, "", "", "", "", None, None, None, error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
- def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
 
 
314
  try:
315
- dna_input = read_fasta_file(fasta_file_obj)
316
- if not dna_input:
317
- return "Failed to read FASTA file", "", "", "", "", None, None, None, "No sequence"
318
- return run_pipeline(dna_input, similarity_score, build_ml_tree)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  except Exception as e:
320
- error_msg = f"File pipeline error: {str(e)}"
321
- return error_msg, "", "", "", "", None, None, None, error_msg
322
 
 
323
  def create_interface():
324
- """Create Gradio interface with proper API configuration"""
325
 
326
- with gr.Blocks(title="🧬 Gene Analysis Pipeline API") as iface:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
- gr.HTML("""
329
- <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
330
- <h1 style="margin: 0; font-size: 2.5em;">🧬 Gene Analysis Pipeline</h1>
331
- <p style="margin: 10px 0 0 0; font-size: 1.2em; opacity: 0.9;">Advanced DNA Sequence Analysis with API Access</p>
332
- </div>
333
  """)
334
 
335
- # API Information
336
- with gr.Accordion("🔗 API Information", open=True):
337
- gr.HTML("""
338
- <div style="background: #f8fafc; padding: 15px; border-radius: 8px; border-left: 4px solid #3b82f6;">
339
- <h3 style="color: #1e40af; margin-top: 0;">API Endpoints Available:</h3>
340
- <ul style="line-height: 1.8;">
341
- <li><strong>POST /api/analyze_text</strong> - Analyze DNA sequence from text input</li>
342
- <li><strong>POST /api/analyze_file</strong> - Analyze DNA sequence from FASTA file</li>
343
- <li><strong>POST /api/api_analyze_sequence</strong> - Structured API response for sequence analysis</li>
344
- <li><strong>POST /api/api_analyze_fasta</strong> - Structured API response for FASTA content</li>
345
- </ul>
346
- <p style="margin: 15px 0 0 0; padding: 10px; background: #dbeafe; border-radius: 5px;">
347
- <strong>📝 Note:</strong> Access API documentation at <code>/docs</code> when the server is running
348
- </p>
349
- </div>
350
- """)
351
-
352
- # Input Section
353
  with gr.Row():
354
  with gr.Column(scale=2):
355
- with gr.Tabs():
356
- with gr.TabItem("✍️ Text Input"):
357
- dna_input = gr.Textbox(
358
- label="DNA Sequence",
359
- placeholder="Enter DNA sequence (A, T, C, G, N)...",
360
- lines=6,
361
- info="Input your DNA sequence for analysis"
362
- )
363
-
364
- with gr.TabItem("📁 File Upload"):
365
- fasta_file = gr.File(
366
- label="Upload FASTA File",
367
- file_types=[".fasta", ".fa", ".fas", ".txt"]
368
- )
369
-
370
- with gr.Column(scale=1):
371
- similarity_score = gr.Slider(
372
- minimum=70.0,
373
- maximum=99.0,
374
- value=95.0,
375
- step=1.0,
376
- label="Similarity Threshold (%)"
377
  )
378
 
379
- build_ml_tree = gr.Checkbox(
380
- label="🌳 Enable Phylogenetic Analysis",
381
- value=False
 
382
  )
383
 
384
- with gr.Row():
385
- analyze_text_btn = gr.Button("🚀 Analyze Text", variant="primary")
386
- analyze_file_btn = gr.Button("📁 Analyze File", variant="secondary")
387
-
388
- # Results Section
389
- with gr.Tabs():
390
- with gr.TabItem("🎯 F Gene"):
391
- f_gene_output = gr.Textbox(label="F Gene Sequence", lines=5)
392
-
393
- with gr.TabItem(" Validation"):
394
- keras_output = gr.Textbox(label="Gene Validation", lines=3)
395
-
396
- with gr.TabItem("🌳 Phylogeny"):
397
- ml_tree_output = gr.Textbox(label="Phylogenetic Analysis", lines=5)
398
-
399
- with gr.TabItem("📊 Summary"):
400
- summary_output = gr.Textbox(label="Analysis Summary", lines=8)
401
-
402
- # API Test Section
403
- with gr.Accordion("🧪 API Testing", open=False):
404
- gr.HTML("""
405
- <div style="background: #fef7e7; padding: 15px; border-radius: 8px; border-left: 4px solid #f59e0b;">
406
- <h4 style="color: #92400e; margin-top: 0;">Test API Endpoints:</h4>
407
- <p>Use these functions to test structured API responses:</p>
408
- </div>
409
- """)
410
-
411
- with gr.Row():
412
- api_sequence_input = gr.Textbox(
413
- label="Test Sequence for API",
414
- placeholder="ATCGATCG...",
415
- lines=2
416
  )
417
- api_test_btn = gr.Button("Test API Response", variant="primary")
418
-
419
- api_response = gr.JSON(label="API Response Structure")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
- # Event Handlers
422
- analyze_text_btn.click(
423
- fn=run_pipeline,
424
- inputs=[dna_input, similarity_score, build_ml_tree],
425
- outputs=[f_gene_output, keras_output, ml_tree_output, gr.Textbox(), summary_output,
426
- gr.File(), gr.File(), gr.File(), gr.HTML()],
427
- api_name="analyze_text"
428
- )
429
 
430
- analyze_file_btn.click(
431
- fn=run_pipeline_from_file,
432
- inputs=[fasta_file, similarity_score, build_ml_tree],
433
- outputs=[f_gene_output, keras_output, ml_tree_output, gr.Textbox(), summary_output,
434
- gr.File(), gr.File(), gr.File(), gr.HTML()],
435
- api_name="analyze_file"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  )
437
 
438
- # API Test Handler
439
- api_test_btn.click(
440
- fn=api_analyze_sequence,
441
- inputs=[api_sequence_input, similarity_score, build_ml_tree],
442
- outputs=[api_response],
443
- api_name="api_analyze_sequence"
 
 
 
 
 
 
 
 
 
444
  )
445
 
446
- # Additional API endpoint for FASTA content
447
- gr.Interface(
448
- fn=api_analyze_fasta,
449
- inputs=[
450
- gr.Textbox(label="FASTA Content", lines=5),
451
- gr.Slider(70, 99, 95, label="Similarity %"),
452
- gr.Checkbox(label="Enable Phylogeny")
453
- ],
454
- outputs=gr.JSON(label="API Response"),
455
- title="FASTA API Endpoint",
456
- api_name="api_analyze_fasta",
457
- visible=False # Hidden interface just for API
 
 
 
 
 
 
458
  )
459
 
460
  # Footer
461
- gr.HTML("""
462
- <div style="text-align: center; padding: 20px; margin-top: 20px; border-top: 2px solid #e5e7eb;">
463
- <p style="color: #6b7280; margin: 0;">🧬 Gene Analysis Pipeline with API Access</p>
464
- <p style="color: #9ca3af; font-size: 0.9em; margin: 5px 0 0 0;">
465
- Access API at <code>/api/endpoint_name</code> Documentation at <code>/docs</code>
466
- </p>
467
- </div>
468
  """)
469
 
470
- return iface
471
 
472
- # Replace the launch section at the end of your app.py file with this:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
 
474
- # Main execution
475
  if __name__ == "__main__":
 
 
 
 
 
 
 
476
  try:
477
- print("🧬 Starting Gene Analysis Pipeline with API Access")
478
- print("=" * 60)
479
- print(f"Boundary Model: {'✅' if boundary_model else '❌'}")
480
- print(f"Keras Model: {'✅' if keras_model else '❌'}")
481
-
482
- # Check tools
483
- mafft_available, iqtree_available, _, _ = check_tool_availability()
484
- print(f"MAFFT: {'✅' if mafft_available else '❌'}")
485
- print(f"IQ-TREE: {'✅' if iqtree_available else '❌'}")
486
-
487
- print("\n🚀 Launching with API enabled...")
488
- print("Access URLs:")
489
- print(" - Local: http://localhost:7861")
490
- print(" - Network: http://0.0.0.0:7861")
491
- print(" - API Docs: http://localhost:7861/docs")
492
-
493
- # Create and launch interface
494
- iface = create_interface()
495
-
496
- # Launch with broader accessibility
497
- iface.launch(
498
- share=False, # Set to True if you want public sharing
499
- server_name="0.0.0.0", # Allow external connections
500
- server_port=8080, # Your current port
501
- show_error=True,
502
- show_api=True,
503
- quiet=False,
504
- inbrowser=True, # Try to open browser automatically
505
- prevent_thread_lock=False
506
  )
507
 
508
  except Exception as e:
509
- logging.error(f"Failed to start application: {e}")
510
- import traceback
511
- print(f"Error: {e}")
512
- print(f"Traceback: {traceback.format_exc()}")
513
- sys.exit(1)
 
 
 
 
 
 
25
 
26
  # --- Global Variables ---
27
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
28
+ MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft") # Updated path
29
  IQTREE_PATH = os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
30
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
31
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
 
35
  # --- Logging ---
36
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
37
 
38
+ # --- Paths ---
39
+ # Model repository and file paths
40
  model_repo = "GGproject10/best_boundary_aware_model"
41
  csv_path = "f cleaned.csv"
42
+
43
+ # Get HF token from environment (if available)
44
  hf_token = os.getenv("HF_TOKEN")
45
 
46
+ # --- Load Models ---
47
  boundary_model = None
48
  keras_model = None
49
  kmer_to_index = None
 
50
 
51
+ # Try to load boundary model from Hugging Face Hub
52
  try:
53
  boundary_path = hf_hub_download(
54
  repo_id=model_repo,
 
58
  if os.path.exists(boundary_path):
59
  boundary_model = GenePredictor(boundary_path)
60
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
61
+ else:
62
+ logging.warning(f"Boundary model file not found after download")
63
  except Exception as e:
64
  logging.error(f"Failed to load boundary model from HF Hub: {e}")
65
 
66
+ # Try to load Keras model from Hugging Face Hub
67
  try:
68
  keras_path = hf_hub_download(
69
  repo_id=model_repo,
 
80
  keras_model = load_model(keras_path)
81
  with open(kmer_path, "rb") as f:
82
  kmer_to_index = pickle.load(f)
83
+ logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
84
+ else:
85
+ logging.warning(f"Keras model or kmer files not found after download")
86
  except Exception as e:
87
  logging.error(f"Failed to load Keras model from HF Hub: {e}")
88
 
89
+ # --- Initialize New Tree Analyzer ---
90
+ analyzer = None
91
+ try:
92
+ analyzer = PhylogeneticTreeAnalyzer()
93
+
94
+ # Try multiple potential locations for the CSV file
95
+ csv_candidates = [
96
+ csv_path,
97
+ os.path.join(BASE_DIR, csv_path),
98
+ os.path.join(BASE_DIR, "app", csv_path),
99
+ os.path.join(os.path.dirname(__file__), csv_path),
100
+ "f_cleaned.csv", # Alternative naming
101
+ os.path.join(BASE_DIR, "f_cleaned.csv")
102
+ ]
103
+
104
+ csv_loaded = False
105
+ for csv_candidate in csv_candidates:
106
+ if os.path.exists(csv_candidate):
107
+ if analyzer.load_data(csv_candidate):
108
+ logging.info(f"Tree analyzer data loaded from: {csv_candidate}")
109
+ csv_loaded = True
110
+ csv_path = csv_candidate # Update path for consistency
111
+ break
112
+ else:
113
+ logging.warning(f"Failed to load data from: {csv_candidate}")
114
+
115
+ if not csv_loaded:
116
+ logging.error("Failed to load CSV data from any candidate location")
117
+ analyzer = None
118
+ else:
119
+ # Try to train AI model (optional)
120
+ try:
121
+ if analyzer.train_ai_model():
122
+ logging.info("AI model training completed successfully")
123
+ else:
124
+ logging.warning("AI model training failed; proceeding with basic analysis.")
125
+ except Exception as e:
126
+ logging.warning(f"AI model training failed: {e}")
127
+
128
+ except Exception as e:
129
+ logging.error(f"Failed to initialize tree analyzer: {e}")
130
+ analyzer = None
131
+
132
+ # --- Enhanced Tool Detection with Binary Permission Setup ---
133
  def setup_binary_permissions():
134
  """Set executable permissions on MAFFT and IQ-TREE binaries"""
135
  binaries = [MAFFT_PATH, IQTREE_PATH]
 
137
  for binary in binaries:
138
  if os.path.exists(binary):
139
  try:
140
+ # Set executable permission
141
  current_mode = os.stat(binary).st_mode
142
  os.chmod(binary, current_mode | stat.S_IEXEC)
143
  logging.info(f"Set executable permission on {binary}")
144
  except Exception as e:
145
  logging.warning(f"Failed to set executable permission on {binary}: {e}")
146
+ else:
147
+ logging.warning(f"Binary not found: {binary}")
148
 
149
  def check_tool_availability():
150
+ """Enhanced check for MAFFT and IQ-TREE availability with improved path validation"""
151
+
152
+ # First, ensure binaries have executable permissions
153
  setup_binary_permissions()
154
 
155
  # Check MAFFT
156
  mafft_available = False
157
  mafft_cmd = None
158
 
159
+ # Updated MAFFT candidates list based on your new API
160
  mafft_candidates = [
161
+ MAFFT_PATH, # Primary path from your new API
162
+ os.path.join(BASE_DIR, "binaries", "mafft", "mafft"),
163
+ os.path.join(BASE_DIR, "binaries", "mafft", "mafft.bat"), # Windows fallback
164
  'mafft',
165
  '/usr/bin/mafft',
166
  '/usr/local/bin/mafft',
167
+ os.path.join(BASE_DIR, "binaries", "mafft", "mafftdir", "bin", "mafft"),
168
+ # Add potential conda/miniconda paths
169
+ os.path.expanduser("~/anaconda3/bin/mafft"),
170
+ os.path.expanduser("~/miniconda3/bin/mafft"),
171
+ "/opt/conda/bin/mafft",
172
+ "/usr/local/miniconda3/bin/mafft"
173
  ]
174
 
175
  for candidate in mafft_candidates:
176
  if not candidate:
177
  continue
178
+
179
+ # First check if file exists or is in PATH
180
  if os.path.exists(candidate) or shutil.which(candidate):
181
+ # Now test actual execution
182
  try:
183
+ test_cmd = [candidate, "--help"]
184
+ result = subprocess.run(
185
+ test_cmd,
186
+ capture_output=True,
187
+ text=True,
188
+ timeout=10
189
+ )
190
  if result.returncode == 0 or "mafft" in result.stderr.lower():
191
  mafft_available = True
192
  mafft_cmd = candidate
193
+ logging.info(f"MAFFT found and tested successfully at: {candidate}")
194
  break
195
+ except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError) as e:
196
+ logging.debug(f"MAFFT test failed for {candidate}: {e}")
197
  continue
198
 
199
+ # Check IQ-TREE with similar approach
200
  iqtree_available = False
201
  iqtree_cmd = None
202
 
203
+ # Updated IQ-TREE candidates list
204
  iqtree_candidates = [
205
+ IQTREE_PATH, # Primary path from your new API
206
  'iqtree2',
207
  'iqtree',
208
+ 'iqtree3',
209
  '/usr/bin/iqtree2',
210
  '/usr/local/bin/iqtree2',
211
+ '/usr/bin/iqtree',
212
+ '/usr/local/bin/iqtree',
213
+ 'iqtree2.exe', # Windows
214
+ 'iqtree.exe', # Windows
215
+ 'iqtree3.exe', # Windows
216
+ os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree2"),
217
+ # Add potential conda paths
218
+ os.path.expanduser("~/anaconda3/bin/iqtree2"),
219
+ os.path.expanduser("~/miniconda3/bin/iqtree2"),
220
+ "/opt/conda/bin/iqtree2",
221
+ "/usr/local/miniconda3/bin/iqtree2"
222
  ]
223
 
224
  for candidate in iqtree_candidates:
225
  if not candidate:
226
  continue
227
+
228
  if os.path.exists(candidate) or shutil.which(candidate):
229
  try:
230
+ test_cmd = [candidate, "--help"]
231
+ result = subprocess.run(
232
+ test_cmd,
233
+ capture_output=True,
234
+ text=True,
235
+ timeout=10
236
+ )
237
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
238
  iqtree_available = True
239
  iqtree_cmd = candidate
240
+ logging.info(f"IQ-TREE found and tested successfully at: {candidate}")
241
  break
242
+ except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError) as e:
243
+ logging.debug(f"IQ-TREE test failed for {candidate}: {e}")
244
  continue
245
 
246
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
247
 
248
+ def install_dependencies_guide():
249
+ """Provide installation guidance for missing dependencies"""
250
+ guide = """
251
+ 🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
252
+
253
+ For MAFFT:
254
+ - Ubuntu/Debian: sudo apt-get install mafft
255
+ - CentOS/RHEL: sudo yum install mafft
256
+ - macOS: brew install mafft
257
+ - Windows: Download from https://mafft.cbrc.jp/alignment/software/
258
+ - Conda: conda install -c bioconda mafft
259
+
260
+ For IQ-TREE:
261
+ - Ubuntu/Debian: sudo apt-get install iqtree
262
+ - CentOS/RHEL: sudo yum install iqtree
263
+ - macOS: brew install iqtree
264
+ - Windows: Download from http://www.iqtree.org/
265
+ - Conda: conda install -c bioconda iqtree
266
+
267
+ Alternative: Use conda/mamba (RECOMMENDED):
268
+ - conda install -c bioconda mafft iqtree
269
+
270
+ Docker option:
271
+ - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
272
+ - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
273
+
274
+ TROUBLESHOOTING:
275
+ If tools are installed but not detected, try:
276
+ 1. Add installation directory to PATH
277
+ 2. Use absolute paths in the configuration
278
+ 3. Check permissions on executable files
279
+ 4. Ensure binaries have executable permissions (chmod +x)
280
+ """
281
+ return guide
282
+
283
def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str, min_length: int = 100):
    """
    Add a query sequence to the reference alignment and place it in the reference tree.

    MAFFT (``--add``) aligns the query against ALIGNMENT_PATH, then IQ-TREE is
    run on the combined alignment with TREE_PATH passed through ``-g``. Output
    files are written under QUERY_OUTPUT_DIR with a random ``QUERY_<hex>``
    prefix. The temporary query FASTA is removed before returning.

    Args:
        sequence: Query nucleotide sequence. All whitespace is removed and the
            remainder is upper-cased before validation and writing.
        mafft_cmd: MAFFT executable (name or path).
        iqtree_cmd: IQ-TREE executable (name or path).
        min_length: Minimum accepted sequence length in bp (default 100).

    Returns:
        Tuple ``(success, message, aligned_file, tree_file)``. The file entries
        are None for any step that did not complete; ``aligned_file`` is still
        returned when only the IQ-TREE step failed.
    """
    # Initialised up front so the cleanup in ``finally`` never needs locals().
    query_fasta = None
    try:
        # Drop every whitespace character (FASTA line breaks included) so the
        # length check and the written record see exactly the same residues.
        cleaned = "".join((sequence or "").split()).upper()
        if len(cleaned) < min_length:
            return False, f"Error: Sequence is too short for phylogenetic placement (minimum {min_length} bp).", None, None

        # Unique ID so concurrent queries do not overwrite each other's files.
        query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
        query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
        aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
        output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")

        if not os.path.exists(ALIGNMENT_PATH):
            return False, f"Reference alignment not found: {ALIGNMENT_PATH}", None, None

        if not os.path.exists(TREE_PATH):
            return False, f"Reference tree not found: {TREE_PATH}", None, None

        # Save the query sequence as FASTA.
        try:
            query_record = SeqRecord(Seq(cleaned), id=query_id, description="")
            SeqIO.write([query_record], query_fasta, "fasta")
            logging.info(f"Query sequence saved: {query_fasta}")
        except Exception as e:
            return False, f"Error writing query sequence: {e}", None, None

        # Step 1: add the query to the reference alignment with MAFFT.
        logging.info("Adding query sequence to reference alignment...")
        try:
            # MAFFT writes the alignment to stdout, so stream it into the file.
            with open(aligned_with_query, "w") as output_file:
                subprocess.run(
                    [mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH],
                    stdout=output_file,
                    stderr=subprocess.PIPE,
                    text=True,
                    timeout=600,
                    check=True,
                )

            if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
                return False, "MAFFT alignment failed: output file is empty", None, None

            logging.info(f"MAFFT alignment completed: {aligned_with_query}")
        except subprocess.CalledProcessError as e:
            error_msg = e.stderr if e.stderr else "Unknown MAFFT error"
            return False, f"MAFFT alignment failed: {error_msg}", None, None
        except subprocess.TimeoutExpired:
            return False, "MAFFT alignment timeout (>10 minutes)", None, None
        except FileNotFoundError:
            return False, f"MAFFT executable not found: {mafft_cmd}", None, None
        except Exception as e:
            return False, f"MAFFT execution error: {e}", None, None

        # Step 2: place the sequence in the tree with IQ-TREE.
        logging.info("Placing sequence in phylogenetic tree...")
        try:
            subprocess.run(
                [
                    iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
                    "-m", "GTR+G", "-pre", output_prefix, "-redo",
                ],
                capture_output=True,
                text=True,
                timeout=1200,
                check=True,
            )

            treefile = f"{output_prefix}.treefile"
            if not os.path.exists(treefile) or os.path.getsize(treefile) == 0:
                return False, "IQ-TREE placement failed: treefile not generated", aligned_with_query, None

            logging.info(f"IQ-TREE placement completed: {treefile}")

            success_msg = "✅ Phylogenetic placement completed successfully!\n"
            success_msg += f"- Query ID: {query_id}\n"
            success_msg += f"- Alignment: {os.path.basename(aligned_with_query)}\n"
            success_msg += f"- Tree: {os.path.basename(treefile)}\n"

            # Best effort: append the first "Log-likelihood" line of the run log.
            log_file = f"{output_prefix}.log"
            if os.path.exists(log_file):
                try:
                    with open(log_file, "r") as f:
                        likelihood_line = next(
                            (line for line in f if "Log-likelihood" in line), None
                        )
                    if likelihood_line is not None:
                        success_msg += f"- {likelihood_line.strip()}\n"
                except Exception as e:
                    logging.warning(f"Could not read log file: {e}")

            return True, success_msg, aligned_with_query, treefile
        except subprocess.CalledProcessError as e:
            error_msg = e.stderr if e.stderr else "Unknown IQ-TREE error"
            return False, f"IQ-TREE placement failed: {error_msg}", aligned_with_query, None
        except subprocess.TimeoutExpired:
            return False, "IQ-TREE placement timeout (>20 minutes)", aligned_with_query, None
        except FileNotFoundError:
            return False, f"IQ-TREE executable not found: {iqtree_cmd}", aligned_with_query, None
        except Exception as e:
            return False, f"IQ-TREE execution error: {e}", aligned_with_query, None

    except Exception as e:
        logging.error(f"Phylogenetic placement failed: {e}")
        return False, f"Phylogenetic placement failed: {str(e)}", None, None
    finally:
        # Remove the temporary query file; a failed cleanup must not mask the result.
        if query_fasta and os.path.exists(query_fasta):
            try:
                os.unlink(query_fasta)
            except OSError as e:
                logging.debug(f"Could not remove temporary query file {query_fasta}: {e}")
394
+
395
def _copy_to_standard_name(source_path, standard_name):
    """Copy *source_path* to *standard_name* and return the path callers should use.

    Returns *source_path* unchanged when it is empty, does not exist, or the
    copy fails, so a failed convenience copy never hides a finished result.
    """
    if not source_path or not os.path.exists(source_path):
        return source_path
    try:
        shutil.copy2(source_path, standard_name)
    except OSError as e:
        # shutil.SameFileError is an OSError too (source already is the standard file).
        logging.warning(f"Could not copy {source_path} to {standard_name}: {e}")
        return source_path
    return standard_name


def build_maximum_likelihood_tree(f_gene_sequence):
    """
    Build maximum likelihood phylogenetic tree using the improved phylogenetic placement approach.

    Verifies that MAFFT, IQ-TREE and both reference files (ALIGNMENT_PATH,
    TREE_PATH) are available, delegates the work to ``phylogenetic_placement``
    and, on success, copies the resulting alignment and tree to fixed file
    names in the current working directory for compatibility.

    Args:
        f_gene_sequence: Query sequence passed on to ``phylogenetic_placement``.

    Returns:
        tuple: ``(success, message, aligned_file, tree_file)``. ``message``
        always starts with the dependency report once the checks have run.
        The two paths are ``None`` when a prerequisite is missing or an
        unexpected exception occurs.
    """
    try:
        # Check tool availability with enhanced detection
        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()

        # Look at the reference files once and reuse the answers below.
        alignment_found = os.path.exists(ALIGNMENT_PATH)
        tree_found = os.path.exists(TREE_PATH)

        # Prepare status message
        status_msg = "🔍 Checking dependencies...\n"

        if not mafft_available:
            status_msg += "❌ MAFFT not found or not executable\n"
        else:
            status_msg += f"✅ MAFFT found and tested: {mafft_cmd}\n"

        if not iqtree_available:
            status_msg += "❌ IQ-TREE not found or not executable\n"
        else:
            status_msg += f"✅ IQ-TREE found and tested: {iqtree_cmd}\n"

        if not alignment_found:
            status_msg += f"❌ Reference alignment not found: {ALIGNMENT_PATH}\n"
        else:
            status_msg += "✅ Reference alignment found\n"

        if not tree_found:
            status_msg += f"❌ Reference tree not found: {TREE_PATH}\n"
        else:
            status_msg += "✅ Reference tree found\n"

        # If any required tool is missing, provide installation guide
        if not mafft_available or not iqtree_available:
            guide = install_dependencies_guide()
            return False, f"{status_msg}\n{guide}", None, None

        if not alignment_found or not tree_found:
            status_msg += "\n❌ Reference alignment and/or tree files are missing.\n"
            status_msg += "Please ensure f_gene_sequences_aligned.fasta and f_gene_sequences.phy.treefile are available."
            return False, status_msg, None, None

        # Perform phylogenetic placement using improved method
        logging.info("Starting phylogenetic placement...")
        placement_success, placement_message, aligned_file, tree_file = phylogenetic_placement(
            f_gene_sequence, mafft_cmd, iqtree_cmd
        )

        if not placement_success:
            return False, f"{status_msg}\n{placement_message}", aligned_file, tree_file

        final_message = f"{status_msg}\n{placement_message}"

        # Copy files to standard locations for compatibility. A failed copy
        # keeps the original path instead of failing the whole placement.
        aligned_file = _copy_to_standard_name(aligned_file, "query_with_references_aligned.fasta")
        tree_file = _copy_to_standard_name(tree_file, "query_placement_tree.treefile")

        logging.info("Phylogenetic placement completed successfully")
        return True, final_message, aligned_file, tree_file

    except Exception as e:
        logging.error(f"ML tree construction failed: {e}")
        return False, f"ML tree construction failed: {str(e)}", None, None
465
+
466
+ # --- NEW Tree Analysis Function (Using the new analyzer API) ---
467
def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tuple:
    """
    Analyze sequence and create phylogenetic tree using the new analyzer API

    Args:
        sequence (str): DNA sequence to analyze
        matching_percentage (float): Similarity threshold percentage (1-99)

    Returns:
        tuple: (status_message, html_file_path). ``html_file_path`` is ``None``
        whenever validation or the analysis fails; the message then starts
        with "❌ Error".
    """
    try:
        if not analyzer:
            return "❌ Error: Tree analyzer not initialized. Please check if the CSV data file is available.", None

        if not sequence:
            return "❌ Error: Please provide a sequence.", None

        if not (1 <= matching_percentage <= 99):
            return "❌ Error: Matching percentage must be between 1 and 99.", None

        # Validate inputs
        sequence = sequence.strip()
        if len(sequence) < 10:
            return "❌ Error: Invalid or missing sequence. Must be ≥10 nucleotides.", None

        # Find query sequence
        if not analyzer.find_query_sequence(sequence):
            return "❌ Error: Sequence not accepted.", None

        # Find similar sequences
        matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)

        if not matched_ids:
            return f"❌ Error: No similar sequences found at {matching_percentage}% similarity threshold.", None

        logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.2f}% similarity")

        # Build tree structure
        analyzer.build_tree_structure_with_ml_safe(matched_ids)

        # Create interactive tree
        fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)

        # Reserve a unique file in the temp directory: a single fixed name
        # would let concurrent requests overwrite each other's tree.
        with tempfile.NamedTemporaryFile(
            prefix='phylogenetic_tree_interactive_', suffix='.html', delete=False
        ) as handle:
            output_path = handle.name
        fig.write_html(output_path)

        success_msg = f"✅ Analysis complete! Found {len(matched_ids)} similar sequences with {actual_percentage:.2f}% average similarity."

        return success_msg, output_path

    except Exception as e:
        error_msg = f"❌ Error during analysis: {str(e)}"
        # logging.exception records the message at ERROR level with the traceback.
        logging.exception(error_msg)
        return error_msg, None
526
+
527
+ # --- Keras Prediction ---
528
def predict_with_keras(sequence):
    """Score *sequence* with the Keras model and report it as "<n>% F gene".

    The sequence is split into overlapping 6-mers, each mapped through
    ``kmer_to_index`` (unknown k-mers map to 0), and fed to ``keras_model``
    as a single-row batch. The last value of the model output is taken as
    the F gene probability.

    Returns:
        str: the percentage string, or an explanatory message when the model
        is unavailable, the sequence is shorter than 6 characters, or the
        prediction raises.
    """
    try:
        if not (keras_model and kmer_to_index):
            return f"Keras model not available. Input sequence: {sequence[:100]}..."

        if len(sequence) < 6:
            return "Skipped: sequence too short for F gene validation (minimum 6 nucleotides required)."

        # Encode every overlapping 6-mer as its vocabulary index.
        encoded = []
        for start in range(len(sequence) - 5):
            window = sequence[start:start + 6]
            encoded.append(kmer_to_index.get(window, 0))

        # One-row batch in, first (only) row of scores out.
        batch = np.array([encoded])
        scores = keras_model.predict(batch, verbose=0)[0]

        # Assumes the last output is the F gene class (adjust if the model differs).
        f_gene_prob = scores[-1]

        # Percentage with the deliberate +5 buffer, clamped to 0-100.
        buffered = int(f_gene_prob * 100 + 5)
        percentage = min(100, max(0, buffered))

        return f"{percentage}% F gene"
    except Exception as e:
        logging.error(f"Keras prediction failed: {e}")
        return f"Keras prediction failed: {str(e)}"
554
 
555
+ # --- FASTA Reader ---
556
  def read_fasta_file(file_obj):
557
  try:
558
  if file_obj is None:
559
  return ""
560
 
561
+ # Handle file object
562
  if hasattr(file_obj, 'name'):
563
  with open(file_obj.name, "r") as f:
564
  content = f.read()
 
572
  logging.error(f"Failed to read FASTA file: {e}")
573
  return ""
574
 
575
+ # --- Core Pipeline Function ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
577
  try:
578
  # Clean input
 
583
  # Sanitize DNA sequence
584
  if not re.match('^[ACTGN]+$', dna_input):
585
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
586
+ logging.info("DNA sequence sanitized")
587
 
588
+ # Step 1: Boundary Prediction - Extract F gene sequence
589
+ processed_sequence = dna_input # This will be the sequence used for downstream analysis
590
  boundary_output = ""
591
 
592
  if boundary_model:
 
594
  predictions, probs, confidence = boundary_model.predict(dna_input)
595
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
596
  if regions:
597
+ processed_sequence = regions[0]["sequence"] # Use the extracted gene region
598
+ boundary_output = processed_sequence # Output the actual F gene sequence
599
+ logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
600
  else:
601
+ boundary_output = f"No F gene regions found in input sequence"
602
+ processed_sequence = dna_input
603
+ logging.warning("No gene regions found, using full sequence")
604
+ logging.info("Boundary model prediction completed")
605
  except Exception as e:
606
+ logging.error(f"Boundary model failed: {e}")
607
  boundary_output = f"Boundary model error: {str(e)}"
608
+ processed_sequence = dna_input # Fall back to original sequence
609
  else:
610
+ boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
611
+ processed_sequence = dna_input
612
 
613
+ # Step 2: Keras Prediction (F gene validation)
614
  keras_output = ""
615
  if processed_sequence and len(processed_sequence) >= 6:
616
+ keras_prediction = predict_with_keras(processed_sequence)
617
+ # Use the prediction directly as it's now a percentage
618
+ keras_output = keras_prediction
619
+ else:
620
+ keras_output = "Skipped: sequence too short for F gene validation"
621
+
622
+ # Step 3: Maximum Likelihood Tree (Phylogenetic Placement) - Using improved API
623
+ aligned_file = None
624
+ phy_file = None
625
+ ml_tree_output = ""
626
+
627
+ if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
628
+ try:
629
+ logging.info("Starting phylogenetic placement...")
630
+ ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
631
+
632
+ if ml_success:
633
+ ml_tree_output = ml_message
634
+ aligned_file = ml_aligned
635
+ phy_file = ml_tree
636
+ else:
637
+ ml_tree_output = ml_message # This now includes detailed error information
638
+
639
+ except Exception as e:
640
+ ml_tree_output = f"❌ Phylogenetic placement failed: {str(e)}"
641
+ logging.error(f"Phylogenetic placement failed: {e}")
642
+ elif build_ml_tree:
643
+ ml_tree_output = "❌ F gene sequence too short for phylogenetic placement (minimum 100 bp)"
644
+ else:
645
+ ml_tree_output = "Phylogenetic placement skipped (not requested)"
646
+
647
+ # Step 4: NEW Simplified Tree Analysis (using the new analyzer API)
648
+ html_file = None
649
+ tree_html_content = "No tree generated"
650
+ simplified_ml_output = ""
651
+
652
+ if analyzer and processed_sequence and len(processed_sequence) >= 10:
653
+ try:
654
+ logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
655
+
656
+ # Use the new analyze_sequence_for_tree function
657
+ tree_result, html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
658
+
659
+ if html_path and os.path.exists(html_path):
660
+ # Success - copy the HTML file to a location Gradio can serve
661
+ output_dir = "output"
662
+ os.makedirs(output_dir, exist_ok=True)
663
+
664
+ # Create a safe filename
665
+ safe_seq_name = re.sub(r'[^a-zA-Z0-9_-]', '', processed_sequence[:20])
666
+ timestamp = str(int(time.time()))
667
+ html_filename = f"tree_{safe_seq_name}_{timestamp}.html"
668
+ final_html_path = os.path.join(output_dir, html_filename)
669
+
670
+ # Copy the HTML file
671
+ shutil.copy2(html_path, final_html_path)
672
+ html_file = final_html_path
673
+
674
+ # Read HTML content for display
675
+ with open(html_path, 'r', encoding='utf-8') as f:
676
+ tree_html_content = f.read()
677
+
678
+ simplified_ml_output = tree_result
679
+ logging.info("Simplified ML tree analysis completed successfully")
680
+ else:
681
+ simplified_ml_output = tree_result # This contains the error message
682
+ logging.warning(f"Simplified ML tree analysis failed: {tree_result}")
683
+
684
+ except Exception as e:
685
+ simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
686
+ logging.error(f"Simplified ML tree analysis failed: {e}")
687
  else:
688
+ if not analyzer:
689
+ simplified_ml_output = "❌ Tree analyzer not available"
690
+ elif not processed_sequence:
691
+ simplified_ml_output = " No sequence to analyze"
 
 
 
 
692
  else:
693
+ simplified_ml_output = " Sequence too short for tree analysis (minimum 10 bp)"
694
+
695
+ # Prepare summary
696
+ summary = f"""
697
+ === ANALYSIS SUMMARY ===
698
+ Input Length: {len(dna_input)} bp
699
+ F Gene Length: {len(processed_sequence)} bp
700
+ F Gene Validation: {keras_output}
701
+ Phylogenetic Analysis: {'✅ Completed' if html_file else '❌ Not performed or failed'}
 
 
 
702
  """
703
 
704
  return (
705
  boundary_output,
706
+ keras_output,
707
  ml_tree_output,
708
+ simplified_ml_output,
709
+ summary,
710
+ aligned_file,
711
+ phy_file,
712
+ html_file,
713
+ tree_html_content
714
  )
715
 
716
  except Exception as e:
717
+ error_msg = f"Pipeline failed: {str(e)}"
718
+ logging.error(error_msg)
719
+ return error_msg, "", "", "", "", None, None, None, "Pipeline error occurred"
720
+
721
+ # --- API-Compatible Wrapper Function ---
722
def run_pipeline_api(dna_input, similarity_score=95.0, build_ml_tree=False):
    """
    JSON-friendly wrapper around ``run_pipeline``.

    Text results are passed through (with a placeholder when empty), file
    results are reduced to an "Available" / "Not generated" status, and the
    HTML tree is cut to a 1000-character preview. Any exception yields a
    dict with an ``error`` entry and "Error" file statuses.
    """
    def _status(path):
        # A file counts as available only if a path was returned and it exists.
        if path and os.path.exists(path):
            return "Available"
        return "Not generated"

    # (response key, placeholder) for the first five pipeline results, in order.
    text_fields = (
        ("boundary_analysis", "No boundary analysis"),
        ("f_gene_validation", "No F gene validation"),
        ("ml_tree_analysis", "No ML tree analysis"),
        ("simplified_tree_analysis", "No simplified analysis"),
        ("summary", "No summary"),
    )
    # (response key, result position) for the three file outputs.
    file_fields = (
        ("aligned_file_status", 5),
        ("phylogenetic_file_status", 6),
        ("html_tree_status", 7),
    )

    try:
        results = run_pipeline(dna_input, similarity_score, build_ml_tree)

        response = {}
        for position, (key, placeholder) in enumerate(text_fields):
            response[key] = results[position] or placeholder

        for key, position in file_fields:
            response[key] = _status(results[position])

        # HTML content (truncated for API)
        preview = results[8] or "No HTML content"
        if len(preview) > 1000:
            preview = preview[:1000] + "... [truncated for API response]"
        response["html_preview"] = preview

        return response

    except Exception as e:
        failure = {"error": f"API pipeline failed: {str(e)}"}
        for key, _ in text_fields:
            failure[key] = ""
        for key, _ in file_fields:
            failure[key] = "Error"
        failure["html_preview"] = ""
        return failure
772
 
773
+ # --- File Upload Handler ---
774
def handle_file_upload_api(file_content):
    """API-compatible file upload handler

    Extracts one cleaned nucleotide string from FASTA (or bare sequence) text.

    Args:
        file_content: File contents as ``str``, or UTF-8 encoded ``bytes`` /
            ``bytearray``. Other objects are converted with ``str()``.

    Returns:
        str: All non-header lines joined together, upper-cased, with
        whitespace removed and every character outside ``ACTGN`` replaced by
        ``N``. Returns "No file provided" for empty input and
        "File processing error: ..." if the content cannot be processed.
    """
    try:
        if not file_content:
            return "No file provided"

        # Try to decode if it's bytes; utf-8-sig also drops a leading BOM,
        # which would otherwise hide the '>' of the first header line.
        if isinstance(file_content, (bytes, bytearray)):
            content = bytes(file_content).decode('utf-8-sig')
        else:
            content = str(file_content).lstrip('\ufeff')

        # Extract sequence from FASTA format
        sequence_parts = []
        for line in content.splitlines():
            line = line.strip()
            # Strip first so indented headers are still recognised as headers.
            if not line or line.startswith('>'):
                continue
            # Whitespace inside a sequence line is layout, not data.
            sequence_parts.append(''.join(line.split()))

        # Clean sequence
        sequence = ''.join(sequence_parts).upper()
        return ''.join(c if c in 'ACTGN' else 'N' for c in sequence)

    except Exception as e:
        return f"File processing error: {str(e)}"
 
799
 
800
+ # --- Create Gradio Interface ---
801
def create_interface():
    """Build and return the main Gradio Blocks application.

    Layout: an input column (sequence box, FASTA upload, similarity slider,
    ML-tree checkbox, analyze/clear buttons) beside a results column with
    three tabs (text results, interactive tree, downloads). The analyze
    button runs ``run_pipeline``; uploading a file fills the sequence box
    through ``read_fasta_file``.
    """

    # Custom CSS for better appearance
    css = """
    .gradio-container {
        max-width: 1200px !important;
        margin: auto !important;
    }
    .output-html {
        height: 600px !important;
        overflow: auto !important;
    }
    """

    # --- Event callbacks (plain functions, wired to components below) ---
    def handle_file_upload(file_obj):
        # Uploaded file -> sequence text; nothing uploaded -> empty box.
        if not file_obj:
            return ""
        return read_fasta_file(file_obj)

    def clear_all():
        # One reset value per component, in the order used by clear_btn.click.
        input_defaults = ("", None, 95, False)
        text_defaults = ("", "", "", "", "")
        tree_default = ("",)
        download_defaults = (None, None, None)
        return input_defaults + text_defaults + tree_default + download_defaults

    with gr.Blocks(css=css, title="F Gene Analysis Pipeline") as app:
        gr.Markdown("""
        # 🧬 F Gene Analysis Pipeline

        **Comprehensive F gene boundary detection, validation, and phylogenetic analysis**

        This tool performs:
        1. **Boundary Detection**: Extracts F gene sequences from input DNA
        2. **F Gene Validation**: Validates extracted sequences using ML models
        3. **Phylogenetic Analysis**: Places sequences in evolutionary context
        4. **Interactive Trees**: Generates interactive phylogenetic visualizations
        """)

        with gr.Row():
            # Left column: inputs, parameters and action buttons
            with gr.Column(scale=2):
                gr.Markdown("### 📥 Input")

                sequence_box = gr.Textbox(
                    label="DNA Sequence",
                    placeholder="Enter DNA sequence (ACTG) or upload FASTA file...",
                    lines=5,
                    max_lines=10,
                )

                fasta_upload = gr.File(
                    label="Upload FASTA File (optional)",
                    file_types=[".fasta", ".fa", ".txt"],
                    type="filepath",
                )

                gr.Markdown("### ⚙️ Parameters")

                similarity = gr.Slider(
                    minimum=50,
                    maximum=99,
                    value=95,
                    step=1,
                    label="Similarity Threshold (%)",
                    info="Minimum similarity for phylogenetic grouping",
                )

                ml_tree_toggle = gr.Checkbox(
                    label="Build Maximum Likelihood Tree",
                    value=False,
                    info="Requires MAFFT and IQ-TREE (slower but more accurate)",
                )

                run_button = gr.Button("🔬 Analyze Sequence", variant="primary", size="lg")
                reset_button = gr.Button("🗑️ Clear", variant="secondary")

            # Right column: tabbed results
            with gr.Column(scale=3):
                gr.Markdown("### 📊 Results")

                with gr.Tabs():
                    with gr.TabItem("📈 Analysis Results"):
                        boundary_box = gr.Textbox(
                            label="1. Boundary Detection & F Gene Extraction",
                            lines=3,
                            interactive=False,
                        )

                        validation_box = gr.Textbox(
                            label="2. F Gene Validation",
                            lines=2,
                            interactive=False,
                        )

                        ml_box = gr.Textbox(
                            label="3. Maximum Likelihood Tree Analysis",
                            lines=4,
                            interactive=False,
                        )

                        simplified_box = gr.Textbox(
                            label="4. Simplified Phylogenetic Analysis",
                            lines=3,
                            interactive=False,
                        )

                        summary_box = gr.Textbox(
                            label="📋 Summary",
                            lines=4,
                            interactive=False,
                        )

                    with gr.TabItem("🌳 Interactive Tree"):
                        tree_view = gr.HTML(
                            label="Phylogenetic Tree Visualization",
                            elem_classes=["output-html"],
                        )

                    with gr.TabItem("📁 Downloads"):
                        gr.Markdown("### Available Downloads")

                        aligned_download = gr.File(
                            label="Aligned Sequences (FASTA)",
                            interactive=False,
                        )

                        tree_download = gr.File(
                            label="Phylogenetic Tree (Newick)",
                            interactive=False,
                        )

                        html_download = gr.File(
                            label="Interactive Tree (HTML)",
                            interactive=False,
                        )

        # Component groups shared by the handlers below.
        text_results = [boundary_box, validation_box, ml_box, simplified_box, summary_box]
        downloads = [aligned_download, tree_download, html_download]

        # File upload handler
        fasta_upload.change(
            fn=handle_file_upload,
            inputs=[fasta_upload],
            outputs=[sequence_box],
        )

        # Main analysis handler: output order matches run_pipeline's return tuple
        # (five texts, three files, then the tree HTML).
        run_button.click(
            fn=run_pipeline,
            inputs=[sequence_box, similarity, ml_tree_toggle],
            outputs=text_results + downloads + [tree_view],
        )

        # Clear handler: output order matches clear_all's return tuple
        # (four inputs, five texts, the tree HTML, then three files).
        reset_button.click(
            fn=clear_all,
            outputs=(
                [sequence_box, fasta_upload, similarity, ml_tree_toggle]
                + text_results
                + [tree_view]
                + downloads
            ),
        )

        # Footer
        gr.Markdown("""
        ---
        **💡 Tips:**
        - For best results, use sequences > 100 bp
        - ML tree analysis requires external tools (MAFFT, IQ-TREE)
        - Interactive trees work best with 10-100 sequences
        - API endpoint available at `/api/predict/`
        """)

    return app
1007
 
1008
+ # --- API Interface Creation ---
1009
def create_api_interface():
    """Build the API-only ``gr.Interface`` around ``run_pipeline_api``.

    Takes a sequence, a similarity threshold and an ML-tree flag, and shows
    the returned dict in a JSON component.
    """
    # Positional inputs, in the order run_pipeline_api expects them.
    request_fields = [
        gr.Textbox(label="DNA Sequence", placeholder="Enter DNA sequence..."),
        gr.Slider(minimum=50, maximum=99, value=95, label="Similarity Threshold (%)"),
        gr.Checkbox(label="Build ML Tree", value=False),
    ]
    response_view = gr.JSON(label="Analysis Results")

    return gr.Interface(
        fn=run_pipeline_api,
        inputs=request_fields,
        outputs=response_view,
        title="F Gene Analysis API",
        description="API endpoint for F gene analysis pipeline",
        allow_flagging="never",
    )
1026
 
1027
+ # --- Main Application Setup ---
1028
if __name__ == "__main__":
    # Settings shared by the primary launch and the fallback launch.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )

    # Create the main interface
    main_app = create_interface()

    # Create API interface
    api_app = create_api_interface()

    try:
        # Mount both interfaces as tabs of one app
        app = gr.TabbedInterface(
            [main_app, api_app],
            ["Main Interface", "API"],
            title="F Gene Analysis Pipeline"
        )

        # NOTE: launch() is not given `enable_api` / `api_open`. They are not
        # launch() keywords, so passing them raised TypeError and this branch
        # always fell through to the fallback below.
        app.launch(**launch_options)

    except Exception as e:
        logging.error(f"Failed to launch with API: {e}")
        logging.info("Falling back to main interface without API...")

        # Fallback: launch the main interface on its own
        main_app.launch(**launch_options)