re-type committed on
Commit
b03fbbd
·
verified ·
1 Parent(s): 4af0cff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -80
app.py CHANGED
@@ -9,7 +9,6 @@ import logging
9
  import numpy as np
10
  from predictor import EnhancedGenePredictor
11
  from tensorflow.keras.models import load_model
12
- # Import the new analyzer
13
  from analyzer import PhylogeneticTreeAnalyzer
14
  import tempfile
15
  import shutil
@@ -23,6 +22,21 @@ from Bio.SeqRecord import SeqRecord
23
  import stat
24
  import time
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # --- Global Variables ---
27
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
28
  MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft") # Updated path
@@ -156,16 +170,15 @@ def check_tool_availability():
156
  mafft_available = False
157
  mafft_cmd = None
158
 
159
- # Updated MAFFT candidates list based on your new API
160
  mafft_candidates = [
161
- MAFFT_PATH, # Primary path from your new API
162
  os.path.join(BASE_DIR, "binaries", "mafft", "mafft"),
163
- os.path.join(BASE_DIR, "binaries", "mafft", "mafft.bat"), # Windows fallback
164
  'mafft',
165
  '/usr/bin/mafft',
166
  '/usr/local/bin/mafft',
167
  os.path.join(BASE_DIR, "binaries", "mafft", "mafftdir", "bin", "mafft"),
168
- # Add potential conda/miniconda paths
169
  os.path.expanduser("~/anaconda3/bin/mafft"),
170
  os.path.expanduser("~/miniconda3/bin/mafft"),
171
  "/opt/conda/bin/mafft",
@@ -176,9 +189,7 @@ def check_tool_availability():
176
  if not candidate:
177
  continue
178
 
179
- # First check if file exists or is in PATH
180
  if os.path.exists(candidate) or shutil.which(candidate):
181
- # Now test actual execution
182
  try:
183
  test_cmd = [candidate, "--help"]
184
  result = subprocess.run(
@@ -196,13 +207,13 @@ def check_tool_availability():
196
  logging.debug(f"MAFFT test failed for {candidate}: {e}")
197
  continue
198
 
199
- # Check IQ-TREE with similar approach
200
  iqtree_available = False
201
  iqtree_cmd = None
202
 
203
  # Updated IQ-TREE candidates list
204
  iqtree_candidates = [
205
- IQTREE_PATH, # Primary path from your new API
206
  'iqtree2',
207
  'iqtree',
208
  'iqtree3',
@@ -210,11 +221,10 @@ def check_tool_availability():
210
  '/usr/local/bin/iqtree2',
211
  '/usr/bin/iqtree',
212
  '/usr/local/bin/iqtree',
213
- 'iqtree2.exe', # Windows
214
- 'iqtree.exe', # Windows
215
- 'iqtree3.exe', # Windows
216
  os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree2"),
217
- # Add potential conda paths
218
  os.path.expanduser("~/anaconda3/bin/iqtree2"),
219
  os.path.expanduser("~/miniconda3/bin/iqtree2"),
220
  "/opt/conda/bin/iqtree2",
@@ -303,7 +313,7 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
303
  if not os.path.exists(TREE_PATH):
304
  return False, f"Reference tree not found: {TREE_PATH}", None, None
305
 
306
- # Save query sequence as FASTA (improved error handling)
307
  try:
308
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
309
  SeqIO.write([query_record], query_fasta, "fasta")
@@ -311,7 +321,7 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
311
  except Exception as e:
312
  return False, f"Error writing query sequence: {e}", None, None
313
 
314
- # Step 1: Add query sequence to reference alignment using MAFFT (improved approach)
315
  logging.info("Adding query sequence to reference alignment...")
316
  try:
317
  with open(aligned_with_query, "w") as output_file:
@@ -319,7 +329,6 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
319
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
320
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
321
 
322
- # Verify alignment file was created and is not empty
323
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
324
  return False, "MAFFT alignment failed: output file is empty", None, None
325
 
@@ -335,7 +344,7 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
335
  except Exception as e:
336
  return False, f"MAFFT execution error: {e}", None, None
337
 
338
- # Step 2: Place sequence in phylogenetic tree using IQ-TREE (improved approach)
339
  logging.info("Placing sequence in phylogenetic tree...")
340
  try:
341
  iqtree_result = subprocess.run([
@@ -385,7 +394,6 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
385
  logging.error(f"Phylogenetic placement failed: {e}")
386
  return False, f"Phylogenetic placement failed: {str(e)}", None, None
387
  finally:
388
- # Clean up temporary query file
389
  if 'query_fasta' in locals() and os.path.exists(query_fasta):
390
  try:
391
  os.unlink(query_fasta)
@@ -397,10 +405,8 @@ def build_maximum_likelihood_tree(f_gene_sequence):
397
  Build maximum likelihood phylogenetic tree using the improved phylogenetic placement approach.
398
  """
399
  try:
400
- # Check tool availability with enhanced detection
401
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
402
 
403
- # Prepare status message
404
  status_msg = "🔍 Checking dependencies...\n"
405
 
406
  if not mafft_available:
@@ -413,7 +419,6 @@ def build_maximum_likelihood_tree(f_gene_sequence):
413
  else:
414
  status_msg += f"✅ IQ-TREE found and tested: {iqtree_cmd}\n"
415
 
416
- # Check for reference files
417
  if not os.path.exists(ALIGNMENT_PATH):
418
  status_msg += f"❌ Reference alignment not found: {ALIGNMENT_PATH}\n"
419
  else:
@@ -424,7 +429,6 @@ def build_maximum_likelihood_tree(f_gene_sequence):
424
  else:
425
  status_msg += f"✅ Reference tree found\n"
426
 
427
- # If any required component is missing, provide installation guide
428
  if not mafft_available or not iqtree_available:
429
  guide = install_dependencies_guide()
430
  return False, f"{status_msg}\n{guide}", None, None
@@ -434,7 +438,6 @@ def build_maximum_likelihood_tree(f_gene_sequence):
434
  status_msg += "Please ensure f_gene_sequences_aligned.fasta and f_gene_sequences.phy.treefile are available."
435
  return False, status_msg, None, None
436
 
437
- # Perform phylogenetic placement using improved method
438
  logging.info("Starting phylogenetic placement...")
439
  placement_success, placement_message, aligned_file, tree_file = phylogenetic_placement(
440
  f_gene_sequence, mafft_cmd, iqtree_cmd
@@ -443,7 +446,6 @@ def build_maximum_likelihood_tree(f_gene_sequence):
443
  if placement_success:
444
  final_message = f"{status_msg}\n{placement_message}"
445
 
446
- # Copy files to standard locations for compatibility
447
  if aligned_file and os.path.exists(aligned_file):
448
  standard_aligned = "query_with_references_aligned.fasta"
449
  shutil.copy2(aligned_file, standard_aligned)
@@ -463,19 +465,9 @@ def build_maximum_likelihood_tree(f_gene_sequence):
463
  logging.error(f"ML tree construction failed: {e}")
464
  return False, f"ML tree construction failed: {str(e)}", None, None
465
 
466
- # --- NEW Tree Analysis Function (Using the new analyzer API) ---
467
- # Replace this part in your analyze_sequence_for_tree function:
468
-
469
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tuple:
470
  """
471
  Analyze sequence and create phylogenetic tree and detailed report using the new analyzer API
472
-
473
- Args:
474
- sequence (str): DNA sequence to analyze
475
- matching_percentage (float): Similarity threshold percentage
476
-
477
- Returns:
478
- tuple: (status_message, tree_html_path, report_html_path)
479
  """
480
  try:
481
  if not analyzer:
@@ -487,16 +479,13 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tupl
487
  if not (1 <= matching_percentage <= 99):
488
  return "❌ Error: Matching percentage must be between 1 and 99.", None, None
489
 
490
- # Validate inputs
491
  sequence = sequence.strip()
492
  if len(sequence) < 10:
493
  return "❌ Error: Invalid or missing sequence. Must be ≥10 nucleotides.", None, None
494
 
495
- # Find query sequence
496
  if not analyzer.find_query_sequence(sequence):
497
  return "❌ Error: Sequence not accepted.", None, None
498
 
499
- # Find similar sequences
500
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
501
 
502
  if not matched_ids:
@@ -504,22 +493,17 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tupl
504
 
505
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.2f}% similarity")
506
 
507
- # Build tree structure
508
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
509
 
510
- # Create interactive tree
511
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
512
 
513
- # Save tree to temporary file
514
  temp_dir = tempfile.gettempdir()
515
  query_id = analyzer.query_id or f"query_{int(time.time())}"
516
  tree_html_path = os.path.join(temp_dir, f'phylogenetic_tree_interactive_{query_id}.html')
517
  fig.write_html(tree_html_path)
518
 
519
- # Ensure the analyzer has the correct user input threshold for the report
520
  analyzer.matching_percentage = matching_percentage
521
 
522
- # Generate detailed report - FIXED: Only pass the two required parameters
523
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
524
  report_html_path = None
525
  if report_success:
@@ -540,7 +524,8 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tupl
540
  logging.error(error_msg)
541
  import traceback
542
  logging.error(f"Full traceback: {traceback.format_exc()}")
543
- return error_msg, None, None# --- Keras Prediction ---
 
544
  def predict_with_keras(sequence):
545
  try:
546
  if not keras_model or not kmer_to_index:
@@ -549,32 +534,26 @@ def predict_with_keras(sequence):
549
  if len(sequence) < 6:
550
  return "Skipped: sequence too short for F gene validation (minimum 6 nucleotides required)."
551
 
552
- # Generate k-mers
553
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
554
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
555
 
556
- # Prepare input
557
  input_arr = np.array([indices])
558
  prediction = keras_model.predict(input_arr, verbose=0)[0]
559
 
560
- # Assume the last value is the F gene probability (adjust index if model outputs differ)
561
- f_gene_prob = prediction[-1] # Take the probability of the F gene class
562
 
563
- # Convert to percentage with a buffer (e.g., add 5% to account for minor mismatches)
564
- percentage = min(100, max(0, int(f_gene_prob * 100 + 5))) # Ensure 0-100% range
565
 
566
  return f"{percentage}% F gene"
567
  except Exception as e:
568
  logging.error(f"Keras prediction failed: {e}")
569
  return f"Keras prediction failed: {str(e)}"
570
 
571
- # --- FASTA Reader ---
572
  def read_fasta_file(file_obj):
573
  try:
574
  if file_obj is None:
575
  return ""
576
 
577
- # Handle file object
578
  if hasattr(file_obj, 'name'):
579
  with open(file_obj.name, "r") as f:
580
  content = f.read()
@@ -588,7 +567,21 @@ def read_fasta_file(file_obj):
588
  logging.error(f"Failed to read FASTA file: {e}")
589
  return ""
590
 
591
- # --- Full Pipeline ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
  def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
593
  try:
594
  dna_input = read_fasta_file(fasta_file_obj)
@@ -602,17 +595,14 @@ def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
602
 
603
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
604
  try:
605
- # Clean input
606
  dna_input = dna_input.upper().strip()
607
  if not dna_input:
608
  return "Empty input", "", "", "", "", None, None, None, None, "No input provided"
609
 
610
- # Sanitize DNA sequence
611
  if not re.match('^[ACTGN]+$', dna_input):
612
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
613
  logging.info("DNA sequence sanitized")
614
 
615
- # Step 1: Boundary Prediction - Extract F gene sequence
616
  processed_sequence = dna_input
617
  boundary_output = ""
618
 
@@ -640,7 +630,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
640
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
641
  processed_sequence = dna_input
642
 
643
- # Step 2: Keras Prediction (F gene validation)
644
  keras_output = ""
645
  if processed_sequence and len(processed_sequence) >= 6:
646
  keras_prediction = predict_with_keras(processed_sequence)
@@ -648,7 +637,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
648
  else:
649
  keras_output = "Skipped: sequence too short for F gene validation"
650
 
651
- # Step 3: Maximum Likelihood Tree (Phylogenetic Placement)
652
  aligned_file = None
653
  phy_file = None
654
  ml_tree_output = ""
@@ -673,7 +661,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
673
  else:
674
  ml_tree_output = "Phylogenetic placement skipped (not requested)"
675
 
676
- # Step 4: Simplified Tree Analysis
677
  tree_html_file = None
678
  report_html_file = None
679
  tree_html_content = "No tree generated"
@@ -684,11 +671,9 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
684
  try:
685
  logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
686
 
687
- # Updated call to analyze_sequence_for_tree
688
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
689
 
690
  if tree_html_path and os.path.exists(tree_html_path):
691
- # Copy tree HTML to output directory
692
  output_dir = "output"
693
  os.makedirs(output_dir, exist_ok=True)
694
  safe_seq_name = re.sub(r'[^a-zA-Z0-9_-]', '', processed_sequence[:20])
@@ -698,28 +683,23 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
698
  shutil.copy2(tree_html_path, tree_html_final_path)
699
  tree_html_file = tree_html_final_path
700
 
701
- # Read tree HTML content for display
702
  with open(tree_html_path, 'r', encoding='utf-8') as f:
703
  tree_html_content = f.read()
704
 
705
- # Clean up temporary tree file
706
  try:
707
  os.unlink(tree_html_path)
708
  except:
709
  pass
710
 
711
  if report_html_path and os.path.exists(report_html_path):
712
- # Copy report HTML to output directory
713
  report_html_filename = f"report_{safe_seq_name}_{timestamp}.html"
714
  report_html_final_path = os.path.join(output_dir, report_html_filename)
715
  shutil.copy2(report_html_path, report_html_final_path)
716
  report_html_file = report_html_final_path
717
 
718
- # Read report HTML content for display
719
  with open(report_html_path, 'r', encoding='utf-8') as f:
720
  report_html_content = f.read()
721
 
722
- # Clean up temporary report file
723
  try:
724
  os.unlink(report_html_path)
725
  except:
@@ -750,7 +730,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
750
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
751
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
752
 
753
- # Final summary
754
  summary_output = f"""
755
  🧬 ANALYSIS SUMMARY:
756
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -784,9 +763,82 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
784
  logging.error(f"Full traceback: {traceback.format_exc()}")
785
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
 
788
- # --- Gradio Interface ---
789
- def create_interface():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790
  """Create and configure the Gradio interface"""
791
 
792
  custom_css = """
@@ -1026,10 +1078,15 @@ def create_interface():
1026
  )
1027
 
1028
  return iface
 
 
 
 
 
 
1029
  # --- Main Execution ---
1030
  if __name__ == "__main__":
1031
  try:
1032
- # Print startup information
1033
  print("🧬 Advanced Gene Analysis Pipeline")
1034
  print("=" * 50)
1035
  print(f"Base Directory: {BASE_DIR}")
@@ -1037,24 +1094,23 @@ if __name__ == "__main__":
1037
  print(f"Keras Model: {'✅ Loaded' if keras_model else '❌ Not Available'}")
1038
  print(f"Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Not Available'}")
1039
 
1040
- # Check tool availability
1041
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
1042
  print(f"MAFFT: {'✅ Available' if mafft_available else '❌ Not Found'}")
1043
  print(f"IQ-TREE: {'✅ Available' if iqtree_available else '❌ Not Found'}")
1044
 
1045
  if not mafft_available or not iqtree_available:
1046
- print("\n⚠️ Warning: Some phylogenetic tools are missing!")
1047
  print("Install with: conda install -c bioconda mafft iqtree")
1048
 
1049
- print("\n🚀 Starting Gradio interface...")
 
 
1050
 
1051
- # Create and launch interface
1052
- iface = create_interface()
1053
- iface.launch(
1054
- share=True, # Set to True if you want to create a public link
1055
- server_name="0.0.0.0", # Allow connections from any IP
1056
- server_port=7860, # Default Gradio port
1057
- show_error=True, # Show errors in the interface
1058
  )
1059
 
1060
  except Exception as e:
@@ -1062,4 +1118,4 @@ if __name__ == "__main__":
1062
  import traceback
1063
  print(f"Error: {e}")
1064
  print(f"Traceback: {traceback.format_exc()}")
1065
- sys.exit(1)
 
9
  import numpy as np
10
  from predictor import EnhancedGenePredictor
11
  from tensorflow.keras.models import load_model
 
12
  from analyzer import PhylogeneticTreeAnalyzer
13
  import tempfile
14
  import shutil
 
22
  import stat
23
  import time
24
 
25
+ # FastAPI imports
26
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form
27
+ from fastapi.responses import HTMLResponse, FileResponse
28
+ from fastapi.staticfiles import StaticFiles
29
+ from pydantic import BaseModel
30
+ from typing import Optional
31
+ import uvicorn
32
+
33
+ # --- FastAPI App Setup ---
34
+ app = FastAPI(
35
+ title="🧬 Advanced Gene Analysis Pipeline",
36
+ description="F Gene Boundary Detection • Validation • Phylogenetic Analysis",
37
+ version="1.0.0"
38
+ )
39
+
40
  # --- Global Variables ---
41
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
42
  MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft") # Updated path
 
170
  mafft_available = False
171
  mafft_cmd = None
172
 
173
+ # Updated MAFFT candidates list
174
  mafft_candidates = [
175
+ MAFFT_PATH,
176
  os.path.join(BASE_DIR, "binaries", "mafft", "mafft"),
177
+ os.path.join(BASE_DIR, "binaries", "mafft", "mafft.bat"),
178
  'mafft',
179
  '/usr/bin/mafft',
180
  '/usr/local/bin/mafft',
181
  os.path.join(BASE_DIR, "binaries", "mafft", "mafftdir", "bin", "mafft"),
 
182
  os.path.expanduser("~/anaconda3/bin/mafft"),
183
  os.path.expanduser("~/miniconda3/bin/mafft"),
184
  "/opt/conda/bin/mafft",
 
189
  if not candidate:
190
  continue
191
 
 
192
  if os.path.exists(candidate) or shutil.which(candidate):
 
193
  try:
194
  test_cmd = [candidate, "--help"]
195
  result = subprocess.run(
 
207
  logging.debug(f"MAFFT test failed for {candidate}: {e}")
208
  continue
209
 
210
+ # Check IQ-TREE
211
  iqtree_available = False
212
  iqtree_cmd = None
213
 
214
  # Updated IQ-TREE candidates list
215
  iqtree_candidates = [
216
+ IQTREE_PATH,
217
  'iqtree2',
218
  'iqtree',
219
  'iqtree3',
 
221
  '/usr/local/bin/iqtree2',
222
  '/usr/bin/iqtree',
223
  '/usr/local/bin/iqtree',
224
+ 'iqtree2.exe',
225
+ 'iqtree.exe',
226
+ 'iqtree3.exe',
227
  os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree2"),
 
228
  os.path.expanduser("~/anaconda3/bin/iqtree2"),
229
  os.path.expanduser("~/miniconda3/bin/iqtree2"),
230
  "/opt/conda/bin/iqtree2",
 
313
  if not os.path.exists(TREE_PATH):
314
  return False, f"Reference tree not found: {TREE_PATH}", None, None
315
 
316
+ # Save query sequence as FASTA
317
  try:
318
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
319
  SeqIO.write([query_record], query_fasta, "fasta")
 
321
  except Exception as e:
322
  return False, f"Error writing query sequence: {e}", None, None
323
 
324
+ # Step 1: Add query sequence to reference alignment using MAFFT
325
  logging.info("Adding query sequence to reference alignment...")
326
  try:
327
  with open(aligned_with_query, "w") as output_file:
 
329
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
330
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
331
 
 
332
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
333
  return False, "MAFFT alignment failed: output file is empty", None, None
334
 
 
344
  except Exception as e:
345
  return False, f"MAFFT execution error: {e}", None, None
346
 
347
+ # Step 2: Place sequence in phylogenetic tree using IQ-TREE
348
  logging.info("Placing sequence in phylogenetic tree...")
349
  try:
350
  iqtree_result = subprocess.run([
 
394
  logging.error(f"Phylogenetic placement failed: {e}")
395
  return False, f"Phylogenetic placement failed: {str(e)}", None, None
396
  finally:
 
397
  if 'query_fasta' in locals() and os.path.exists(query_fasta):
398
  try:
399
  os.unlink(query_fasta)
 
405
  Build maximum likelihood phylogenetic tree using the improved phylogenetic placement approach.
406
  """
407
  try:
 
408
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
409
 
 
410
  status_msg = "🔍 Checking dependencies...\n"
411
 
412
  if not mafft_available:
 
419
  else:
420
  status_msg += f"✅ IQ-TREE found and tested: {iqtree_cmd}\n"
421
 
 
422
  if not os.path.exists(ALIGNMENT_PATH):
423
  status_msg += f"❌ Reference alignment not found: {ALIGNMENT_PATH}\n"
424
  else:
 
429
  else:
430
  status_msg += f"✅ Reference tree found\n"
431
 
 
432
  if not mafft_available or not iqtree_available:
433
  guide = install_dependencies_guide()
434
  return False, f"{status_msg}\n{guide}", None, None
 
438
  status_msg += "Please ensure f_gene_sequences_aligned.fasta and f_gene_sequences.phy.treefile are available."
439
  return False, status_msg, None, None
440
 
 
441
  logging.info("Starting phylogenetic placement...")
442
  placement_success, placement_message, aligned_file, tree_file = phylogenetic_placement(
443
  f_gene_sequence, mafft_cmd, iqtree_cmd
 
446
  if placement_success:
447
  final_message = f"{status_msg}\n{placement_message}"
448
 
 
449
  if aligned_file and os.path.exists(aligned_file):
450
  standard_aligned = "query_with_references_aligned.fasta"
451
  shutil.copy2(aligned_file, standard_aligned)
 
465
  logging.error(f"ML tree construction failed: {e}")
466
  return False, f"ML tree construction failed: {str(e)}", None, None
467
 
 
 
 
468
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tuple:
469
  """
470
  Analyze sequence and create phylogenetic tree and detailed report using the new analyzer API
 
 
 
 
 
 
 
471
  """
472
  try:
473
  if not analyzer:
 
479
  if not (1 <= matching_percentage <= 99):
480
  return "❌ Error: Matching percentage must be between 1 and 99.", None, None
481
 
 
482
  sequence = sequence.strip()
483
  if len(sequence) < 10:
484
  return "❌ Error: Invalid or missing sequence. Must be ≥10 nucleotides.", None, None
485
 
 
486
  if not analyzer.find_query_sequence(sequence):
487
  return "❌ Error: Sequence not accepted.", None, None
488
 
 
489
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
490
 
491
  if not matched_ids:
 
493
 
494
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.2f}% similarity")
495
 
 
496
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
497
 
 
498
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
499
 
 
500
  temp_dir = tempfile.gettempdir()
501
  query_id = analyzer.query_id or f"query_{int(time.time())}"
502
  tree_html_path = os.path.join(temp_dir, f'phylogenetic_tree_interactive_{query_id}.html')
503
  fig.write_html(tree_html_path)
504
 
 
505
  analyzer.matching_percentage = matching_percentage
506
 
 
507
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
508
  report_html_path = None
509
  if report_success:
 
524
  logging.error(error_msg)
525
  import traceback
526
  logging.error(f"Full traceback: {traceback.format_exc()}")
527
+ return error_msg, None, None
528
+
529
  def predict_with_keras(sequence):
530
  try:
531
  if not keras_model or not kmer_to_index:
 
534
  if len(sequence) < 6:
535
  return "Skipped: sequence too short for F gene validation (minimum 6 nucleotides required)."
536
 
 
537
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
538
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
539
 
 
540
  input_arr = np.array([indices])
541
  prediction = keras_model.predict(input_arr, verbose=0)[0]
542
 
543
+ f_gene_prob = prediction[-1]
 
544
 
545
+ percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
 
546
 
547
  return f"{percentage}% F gene"
548
  except Exception as e:
549
  logging.error(f"Keras prediction failed: {e}")
550
  return f"Keras prediction failed: {str(e)}"
551
 
 
552
  def read_fasta_file(file_obj):
553
  try:
554
  if file_obj is None:
555
  return ""
556
 
 
557
  if hasattr(file_obj, 'name'):
558
  with open(file_obj.name, "r") as f:
559
  content = f.read()
 
567
  logging.error(f"Failed to read FASTA file: {e}")
568
  return ""
569
 
570
+ # --- Pydantic Models for FastAPI ---
571
class AnalysisRequest(BaseModel):
    # JSON body accepted by the POST /analyze endpoint.
    # The two defaults below match the defaults of run_pipeline().

    # Raw DNA sequence text; run_pipeline() upper-cases and strips it.
    sequence: str
    # Similarity threshold forwarded to the tree analysis, which rejects
    # values outside the 1..99 range.
    similarity_score: float = 95.0
    # When true, the pipeline also attempts the MAFFT / IQ-TREE placement step.
    build_ml_tree: bool = False
575
+
576
class AnalysisResponse(BaseModel):
    # Response body for the analysis endpoints. The endpoints in this file
    # fill the five text fields from the first five elements of the tuple
    # returned by the pipeline, in this order.

    boundary_output: str
    keras_output: str
    ml_tree_output: str
    tree_analysis_output: str
    summary_output: str

    # Outcome flag for the request.
    success: bool
    # Optional error description; omitted (None) unless a caller sets it.
    error_message: Optional[str] = None
584
+
585
  def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
586
  try:
587
  dna_input = read_fasta_file(fasta_file_obj)
 
595
 
596
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
597
  try:
 
598
  dna_input = dna_input.upper().strip()
599
  if not dna_input:
600
  return "Empty input", "", "", "", "", None, None, None, None, "No input provided"
601
 
 
602
  if not re.match('^[ACTGN]+$', dna_input):
603
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
604
  logging.info("DNA sequence sanitized")
605
 
 
606
  processed_sequence = dna_input
607
  boundary_output = ""
608
 
 
630
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
631
  processed_sequence = dna_input
632
 
 
633
  keras_output = ""
634
  if processed_sequence and len(processed_sequence) >= 6:
635
  keras_prediction = predict_with_keras(processed_sequence)
 
637
  else:
638
  keras_output = "Skipped: sequence too short for F gene validation"
639
 
 
640
  aligned_file = None
641
  phy_file = None
642
  ml_tree_output = ""
 
661
  else:
662
  ml_tree_output = "Phylogenetic placement skipped (not requested)"
663
 
 
664
  tree_html_file = None
665
  report_html_file = None
666
  tree_html_content = "No tree generated"
 
671
  try:
672
  logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
673
 
 
674
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
675
 
676
  if tree_html_path and os.path.exists(tree_html_path):
 
677
  output_dir = "output"
678
  os.makedirs(output_dir, exist_ok=True)
679
  safe_seq_name = re.sub(r'[^a-zA-Z0-9_-]', '', processed_sequence[:20])
 
683
  shutil.copy2(tree_html_path, tree_html_final_path)
684
  tree_html_file = tree_html_final_path
685
 
 
686
  with open(tree_html_path, 'r', encoding='utf-8') as f:
687
  tree_html_content = f.read()
688
 
 
689
  try:
690
  os.unlink(tree_html_path)
691
  except:
692
  pass
693
 
694
  if report_html_path and os.path.exists(report_html_path):
 
695
  report_html_filename = f"report_{safe_seq_name}_{timestamp}.html"
696
  report_html_final_path = os.path.join(output_dir, report_html_filename)
697
  shutil.copy2(report_html_path, report_html_final_path)
698
  report_html_file = report_html_final_path
699
 
 
700
  with open(report_html_path, 'r', encoding='utf-8') as f:
701
  report_html_content = f.read()
702
 
 
703
  try:
704
  os.unlink(report_html_path)
705
  except:
 
730
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
731
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
732
 
 
733
  summary_output = f"""
734
  🧬 ANALYSIS SUMMARY:
735
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
763
  logging.error(f"Full traceback: {traceback.format_exc()}")
764
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
765
 
766
+ # --- FastAPI Endpoints ---
767
@app.get("/")
async def root():
    # Landing payload: identifies the service and points at the API docs.
    landing = dict(
        message="🧬 Advanced Gene Analysis Pipeline API",
        docs="/docs",
    )
    return landing
770
+
771
@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_sequence(request: AnalysisRequest):
    """
    Analyze a DNA sequence through the complete pipeline
    """
    # Everything, including response construction, stays inside the try so
    # that any failure is reported to the client as an HTTP 500.
    try:
        pipeline_result = run_pipeline(
            request.sequence,
            request.similarity_score,
            request.build_ml_tree,
        )

        # Only the first five positions of the pipeline tuple are text
        # outputs exposed by the API; the remaining entries are not returned.
        text_fields = {
            "boundary_output": pipeline_result[0],
            "keras_output": pipeline_result[1],
            "ml_tree_output": pipeline_result[2],
            "tree_analysis_output": pipeline_result[3],
            "summary_output": pipeline_result[4],
        }
        return AnalysisResponse(success=True, **text_fields)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
793
 
794
@app.post("/analyze-file")
async def analyze_file(
    file: UploadFile = File(...),
    similarity_score: float = Form(95.0),
    build_ml_tree: bool = Form(False)
):
    """
    Analyze a FASTA file through the complete pipeline
    """
    # Parameters:
    #   file             - uploaded FASTA file (multipart form field).
    #   similarity_score - similarity threshold passed to the pipeline.
    #   build_ml_tree    - whether the pipeline should run the ML tree step.
    # Returns an AnalysisResponse built from the first five pipeline outputs.
    # Raises HTTPException(500) if any step fails.
    #
    # The upload is spooled to a named temporary file because the pipeline
    # entry point takes a file reference rather than raw bytes.
    # NOTE(review): run_pipeline_from_file() hands its argument to
    # read_fasta_file(), whose visible branch checks for a `.name` attribute;
    # a plain path string does not have one, so confirm the other branch
    # accepts a path.
    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta") as temp_file:
            # Record the path first so cleanup still runs if the read fails.
            temp_file_path = temp_file.name
            content = await file.read()
            temp_file.write(content)

        result = run_pipeline_from_file(
            temp_file_path,
            similarity_score,
            build_ml_tree
        )

        return AnalysisResponse(
            boundary_output=result[0],
            keras_output=result[1],
            ml_tree_output=result[2],
            tree_analysis_output=result[3],
            summary_output=result[4],
            success=True
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # delete=False means nobody else removes the file; do it on every
        # path (success or failure) so failed requests do not leak uploads.
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)
            except OSError as cleanup_error:
                # Best-effort cleanup: a leftover temp file must not turn a
                # completed analysis into an error response.
                logging.warning(
                    f"Could not remove temporary upload {temp_file_path}: {cleanup_error}"
                )
829
+
830
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    # Module-level components whose presence is reported; a component counts
    # as available when its global is not None.
    components = {
        "boundary_model": boundary_model,
        "keras_model": keras_model,
        "tree_analyzer": analyzer,
    }

    # "status" is reported as healthy regardless of the component flags.
    report = {"status": "healthy"}
    for label, component in components.items():
        report[label] = component is not None
    return report
839
+
840
+ # --- Create Gradio Interface ---
841
+ def create_gradio_interface():
842
  """Create and configure the Gradio interface"""
843
 
844
  custom_css = """
 
1078
  )
1079
 
1080
  return iface
1081
+
1082
+ # --- Mount Gradio App in FastAPI ---
1083
+ gradio_app = create_gradio_interface()
1084
+ app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
1085
+
1086
+
1087
  # --- Main Execution ---
1088
  if __name__ == "__main__":
1089
  try:
 
1090
  print("🧬 Advanced Gene Analysis Pipeline")
1091
  print("=" * 50)
1092
  print(f"Base Directory: {BASE_DIR}")
 
1094
  print(f"Keras Model: {'✅ Loaded' if keras_model else '❌ Not Available'}")
1095
  print(f"Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Not Available'}")
1096
 
 
1097
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
1098
  print(f"MAFFT: {'✅ Available' if mafft_available else '❌ Not Found'}")
1099
  print(f"IQ-TREE: {'✅ Available' if iqtree_available else '❌ Not Found'}")
1100
 
1101
  if not mafft_available or not iqtree_available:
1102
+ print("\n⚠️ Warning: Some phylogenetic tools are missing!")
1103
  print("Install with: conda install -c bioconda mafft iqtree")
1104
 
1105
+ print("\n🚀 Starting FastAPI + Gradio server...")
1106
+ print("📖 API Documentation: http://localhost:8000/docs")
1107
+ print("🎨 Gradio Interface: http://localhost:8000/gradio")
1108
 
1109
+ uvicorn.run(
1110
+ app,
1111
+ host="0.0.0.0",
1112
+ port=8000,
1113
+ reload=False # Set to True for development
 
 
1114
  )
1115
 
1116
  except Exception as e:
 
1118
  import traceback
1119
  print(f"Error: {e}")
1120
  print(f"Traceback: {traceback.format_exc()}")
1121
+ sys.exit(1)