re-type committed on
Commit
664ad2e
·
verified ·
1 Parent(s): a780000

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -216
app.py CHANGED
@@ -22,8 +22,6 @@ from Bio.SeqRecord import SeqRecord
22
  import stat
23
  import time
24
  import asyncio
25
-
26
- # FastAPI imports
27
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
28
  from fastapi.responses import HTMLResponse
29
  from pydantic import BaseModel
@@ -43,20 +41,16 @@ app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
43
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
44
  log_handler = logging.StreamHandler()
45
  log_handler.setFormatter(log_formatter)
46
-
47
- # File handler with error handling
48
  try:
49
  file_handler = logging.FileHandler('/tmp/app.log')
50
  file_handler.setFormatter(log_formatter)
51
  logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
52
  except Exception:
53
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
54
-
55
  logger = logging.getLogger(__name__)
56
 
57
  # --- Global Variables ---
58
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
59
- MODELS_DIR = os.path.join(BASE_DIR, "models") # Local models directory
60
  MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
61
  IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
62
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
@@ -64,10 +58,10 @@ TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
64
  QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
65
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
66
 
67
- # --- Paths ---
68
- model_repo = "GGproject10/best_boundary_aware_model"
69
- csv_path = os.path.join(BASE_DIR, "f_cleaned.csv")
70
- hf_token = os.getenv("HF_TOKEN")
71
 
72
  # Initialize models as None
73
  boundary_model = None
@@ -75,84 +69,58 @@ keras_model = None
75
  kmer_to_index = None
76
  analyzer = None
77
 
78
- # --- Enhanced Model Loading with Local Priority ---
79
  def load_models_safely():
80
  global boundary_model, keras_model, kmer_to_index, analyzer
 
81
 
82
- logger.info(f"🔍 Looking for models in: {MODELS_DIR}")
83
- logger.info(f"📁 Models directory exists: {os.path.exists(MODELS_DIR)}")
84
-
85
- if os.path.exists(MODELS_DIR):
86
- logger.info(f"📂 Contents of models directory: {os.listdir(MODELS_DIR)}")
87
 
88
- # Load Boundary Model - Try local first, then HF
89
  try:
90
- # Local model paths
91
- local_boundary_path = os.path.join(MODELS_DIR, "best_boundary_aware_model.pth")
92
-
93
- if os.path.exists(local_boundary_path):
94
- logger.info(f"✅ Loading boundary model from local path: {local_boundary_path}")
95
- boundary_model = EnhancedGenePredictor(local_boundary_path)
96
- logger.info("✅ Boundary model loaded successfully from local directory")
97
- elif hf_token:
98
- logger.info("🌐 Attempting to load boundary model from Hugging Face...")
99
- boundary_path = hf_hub_download(
100
- repo_id=model_repo,
101
- filename="best_boundary_aware_model.pth",
102
- token=hf_token,
103
- cache_dir="/tmp/hf_cache"
104
- )
105
- if os.path.exists(boundary_path):
106
- boundary_model = EnhancedGenePredictor(boundary_path)
107
- logger.info("✅ Boundary model loaded successfully from HF")
108
- else:
109
- logger.warning("❌ Boundary model file not found after HF download")
110
  else:
111
- logger.warning("❌ No local boundary model found and no HF_TOKEN available")
112
  except Exception as e:
113
  logger.error(f"❌ Failed to load boundary model: {e}")
114
  boundary_model = None
115
 
116
- # Load Keras Model - Try local first, then HF
117
  try:
118
- # Local model paths
119
- local_keras_path = os.path.join(MODELS_DIR, "best_model.keras")
120
- local_kmer_path = os.path.join(MODELS_DIR, "kmer_to_index.pkl")
121
-
122
- if os.path.exists(local_keras_path) and os.path.exists(local_kmer_path):
123
- logger.info(f"✅ Loading Keras model from local paths:")
124
- logger.info(f" - Keras model: {local_keras_path}")
125
- logger.info(f" - K-mer index: {local_kmer_path}")
126
-
127
- keras_model = load_model(local_keras_path)
128
- with open(local_kmer_path, "rb") as f:
 
 
 
 
 
 
 
129
  kmer_to_index = pickle.load(f)
130
- logger.info("✅ Keras model loaded successfully from local directory")
131
-
132
- elif hf_token:
133
- logger.info("🌐 Attempting to load Keras model from Hugging Face...")
134
- keras_path = hf_hub_download(
135
- repo_id=model_repo,
136
- filename="best_model.keras",
137
- token=hf_token,
138
- cache_dir="/tmp/hf_cache"
139
- )
140
- kmer_path = hf_hub_download(
141
- repo_id=model_repo,
142
- filename="kmer_to_index.pkl",
143
- token=hf_token,
144
- cache_dir="/tmp/hf_cache"
145
- )
146
-
147
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
148
- keras_model = load_model(keras_path)
149
- with open(kmer_path, "rb") as f:
150
- kmer_to_index = pickle.load(f)
151
- logger.info("✅ Keras model loaded successfully from HF")
152
- else:
153
- logger.warning("❌ Keras model files not found after HF download")
154
  else:
155
- logger.warning("❌ No local Keras model found and no HF_TOKEN available")
156
  except Exception as e:
157
  logger.error(f"❌ Failed to load Keras model: {e}")
158
  keras_model = None
@@ -162,38 +130,22 @@ def load_models_safely():
162
  try:
163
  logger.info("🌳 Initializing tree analyzer...")
164
  analyzer = PhylogeneticTreeAnalyzer()
165
-
166
- # Try multiple CSV locations
167
- csv_candidates = [
168
- csv_path,
169
- os.path.join(BASE_DIR, "f cleaned.csv"),
170
- "f_cleaned.csv",
171
- os.path.join(BASE_DIR, "data", "f_cleaned.csv"),
172
- os.path.join(MODELS_DIR, "f_cleaned.csv") # Also check models directory
173
- ]
174
-
175
- csv_loaded = False
176
- for csv_candidate in csv_candidates:
177
- if os.path.exists(csv_candidate):
178
- try:
179
- logger.info(f"📊 Trying to load CSV from: {csv_candidate}")
180
- if analyzer.load_data(csv_candidate):
181
- logger.info(f"✅ Tree analyzer loaded CSV from: {csv_candidate}")
182
- csv_loaded = True
183
- break
184
- except Exception as e:
185
- logger.warning(f"Failed to load CSV from {csv_candidate}: {e}")
186
- continue
187
-
188
- if not csv_loaded:
189
- logger.error("❌ Failed to load CSV data from any location")
190
- logger.info("📂 Available files in base directory:")
191
- try:
192
- for file in os.listdir(BASE_DIR):
193
- if file.endswith('.csv'):
194
- logger.info(f" - {file}")
195
- except:
196
- pass
197
  analyzer = None
198
  except Exception as e:
199
  logger.error(f"❌ Failed to initialize tree analyzer: {e}")
@@ -214,12 +166,9 @@ def setup_binary_permissions():
214
 
215
  def check_tool_availability():
216
  setup_binary_permissions()
217
-
218
- # Check MAFFT
219
  mafft_available = False
220
  mafft_cmd = None
221
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
222
-
223
  for candidate in mafft_candidates:
224
  if shutil.which(candidate) or os.path.exists(candidate):
225
  try:
@@ -236,12 +185,9 @@ def check_tool_availability():
236
  break
237
  except Exception as e:
238
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
239
-
240
- # Check IQ-TREE
241
  iqtree_available = False
242
  iqtree_cmd = None
243
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
244
-
245
  for candidate in iqtree_candidates:
246
  if shutil.which(candidate) or os.path.exists(candidate):
247
  try:
@@ -258,46 +204,36 @@ def check_tool_availability():
258
  break
259
  except Exception as e:
260
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
261
-
262
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
263
 
264
- # --- Pipeline Functions (keeping your original logic) ---
265
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
266
  try:
267
  if len(sequence.strip()) < 100:
268
  return False, "Sequence too short (<100 bp).", None, None
269
-
270
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
271
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
272
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
273
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
274
-
275
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
276
  return False, "Reference alignment or tree not found.", None, None
277
-
278
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
279
  SeqIO.write([query_record], query_fasta, "fasta")
280
-
281
  with open(aligned_with_query, "w") as output_file:
282
  subprocess.run([
283
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
284
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
285
-
286
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
287
  return False, "MAFFT alignment failed.", None, None
288
-
289
  subprocess.run([
290
  iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
291
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
292
  ], capture_output=True, text=True, timeout=1200, check=True)
293
-
294
  treefile = f"{output_prefix}.treefile"
295
  if not os.path.exists(treefile):
296
  return False, "IQ-TREE placement failed.", aligned_with_query, None
297
-
298
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
299
  return True, success_msg, aligned_with_query, treefile
300
-
301
  except Exception as e:
302
  logger.error(f"Phylogenetic placement failed: {e}")
303
  return False, f"Error: {str(e)}", None, None
@@ -312,18 +248,14 @@ def predict_with_keras(sequence):
312
  try:
313
  if not keras_model or not kmer_to_index:
314
  return "❌ Keras model not available."
315
-
316
  if len(sequence) < 6:
317
  return "❌ Sequence too short (<6 bp)."
318
-
319
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
320
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
321
  input_arr = np.array([indices])
322
-
323
  prediction = keras_model.predict(input_arr, verbose=0)[0]
324
  f_gene_prob = prediction[-1]
325
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
326
-
327
  return f"✅ {percentage}% F gene confidence"
328
  except Exception as e:
329
  logger.error(f"Keras prediction failed: {e}")
@@ -334,14 +266,9 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
334
  dna_input = dna_input.upper().strip()
335
  if not dna_input:
336
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
337
-
338
- # Clean sequence
339
  if not re.match('^[ACTGN]+$', dna_input):
340
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
341
-
342
  processed_sequence = dna_input
343
-
344
- # Boundary prediction
345
  boundary_output = ""
346
  if boundary_model:
347
  try:
@@ -358,15 +285,10 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
358
  processed_sequence = dna_input
359
  else:
360
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
361
-
362
- # Keras prediction
363
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
364
-
365
- # ML Tree (keeping your original logic)
366
  aligned_file = None
367
  phy_file = None
368
  ml_tree_output = ""
369
-
370
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
371
  try:
372
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
@@ -383,29 +305,23 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
383
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
384
  else:
385
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
386
-
387
- # Tree analysis
388
  tree_html_content = "No tree generated."
389
  report_html_content = "No report generated."
390
  simplified_ml_output = ""
391
-
392
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
393
  try:
394
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
395
  simplified_ml_output = tree_result
396
-
397
  if tree_html_path and os.path.exists(tree_html_path):
398
  with open(tree_html_path, 'r', encoding='utf-8') as f:
399
  tree_html_content = f.read()
400
  else:
401
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
402
-
403
  if report_html_path and os.path.exists(report_html_path):
404
  with open(report_html_path, 'r', encoding='utf-8') as f:
405
  report_html_content = f.read()
406
  else:
407
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
408
-
409
  except Exception as e:
410
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
411
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
@@ -414,8 +330,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
414
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
415
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
416
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
417
-
418
- # Summary
419
  summary_output = f"""
420
  📊 ANALYSIS SUMMARY:
421
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -426,49 +340,37 @@ Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skippe
426
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
427
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
428
  """
429
-
430
  return (
431
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
432
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
433
  )
434
-
435
  except Exception as e:
436
  logger.error(f"Pipeline error: {e}")
437
  error_msg = f"❌ Pipeline Error: {str(e)}"
438
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
439
 
440
- # Keep your other functions (analyze_sequence_for_tree, build_maximum_likelihood_tree, etc.)
441
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
442
  try:
443
  if not analyzer:
444
  return "❌ Tree analyzer not initialized.", None, None
445
-
446
  if not sequence or len(sequence.strip()) < 10:
447
  return "❌ Invalid sequence.", None, None
448
-
449
  if not (1 <= matching_percentage <= 99):
450
  return "❌ Matching percentage must be 1-99.", None, None
451
-
452
  if not analyzer.find_query_sequence(sequence):
453
  return "❌ Sequence not accepted.", None, None
454
-
455
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
456
  if not matched_ids:
457
  return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
458
-
459
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
460
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
461
-
462
  query_id = analyzer.query_id or f"query_{int(time.time())}"
463
  tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
464
  fig.write_html(tree_html_path)
465
-
466
  analyzer.matching_percentage = matching_percentage
467
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
468
  report_html_path = os.path.join("/tmp", f"detailed_report_{query_id}.html") if report_success else None
469
-
470
  return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
471
-
472
  except Exception as e:
473
  logger.error(f"Tree analysis failed: {e}")
474
  return f"❌ Error: {str(e)}", None, None
@@ -477,17 +379,14 @@ def read_fasta_file(file_obj):
477
  try:
478
  if file_obj is None:
479
  return ""
480
-
481
  if isinstance(file_obj, str):
482
  with open(file_obj, "r") as f:
483
  content = f.read()
484
  else:
485
  content = file_obj.read().decode("utf-8")
486
-
487
  lines = content.strip().split("\n")
488
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
489
  return ''.join(seq_lines)
490
-
491
  except Exception as e:
492
  logger.error(f"Failed to read FASTA file: {e}")
493
  return ""
@@ -548,13 +447,11 @@ async def health_check():
548
  },
549
  "paths": {
550
  "base_dir": BASE_DIR,
551
- "models_dir": MODELS_DIR,
552
- "models_dir_exists": os.path.exists(MODELS_DIR),
553
- "csv_path": csv_path,
554
- "csv_exists": os.path.exists(csv_path)
555
  },
556
  "recommendations": {
557
- "models": "Models loaded from local directory" if (boundary_model and keras_model) else "Check models directory",
558
  "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
559
  }
560
  }
@@ -594,9 +491,7 @@ async def analyze_file(
594
  content = await file.read()
595
  temp_file.write(content)
596
  temp_file_path = temp_file.name
597
-
598
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
599
-
600
  return AnalysisResponse(
601
  boundary_output=result[0] or "",
602
  keras_output=result[1] or "",
@@ -633,10 +528,7 @@ def create_gradio_interface():
633
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
634
  """
635
  ) as iface:
636
-
637
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
638
-
639
- # Status display
640
  with gr.Row():
641
  with gr.Column():
642
  status_display = gr.HTML(value=f"""
@@ -648,9 +540,7 @@ def create_gradio_interface():
648
  <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
649
  <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
650
  """)
651
-
652
  with gr.Tabs() as tabs:
653
- # Tab 1: Text Input
654
  with gr.TabItem("📝 Text Input"):
655
  with gr.Row():
656
  with gr.Column(scale=2):
@@ -660,7 +550,6 @@ def create_gradio_interface():
660
  lines=5,
661
  info="Paste your DNA sequence here"
662
  )
663
-
664
  with gr.Column(scale=1):
665
  similarity_score = gr.Slider(
666
  minimum=1,
@@ -670,16 +559,12 @@ def create_gradio_interface():
670
  label="🎯 Similarity Threshold (%)",
671
  info="Minimum similarity for tree analysis"
672
  )
673
-
674
  build_ml_tree = gr.Checkbox(
675
  label="🌲 Build ML Tree",
676
  value=False,
677
  info="Generate phylogenetic placement (slower)"
678
  )
679
-
680
  analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
681
-
682
- # Tab 2: File Upload
683
  with gr.TabItem("📁 File Upload"):
684
  with gr.Row():
685
  with gr.Column(scale=2):
@@ -688,7 +573,6 @@ def create_gradio_interface():
688
  file_types=[".fasta", ".fa", ".fas", ".txt"],
689
  info="Upload a FASTA file containing your sequence"
690
  )
691
-
692
  with gr.Column(scale=1):
693
  file_similarity_score = gr.Slider(
694
  minimum=1,
@@ -697,17 +581,12 @@ def create_gradio_interface():
697
  step=1.0,
698
  label="🎯 Similarity Threshold (%)"
699
  )
700
-
701
  file_build_ml_tree = gr.Checkbox(
702
  label="🌲 Build ML Tree",
703
  value=False
704
  )
705
-
706
  analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
707
-
708
- # Results Section
709
  gr.Markdown("## 📊 Analysis Results")
710
-
711
  with gr.Row():
712
  with gr.Column():
713
  boundary_output = gr.Textbox(
@@ -715,53 +594,41 @@ def create_gradio_interface():
715
  interactive=False,
716
  lines=2
717
  )
718
-
719
  keras_output = gr.Textbox(
720
  label="🧠 F Gene Validation",
721
  interactive=False,
722
  lines=2
723
  )
724
-
725
  with gr.Column():
726
  ml_tree_output = gr.Textbox(
727
  label="🌲 Phylogenetic Placement",
728
  interactive=False,
729
  lines=2
730
  )
731
-
732
  tree_analysis_output = gr.Textbox(
733
  label="🌳 Tree Analysis",
734
  interactive=False,
735
  lines=2
736
  )
737
-
738
- # Summary
739
  summary_output = gr.Textbox(
740
  label="📋 Summary",
741
  interactive=False,
742
  lines=8
743
  )
744
-
745
- # File Downloads
746
  with gr.Row():
747
  aligned_file = gr.File(label="📄 Alignment File", visible=False)
748
  tree_file = gr.File(label="🌲 Tree File", visible=False)
749
-
750
- # Interactive Visualizations
751
  with gr.Tabs():
752
  with gr.TabItem("🌳 Interactive Tree"):
753
  tree_html = gr.HTML(
754
  label="Phylogenetic Tree",
755
  value="<div style='text-align: center; padding: 20px; color: #666;'>No tree generated yet.</div>"
756
  )
757
-
758
  with gr.TabItem("📊 Detailed Report"):
759
  report_html = gr.HTML(
760
  label="Analysis Report",
761
  value="<div style='text-align: center; padding: 20px; color: #666;'>No report generated yet.</div>"
762
  )
763
-
764
- # Event handlers
765
  analyze_btn.click(
766
  fn=run_pipeline,
767
  inputs=[dna_input, similarity_score, build_ml_tree],
@@ -772,7 +639,6 @@ def create_gradio_interface():
772
  tree_html, report_html
773
  ]
774
  )
775
-
776
  analyze_file_btn.click(
777
  fn=run_pipeline_from_file,
778
  inputs=[file_input, file_similarity_score, file_build_ml_tree],
@@ -783,51 +649,38 @@ def create_gradio_interface():
783
  tree_html, report_html
784
  ]
785
  )
786
-
787
- # Examples
788
  gr.Markdown("## 🔬 Example Sequences")
789
-
790
  example_sequences = [
791
  ["ATGGACTTCCAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 90.0, False],
792
  ["ATGAAACAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 85.0, True],
793
  ]
794
-
795
  gr.Examples(
796
  examples=example_sequences,
797
  inputs=[dna_input, similarity_score, build_ml_tree],
798
  label="Click to load example sequences"
799
  )
800
-
801
- # Help Section
802
  with gr.Accordion("❓ Help & Information", open=False):
803
  gr.Markdown("""
804
  ### 🧬 Gene Analysis Pipeline
805
-
806
  This tool performs comprehensive analysis of F gene sequences:
807
-
808
  **🎯 Boundary Detection**: Identifies F gene regions within your sequence
809
  **🧠 F Gene Validation**: Validates sequence as F gene using deep learning
810
  **🌲 Phylogenetic Placement**: Places sequence in reference phylogeny
811
  **🌳 Tree Analysis**: Finds similar sequences and builds interactive trees
812
-
813
  ### 📋 Input Requirements
814
  - DNA sequences in ATCG format
815
  - Minimum 10 bp for basic analysis
816
  - Minimum 100 bp for phylogenetic placement
817
  - FASTA files supported for upload
818
-
819
  ### ⚙️ Parameters
820
  - **Similarity Threshold**: Minimum % similarity for tree analysis (1-99%)
821
  - **Build ML Tree**: Enable phylogenetic placement (requires MAFFT/IQ-TREE)
822
-
823
  ### 📊 Output Files
824
  - Alignment files (.fa format)
825
  - Tree files (.treefile format)
826
  - Interactive HTML visualizations
827
  """)
828
-
829
  return iface
830
-
831
  except Exception as e:
832
  logger.error(f"Failed to create Gradio interface: {e}")
833
  return None
@@ -850,17 +703,13 @@ mount_gradio_app()
850
  # --- Main Application ---
851
  if __name__ == "__main__":
852
  import argparse
853
-
854
  parser = argparse.ArgumentParser(description="🧬 Gene Analysis Pipeline")
855
  parser.add_argument("--host", default="0.0.0.0", help="Host address")
856
  parser.add_argument("--port", type=int, default=7860, help="Port number")
857
  parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
858
  parser.add_argument("--gradio-only", action="store_true", help="Run Gradio interface only")
859
-
860
  args = parser.parse_args()
861
-
862
  if args.gradio_only:
863
- # Run Gradio interface only
864
  logger.info("🚀 Starting Gradio interface only...")
865
  iface = create_gradio_interface()
866
  if iface:
@@ -874,11 +723,9 @@ if __name__ == "__main__":
874
  logger.error("❌ Failed to create Gradio interface")
875
  sys.exit(1)
876
  else:
877
- # Run FastAPI with Gradio mounted
878
  logger.info(f"🚀 Starting Gene Analysis Pipeline on {args.host}:{args.port}")
879
  logger.info("📊 API Documentation: http://localhost:7860/docs")
880
  logger.info("🧬 Gradio Interface: http://localhost:7860/gradio")
881
-
882
  try:
883
  uvicorn.run(
884
  "app:app" if args.reload else app,
 
22
  import stat
23
  import time
24
  import asyncio
 
 
25
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
26
  from fastapi.responses import HTMLResponse
27
  from pydantic import BaseModel
 
41
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
42
  log_handler = logging.StreamHandler()
43
  log_handler.setFormatter(log_formatter)
 
 
44
  try:
45
  file_handler = logging.FileHandler('/tmp/app.log')
46
  file_handler.setFormatter(log_formatter)
47
  logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
48
  except Exception:
49
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
 
50
  logger = logging.getLogger(__name__)
51
 
52
  # --- Global Variables ---
53
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
54
  MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
55
  IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
56
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
 
58
  QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
59
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
60
 
61
+ # --- Hugging Face Repositories ---
62
+ MODEL_REPO = "GGproject10/best_boundary_aware_model"
63
+ DATA_REPO = "GGproject10/simplified_tree_AI"
64
+ HF_TOKEN = os.getenv("HF_TOKEN")
65
 
66
  # Initialize models as None
67
  boundary_model = None
 
69
  kmer_to_index = None
70
  analyzer = None
71
 
72
+ # --- Model Loading ---
73
  def load_models_safely():
74
  global boundary_model, keras_model, kmer_to_index, analyzer
75
+ logger.info("🔍 Loading models and data from Hugging Face repositories")
76
 
77
+ if not HF_TOKEN:
78
+ logger.error(" HF_TOKEN environment variable not set")
79
+ return
 
 
80
 
81
+ # Load Boundary Model
82
  try:
83
+ logger.info(f"🌐 Downloading boundary model from {MODEL_REPO}")
84
+ boundary_path = hf_hub_download(
85
+ repo_id=MODEL_REPO,
86
+ filename="best_boundary_aware_model.pth",
87
+ token=HF_TOKEN,
88
+ cache_dir="/tmp/hf_cache"
89
+ )
90
+ if os.path.exists(boundary_path):
91
+ logger.info(f" Boundary model downloaded to: {boundary_path}")
92
+ boundary_model = EnhancedGenePredictor(boundary_path)
93
+ logger.info("✅ Boundary model loaded successfully")
 
 
 
 
 
 
 
 
 
94
  else:
95
+ logger.warning(f"❌ Boundary model not found at: {boundary_path}")
96
  except Exception as e:
97
  logger.error(f"❌ Failed to load boundary model: {e}")
98
  boundary_model = None
99
 
100
+ # Load Keras Model
101
  try:
102
+ logger.info(f"🌐 Downloading Keras model and kmer index from {MODEL_REPO}")
103
+ keras_path = hf_hub_download(
104
+ repo_id=MODEL_REPO,
105
+ filename="best_model.keras",
106
+ token=HF_TOKEN,
107
+ cache_dir="/tmp/hf_cache"
108
+ )
109
+ kmer_path = hf_hub_download(
110
+ repo_id=MODEL_REPO,
111
+ filename="kmer_to_index.pkl",
112
+ token=HF_TOKEN,
113
+ cache_dir="/tmp/hf_cache"
114
+ )
115
+ if os.path.exists(keras_path) and os.path.exists(kmer_path):
116
+ logger.info(f"✅ Keras model downloaded to: {keras_path}")
117
+ logger.info(f"✅ Kmer index downloaded to: {kmer_path}")
118
+ keras_model = load_model(keras_path)
119
+ with open(kmer_path, "rb") as f:
120
  kmer_to_index = pickle.load(f)
121
+ logger.info("✅ Keras model loaded successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  else:
123
+ logger.warning(f"❌ Keras model files not found: keras={os.path.exists(keras_path)}, kmer={os.path.exists(kmer_path)}")
124
  except Exception as e:
125
  logger.error(f"❌ Failed to load Keras model: {e}")
126
  keras_model = None
 
130
  try:
131
  logger.info("🌳 Initializing tree analyzer...")
132
  analyzer = PhylogeneticTreeAnalyzer()
133
+ logger.info(f"🌐 Downloading CSV from {DATA_REPO}")
134
+ csv_path = hf_hub_download(
135
+ repo_id=DATA_REPO,
136
+ filename="f_cleaned.csv",
137
+ token=HF_TOKEN,
138
+ cache_dir="/tmp/hf_cache"
139
+ )
140
+ if os.path.exists(csv_path):
141
+ logger.info(f"📊 CSV downloaded to: {csv_path}")
142
+ if analyzer.load_data(csv_path):
143
+ logger.info(f"✅ Tree analyzer loaded CSV successfully")
144
+ else:
145
+ logger.error("❌ Failed to load CSV data")
146
+ analyzer = None
147
+ else:
148
+ logger.warning(f"❌ CSV not found at: {csv_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  analyzer = None
150
  except Exception as e:
151
  logger.error(f"❌ Failed to initialize tree analyzer: {e}")
 
166
 
167
  def check_tool_availability():
168
  setup_binary_permissions()
 
 
169
  mafft_available = False
170
  mafft_cmd = None
171
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
 
172
  for candidate in mafft_candidates:
173
  if shutil.which(candidate) or os.path.exists(candidate):
174
  try:
 
185
  break
186
  except Exception as e:
187
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
 
 
188
  iqtree_available = False
189
  iqtree_cmd = None
190
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
 
191
  for candidate in iqtree_candidates:
192
  if shutil.which(candidate) or os.path.exists(candidate):
193
  try:
 
204
  break
205
  except Exception as e:
206
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
 
207
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
208
 
209
+ # --- Pipeline Functions ---
210
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
211
  try:
212
  if len(sequence.strip()) < 100:
213
  return False, "Sequence too short (<100 bp).", None, None
 
214
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
215
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
216
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
217
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
 
218
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
219
  return False, "Reference alignment or tree not found.", None, None
 
220
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
221
  SeqIO.write([query_record], query_fasta, "fasta")
 
222
  with open(aligned_with_query, "w") as output_file:
223
  subprocess.run([
224
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
225
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
 
226
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
227
  return False, "MAFFT alignment failed.", None, None
 
228
  subprocess.run([
229
  iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
230
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
231
  ], capture_output=True, text=True, timeout=1200, check=True)
 
232
  treefile = f"{output_prefix}.treefile"
233
  if not os.path.exists(treefile):
234
  return False, "IQ-TREE placement failed.", aligned_with_query, None
 
235
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
236
  return True, success_msg, aligned_with_query, treefile
 
237
  except Exception as e:
238
  logger.error(f"Phylogenetic placement failed: {e}")
239
  return False, f"Error: {str(e)}", None, None
 
248
  try:
249
  if not keras_model or not kmer_to_index:
250
  return "❌ Keras model not available."
 
251
  if len(sequence) < 6:
252
  return "❌ Sequence too short (<6 bp)."
 
253
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
254
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
255
  input_arr = np.array([indices])
 
256
  prediction = keras_model.predict(input_arr, verbose=0)[0]
257
  f_gene_prob = prediction[-1]
258
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
 
259
  return f"✅ {percentage}% F gene confidence"
260
  except Exception as e:
261
  logger.error(f"Keras prediction failed: {e}")
 
266
  dna_input = dna_input.upper().strip()
267
  if not dna_input:
268
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
 
 
269
  if not re.match('^[ACTGN]+$', dna_input):
270
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
 
271
  processed_sequence = dna_input
 
 
272
  boundary_output = ""
273
  if boundary_model:
274
  try:
 
285
  processed_sequence = dna_input
286
  else:
287
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
 
 
288
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
 
 
289
  aligned_file = None
290
  phy_file = None
291
  ml_tree_output = ""
 
292
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
293
  try:
294
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
 
305
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
306
  else:
307
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
 
 
308
  tree_html_content = "No tree generated."
309
  report_html_content = "No report generated."
310
  simplified_ml_output = ""
 
311
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
312
  try:
313
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
314
  simplified_ml_output = tree_result
 
315
  if tree_html_path and os.path.exists(tree_html_path):
316
  with open(tree_html_path, 'r', encoding='utf-8') as f:
317
  tree_html_content = f.read()
318
  else:
319
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
320
  if report_html_path and os.path.exists(report_html_path):
321
  with open(report_html_path, 'r', encoding='utf-8') as f:
322
  report_html_content = f.read()
323
  else:
324
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
325
  except Exception as e:
326
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
327
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
 
330
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
331
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
332
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
 
 
333
  summary_output = f"""
334
  📊 ANALYSIS SUMMARY:
335
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
340
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
341
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
342
  """
 
343
  return (
344
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
345
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
346
  )
 
347
  except Exception as e:
348
  logger.error(f"Pipeline error: {e}")
349
  error_msg = f"❌ Pipeline Error: {str(e)}"
350
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
351
 
 
352
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
353
  try:
354
  if not analyzer:
355
  return "❌ Tree analyzer not initialized.", None, None
 
356
  if not sequence or len(sequence.strip()) < 10:
357
  return "❌ Invalid sequence.", None, None
 
358
  if not (1 <= matching_percentage <= 99):
359
  return "❌ Matching percentage must be 1-99.", None, None
 
360
  if not analyzer.find_query_sequence(sequence):
361
  return "❌ Sequence not accepted.", None, None
 
362
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
363
  if not matched_ids:
364
  return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
 
365
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
366
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
 
367
  query_id = analyzer.query_id or f"query_{int(time.time())}"
368
  tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
369
  fig.write_html(tree_html_path)
 
370
  analyzer.matching_percentage = matching_percentage
371
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
372
  report_html_path = os.path.join("/tmp", f"detailed_report_{query_id}.html") if report_success else None
 
373
  return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
 
374
  except Exception as e:
375
  logger.error(f"Tree analysis failed: {e}")
376
  return f"❌ Error: {str(e)}", None, None
 
379
  try:
380
  if file_obj is None:
381
  return ""
 
382
  if isinstance(file_obj, str):
383
  with open(file_obj, "r") as f:
384
  content = f.read()
385
  else:
386
  content = file_obj.read().decode("utf-8")
 
387
  lines = content.strip().split("\n")
388
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
389
  return ''.join(seq_lines)
 
390
  except Exception as e:
391
  logger.error(f"Failed to read FASTA file: {e}")
392
  return ""
 
447
  },
448
  "paths": {
449
  "base_dir": BASE_DIR,
450
+ "hf_cache": "/tmp/hf_cache",
451
+ "hf_cache_exists": os.path.exists("/tmp/hf_cache")
 
 
452
  },
453
  "recommendations": {
454
+ "models": "Models loaded from Hugging Face" if (boundary_model and keras_model) else "Check HF_TOKEN and repository",
455
  "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
456
  }
457
  }
 
491
  content = await file.read()
492
  temp_file.write(content)
493
  temp_file_path = temp_file.name
 
494
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
 
495
  return AnalysisResponse(
496
  boundary_output=result[0] or "",
497
  keras_output=result[1] or "",
 
528
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
529
  """
530
  ) as iface:
 
531
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
 
 
532
  with gr.Row():
533
  with gr.Column():
534
  status_display = gr.HTML(value=f"""
 
540
  <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
541
  <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
542
  """)
 
543
  with gr.Tabs() as tabs:
 
544
  with gr.TabItem("📝 Text Input"):
545
  with gr.Row():
546
  with gr.Column(scale=2):
 
550
  lines=5,
551
  info="Paste your DNA sequence here"
552
  )
 
553
  with gr.Column(scale=1):
554
  similarity_score = gr.Slider(
555
  minimum=1,
 
559
  label="🎯 Similarity Threshold (%)",
560
  info="Minimum similarity for tree analysis"
561
  )
 
562
  build_ml_tree = gr.Checkbox(
563
  label="🌲 Build ML Tree",
564
  value=False,
565
  info="Generate phylogenetic placement (slower)"
566
  )
 
567
  analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
 
 
568
  with gr.TabItem("📁 File Upload"):
569
  with gr.Row():
570
  with gr.Column(scale=2):
 
573
  file_types=[".fasta", ".fa", ".fas", ".txt"],
574
  info="Upload a FASTA file containing your sequence"
575
  )
 
576
  with gr.Column(scale=1):
577
  file_similarity_score = gr.Slider(
578
  minimum=1,
 
581
  step=1.0,
582
  label="🎯 Similarity Threshold (%)"
583
  )
 
584
  file_build_ml_tree = gr.Checkbox(
585
  label="🌲 Build ML Tree",
586
  value=False
587
  )
 
588
  analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
 
 
589
  gr.Markdown("## 📊 Analysis Results")
 
590
  with gr.Row():
591
  with gr.Column():
592
  boundary_output = gr.Textbox(
 
594
  interactive=False,
595
  lines=2
596
  )
 
597
  keras_output = gr.Textbox(
598
  label="🧠 F Gene Validation",
599
  interactive=False,
600
  lines=2
601
  )
 
602
  with gr.Column():
603
  ml_tree_output = gr.Textbox(
604
  label="🌲 Phylogenetic Placement",
605
  interactive=False,
606
  lines=2
607
  )
 
608
  tree_analysis_output = gr.Textbox(
609
  label="🌳 Tree Analysis",
610
  interactive=False,
611
  lines=2
612
  )
 
 
613
  summary_output = gr.Textbox(
614
  label="📋 Summary",
615
  interactive=False,
616
  lines=8
617
  )
 
 
618
  with gr.Row():
619
  aligned_file = gr.File(label="📄 Alignment File", visible=False)
620
  tree_file = gr.File(label="🌲 Tree File", visible=False)
 
 
621
  with gr.Tabs():
622
  with gr.TabItem("🌳 Interactive Tree"):
623
  tree_html = gr.HTML(
624
  label="Phylogenetic Tree",
625
  value="<div style='text-align: center; padding: 20px; color: #666;'>No tree generated yet.</div>"
626
  )
 
627
  with gr.TabItem("📊 Detailed Report"):
628
  report_html = gr.HTML(
629
  label="Analysis Report",
630
  value="<div style='text-align: center; padding: 20px; color: #666;'>No report generated yet.</div>"
631
  )
 
 
632
  analyze_btn.click(
633
  fn=run_pipeline,
634
  inputs=[dna_input, similarity_score, build_ml_tree],
 
639
  tree_html, report_html
640
  ]
641
  )
 
642
  analyze_file_btn.click(
643
  fn=run_pipeline_from_file,
644
  inputs=[file_input, file_similarity_score, file_build_ml_tree],
 
649
  tree_html, report_html
650
  ]
651
  )
 
 
652
  gr.Markdown("## 🔬 Example Sequences")
 
653
  example_sequences = [
654
  ["ATGGACTTCCAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 90.0, False],
655
  ["ATGAAACAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 85.0, True],
656
  ]
 
657
  gr.Examples(
658
  examples=example_sequences,
659
  inputs=[dna_input, similarity_score, build_ml_tree],
660
  label="Click to load example sequences"
661
  )
 
 
662
  with gr.Accordion("❓ Help & Information", open=False):
663
  gr.Markdown("""
664
  ### 🧬 Gene Analysis Pipeline
 
665
  This tool performs comprehensive analysis of F gene sequences:
 
666
  **🎯 Boundary Detection**: Identifies F gene regions within your sequence
667
  **🧠 F Gene Validation**: Validates sequence as F gene using deep learning
668
  **🌲 Phylogenetic Placement**: Places sequence in reference phylogeny
669
  **🌳 Tree Analysis**: Finds similar sequences and builds interactive trees
 
670
  ### 📋 Input Requirements
671
  - DNA sequences in ATCG format
672
  - Minimum 10 bp for basic analysis
673
  - Minimum 100 bp for phylogenetic placement
674
  - FASTA files supported for upload
 
675
  ### ⚙️ Parameters
676
  - **Similarity Threshold**: Minimum % similarity for tree analysis (1-99%)
677
  - **Build ML Tree**: Enable phylogenetic placement (requires MAFFT/IQ-TREE)
 
678
  ### 📊 Output Files
679
  - Alignment files (.fa format)
680
  - Tree files (.treefile format)
681
  - Interactive HTML visualizations
682
  """)
 
683
  return iface
 
684
  except Exception as e:
685
  logger.error(f"Failed to create Gradio interface: {e}")
686
  return None
 
703
  # --- Main Application ---
704
  if __name__ == "__main__":
705
  import argparse
 
706
  parser = argparse.ArgumentParser(description="🧬 Gene Analysis Pipeline")
707
  parser.add_argument("--host", default="0.0.0.0", help="Host address")
708
  parser.add_argument("--port", type=int, default=7860, help="Port number")
709
  parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
710
  parser.add_argument("--gradio-only", action="store_true", help="Run Gradio interface only")
 
711
  args = parser.parse_args()
 
712
  if args.gradio_only:
 
713
  logger.info("🚀 Starting Gradio interface only...")
714
  iface = create_gradio_interface()
715
  if iface:
 
723
  logger.error("❌ Failed to create Gradio interface")
724
  sys.exit(1)
725
  else:
 
726
  logger.info(f"🚀 Starting Gene Analysis Pipeline on {args.host}:{args.port}")
727
  logger.info("📊 API Documentation: http://localhost:7860/docs")
728
  logger.info("🧬 Gradio Interface: http://localhost:7860/gradio")
 
729
  try:
730
  uvicorn.run(
731
  "app:app" if args.reload else app,