re-type committed on
Commit
5ff3d5b
·
verified ·
1 Parent(s): 664ad2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +415 -227
app.py CHANGED
@@ -22,6 +22,8 @@ from Bio.SeqRecord import SeqRecord
22
  import stat
23
  import time
24
  import asyncio
 
 
25
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
26
  from fastapi.responses import HTMLResponse
27
  from pydantic import BaseModel
@@ -41,16 +43,20 @@ app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
41
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
42
  log_handler = logging.StreamHandler()
43
  log_handler.setFormatter(log_formatter)
 
 
44
  try:
45
  file_handler = logging.FileHandler('/tmp/app.log')
46
  file_handler.setFormatter(log_formatter)
47
  logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
48
  except Exception:
49
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
 
50
  logger = logging.getLogger(__name__)
51
 
52
  # --- Global Variables ---
53
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
54
  MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
55
  IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
56
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
@@ -58,10 +64,11 @@ TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
58
  QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
59
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
60
 
61
- # --- Hugging Face Repositories ---
62
- MODEL_REPO = "GGproject10/best_boundary_aware_model"
63
- DATA_REPO = "GGproject10/simplified_tree_AI"
64
- HF_TOKEN = os.getenv("HF_TOKEN")
 
65
 
66
  # Initialize models as None
67
  boundary_model = None
@@ -69,58 +76,84 @@ keras_model = None
69
  kmer_to_index = None
70
  analyzer = None
71
 
72
- # --- Model Loading ---
73
  def load_models_safely():
74
  global boundary_model, keras_model, kmer_to_index, analyzer
75
- logger.info("🔍 Loading models and data from Hugging Face repositories")
76
 
77
- if not HF_TOKEN:
78
- logger.error(" HF_TOKEN environment variable not set")
79
- return
 
 
80
 
81
- # Load Boundary Model
82
  try:
83
- logger.info(f"🌐 Downloading boundary model from {MODEL_REPO}")
84
- boundary_path = hf_hub_download(
85
- repo_id=MODEL_REPO,
86
- filename="best_boundary_aware_model.pth",
87
- token=HF_TOKEN,
88
- cache_dir="/tmp/hf_cache"
89
- )
90
- if os.path.exists(boundary_path):
91
- logger.info(f" Boundary model downloaded to: {boundary_path}")
92
- boundary_model = EnhancedGenePredictor(boundary_path)
93
- logger.info("✅ Boundary model loaded successfully")
 
 
 
 
 
 
 
 
 
94
  else:
95
- logger.warning(f"❌ Boundary model not found at: {boundary_path}")
96
  except Exception as e:
97
  logger.error(f"❌ Failed to load boundary model: {e}")
98
  boundary_model = None
99
 
100
- # Load Keras Model
101
  try:
102
- logger.info(f"🌐 Downloading Keras model and kmer index from {MODEL_REPO}")
103
- keras_path = hf_hub_download(
104
- repo_id=MODEL_REPO,
105
- filename="best_model.keras",
106
- token=HF_TOKEN,
107
- cache_dir="/tmp/hf_cache"
108
- )
109
- kmer_path = hf_hub_download(
110
- repo_id=MODEL_REPO,
111
- filename="kmer_to_index.pkl",
112
- token=HF_TOKEN,
113
- cache_dir="/tmp/hf_cache"
114
- )
115
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
116
- logger.info(f"✅ Keras model downloaded to: {keras_path}")
117
- logger.info(f"✅ Kmer index downloaded to: {kmer_path}")
118
- keras_model = load_model(keras_path)
119
- with open(kmer_path, "rb") as f:
120
  kmer_to_index = pickle.load(f)
121
- logger.info("✅ Keras model loaded successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  else:
123
- logger.warning(f"❌ Keras model files not found: keras={os.path.exists(keras_path)}, kmer={os.path.exists(kmer_path)}")
124
  except Exception as e:
125
  logger.error(f"❌ Failed to load Keras model: {e}")
126
  keras_model = None
@@ -130,22 +163,38 @@ def load_models_safely():
130
  try:
131
  logger.info("🌳 Initializing tree analyzer...")
132
  analyzer = PhylogeneticTreeAnalyzer()
133
- logger.info(f"🌐 Downloading CSV from {DATA_REPO}")
134
- csv_path = hf_hub_download(
135
- repo_id=DATA_REPO,
136
- filename="f_cleaned.csv",
137
- token=HF_TOKEN,
138
- cache_dir="/tmp/hf_cache"
139
- )
140
- if os.path.exists(csv_path):
141
- logger.info(f"📊 CSV downloaded to: {csv_path}")
142
- if analyzer.load_data(csv_path):
143
- logger.info(f"✅ Tree analyzer loaded CSV successfully")
144
- else:
145
- logger.error("❌ Failed to load CSV data")
146
- analyzer = None
147
- else:
148
- logger.warning(f"❌ CSV not found at: {csv_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  analyzer = None
150
  except Exception as e:
151
  logger.error(f"❌ Failed to initialize tree analyzer: {e}")
@@ -166,9 +215,12 @@ def setup_binary_permissions():
166
 
167
  def check_tool_availability():
168
  setup_binary_permissions()
 
 
169
  mafft_available = False
170
  mafft_cmd = None
171
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
 
172
  for candidate in mafft_candidates:
173
  if shutil.which(candidate) or os.path.exists(candidate):
174
  try:
@@ -185,9 +237,12 @@ def check_tool_availability():
185
  break
186
  except Exception as e:
187
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
 
 
188
  iqtree_available = False
189
  iqtree_cmd = None
190
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
 
191
  for candidate in iqtree_candidates:
192
  if shutil.which(candidate) or os.path.exists(candidate):
193
  try:
@@ -204,36 +259,46 @@ def check_tool_availability():
204
  break
205
  except Exception as e:
206
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
 
207
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
208
 
209
- # --- Pipeline Functions ---
210
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
211
  try:
212
  if len(sequence.strip()) < 100:
213
  return False, "Sequence too short (<100 bp).", None, None
 
214
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
215
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
216
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
217
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
 
218
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
219
  return False, "Reference alignment or tree not found.", None, None
 
220
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
221
  SeqIO.write([query_record], query_fasta, "fasta")
 
222
  with open(aligned_with_query, "w") as output_file:
223
  subprocess.run([
224
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
225
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
 
226
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
227
  return False, "MAFFT alignment failed.", None, None
 
228
  subprocess.run([
229
  iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
230
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
231
  ], capture_output=True, text=True, timeout=1200, check=True)
 
232
  treefile = f"{output_prefix}.treefile"
233
  if not os.path.exists(treefile):
234
  return False, "IQ-TREE placement failed.", aligned_with_query, None
 
235
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
236
  return True, success_msg, aligned_with_query, treefile
 
237
  except Exception as e:
238
  logger.error(f"Phylogenetic placement failed: {e}")
239
  return False, f"Error: {str(e)}", None, None
@@ -248,14 +313,18 @@ def predict_with_keras(sequence):
248
  try:
249
  if not keras_model or not kmer_to_index:
250
  return "❌ Keras model not available."
 
251
  if len(sequence) < 6:
252
  return "❌ Sequence too short (<6 bp)."
 
253
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
254
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
255
  input_arr = np.array([indices])
 
256
  prediction = keras_model.predict(input_arr, verbose=0)[0]
257
  f_gene_prob = prediction[-1]
258
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
 
259
  return f"✅ {percentage}% F gene confidence"
260
  except Exception as e:
261
  logger.error(f"Keras prediction failed: {e}")
@@ -266,9 +335,14 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
266
  dna_input = dna_input.upper().strip()
267
  if not dna_input:
268
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
 
 
269
  if not re.match('^[ACTGN]+$', dna_input):
270
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
 
271
  processed_sequence = dna_input
 
 
272
  boundary_output = ""
273
  if boundary_model:
274
  try:
@@ -285,10 +359,15 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
285
  processed_sequence = dna_input
286
  else:
287
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
 
 
288
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
 
 
289
  aligned_file = None
290
  phy_file = None
291
  ml_tree_output = ""
 
292
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
293
  try:
294
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
@@ -305,23 +384,29 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
305
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
306
  else:
307
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
 
 
308
  tree_html_content = "No tree generated."
309
  report_html_content = "No report generated."
310
  simplified_ml_output = ""
 
311
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
312
  try:
313
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
314
  simplified_ml_output = tree_result
 
315
  if tree_html_path and os.path.exists(tree_html_path):
316
  with open(tree_html_path, 'r', encoding='utf-8') as f:
317
  tree_html_content = f.read()
318
  else:
319
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
320
  if report_html_path and os.path.exists(report_html_path):
321
  with open(report_html_path, 'r', encoding='utf-8') as f:
322
  report_html_content = f.read()
323
  else:
324
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
325
  except Exception as e:
326
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
327
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
@@ -330,6 +415,8 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
330
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
331
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
332
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
 
 
333
  summary_output = f"""
334
  📊 ANALYSIS SUMMARY:
335
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -340,37 +427,49 @@ Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skippe
340
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
341
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
342
  """
 
343
  return (
344
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
345
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
346
  )
 
347
  except Exception as e:
348
  logger.error(f"Pipeline error: {e}")
349
  error_msg = f"❌ Pipeline Error: {str(e)}"
350
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
351
 
 
352
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
353
  try:
354
  if not analyzer:
355
  return "❌ Tree analyzer not initialized.", None, None
 
356
  if not sequence or len(sequence.strip()) < 10:
357
  return "❌ Invalid sequence.", None, None
 
358
  if not (1 <= matching_percentage <= 99):
359
  return "❌ Matching percentage must be 1-99.", None, None
 
360
  if not analyzer.find_query_sequence(sequence):
361
  return "❌ Sequence not accepted.", None, None
 
362
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
363
  if not matched_ids:
364
  return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
 
365
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
366
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
 
367
  query_id = analyzer.query_id or f"query_{int(time.time())}"
368
  tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
369
  fig.write_html(tree_html_path)
 
370
  analyzer.matching_percentage = matching_percentage
371
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
372
  report_html_path = os.path.join("/tmp", f"detailed_report_{query_id}.html") if report_success else None
 
373
  return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
 
374
  except Exception as e:
375
  logger.error(f"Tree analysis failed: {e}")
376
  return f"❌ Error: {str(e)}", None, None
@@ -379,14 +478,17 @@ def read_fasta_file(file_obj):
379
  try:
380
  if file_obj is None:
381
  return ""
 
382
  if isinstance(file_obj, str):
383
  with open(file_obj, "r") as f:
384
  content = f.read()
385
  else:
386
  content = file_obj.read().decode("utf-8")
 
387
  lines = content.strip().split("\n")
388
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
389
  return ''.join(seq_lines)
 
390
  except Exception as e:
391
  logger.error(f"Failed to read FASTA file: {e}")
392
  return ""
@@ -447,11 +549,17 @@ async def health_check():
447
  },
448
  "paths": {
449
  "base_dir": BASE_DIR,
450
- "hf_cache": "/tmp/hf_cache",
451
- "hf_cache_exists": os.path.exists("/tmp/hf_cache")
 
 
 
 
 
 
452
  },
453
  "recommendations": {
454
- "models": "Models loaded from Hugging Face" if (boundary_model and keras_model) else "Check HF_TOKEN and repository",
455
  "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
456
  }
457
  }
@@ -491,7 +599,9 @@ async def analyze_file(
491
  content = await file.read()
492
  temp_file.write(content)
493
  temp_file_path = temp_file.name
 
494
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
 
495
  return AnalysisResponse(
496
  boundary_output=result[0] or "",
497
  keras_output=result[1] or "",
@@ -514,7 +624,7 @@ async def analyze_file(
514
  except:
515
  pass
516
 
517
- # --- Enhanced Gradio Interface ---
518
  def create_gradio_interface():
519
  try:
520
  with gr.Blocks(
@@ -528,7 +638,10 @@ def create_gradio_interface():
528
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
529
  """
530
  ) as iface:
 
531
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
 
 
532
  with gr.Row():
533
  with gr.Column():
534
  status_display = gr.HTML(value=f"""
@@ -536,206 +649,281 @@ def create_gradio_interface():
536
  <h3>🔧 System Status</h3>
537
  <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
538
  <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
539
- <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}</p>
540
- <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
541
- <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
542
  """)
543
- with gr.Tabs() as tabs:
 
 
544
  with gr.TabItem("📝 Text Input"):
545
- with gr.Row():
546
- with gr.Column(scale=2):
547
- dna_input = gr.Textbox(
548
- label="🧬 DNA Sequence",
549
- placeholder="Enter DNA sequence (ATCG format)...",
550
- lines=5,
551
- info="Paste your DNA sequence here"
552
- )
553
- with gr.Column(scale=1):
554
- similarity_score = gr.Slider(
555
- minimum=1,
556
- maximum=99,
557
- value=95.0,
558
- step=1.0,
559
- label="🎯 Similarity Threshold (%)",
560
- info="Minimum similarity for tree analysis"
561
- )
562
- build_ml_tree = gr.Checkbox(
563
- label="🌲 Build ML Tree",
564
- value=False,
565
- info="Generate phylogenetic placement (slower)"
566
- )
567
- analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
568
  with gr.TabItem("📁 File Upload"):
569
- with gr.Row():
570
- with gr.Column(scale=2):
571
- file_input = gr.File(
572
- label="📄 Upload FASTA File",
573
- file_types=[".fasta", ".fa", ".fas", ".txt"],
574
- info="Upload a FASTA file containing your sequence"
575
- )
576
- with gr.Column(scale=1):
577
- file_similarity_score = gr.Slider(
578
- minimum=1,
579
- maximum=99,
580
- value=95.0,
581
- step=1.0,
582
- label="🎯 Similarity Threshold (%)"
583
- )
584
- file_build_ml_tree = gr.Checkbox(
585
- label="🌲 Build ML Tree",
586
- value=False
587
- )
588
- analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
589
- gr.Markdown("## 📊 Analysis Results")
590
  with gr.Row():
591
  with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
  boundary_output = gr.Textbox(
593
- label="🎯 Boundary Detection",
594
- interactive=False,
595
- lines=2
596
  )
 
 
597
  keras_output = gr.Textbox(
598
- label="🧠 F Gene Validation",
599
- interactive=False,
600
- lines=2
601
  )
602
- with gr.Column():
 
603
  ml_tree_output = gr.Textbox(
604
- label="🌲 Phylogenetic Placement",
605
- interactive=False,
606
- lines=2
607
  )
 
 
608
  tree_analysis_output = gr.Textbox(
609
- label="🌳 Tree Analysis",
610
- interactive=False,
611
- lines=2
612
  )
613
- summary_output = gr.Textbox(
614
- label="📋 Summary",
615
- interactive=False,
616
- lines=8
617
- )
618
- with gr.Row():
619
- aligned_file = gr.File(label="📄 Alignment File", visible=False)
620
- tree_file = gr.File(label="🌲 Tree File", visible=False)
 
621
  with gr.Tabs():
622
  with gr.TabItem("🌳 Interactive Tree"):
623
  tree_html = gr.HTML(
624
- label="Phylogenetic Tree",
625
- value="<div style='text-align: center; padding: 20px; color: #666;'>No tree generated yet.</div>"
626
  )
 
627
  with gr.TabItem("📊 Detailed Report"):
628
  report_html = gr.HTML(
629
  label="Analysis Report",
630
- value="<div style='text-align: center; padding: 20px; color: #666;'>No report generated yet.</div>"
631
  )
632
- analyze_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
633
  fn=run_pipeline,
634
- inputs=[dna_input, similarity_score, build_ml_tree],
635
  outputs=[
636
- boundary_output, keras_output, ml_tree_output,
637
- tree_analysis_output, summary_output,
638
- aligned_file, tree_file, gr.State(), gr.State(),
639
- tree_html, report_html
 
 
 
 
 
 
 
640
  ]
641
  )
 
 
642
  analyze_file_btn.click(
643
  fn=run_pipeline_from_file,
644
- inputs=[file_input, file_similarity_score, file_build_ml_tree],
645
  outputs=[
646
- boundary_output, keras_output, ml_tree_output,
647
- tree_analysis_output, summary_output,
648
- aligned_file, tree_file, gr.State(), gr.State(),
649
- tree_html, report_html
 
 
 
 
 
 
 
650
  ]
651
  )
652
- gr.Markdown("## 🔬 Example Sequences")
653
- example_sequences = [
654
- ["ATGGACTTCCAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 90.0, False],
655
- ["ATGAAACAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 85.0, True],
656
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657
  gr.Examples(
658
- examples=example_sequences,
659
- inputs=[dna_input, similarity_score, build_ml_tree],
 
 
 
 
660
  label="Click to load example sequences"
661
  )
662
- with gr.Accordion("❓ Help & Information", open=False):
663
- gr.Markdown("""
664
- ### 🧬 Gene Analysis Pipeline
665
- This tool performs comprehensive analysis of F gene sequences:
666
- **🎯 Boundary Detection**: Identifies F gene regions within your sequence
667
- **🧠 F Gene Validation**: Validates sequence as F gene using deep learning
668
- **🌲 Phylogenetic Placement**: Places sequence in reference phylogeny
669
- **🌳 Tree Analysis**: Finds similar sequences and builds interactive trees
670
- ### 📋 Input Requirements
671
- - DNA sequences in ATCG format
672
- - Minimum 10 bp for basic analysis
673
- - Minimum 100 bp for phylogenetic placement
674
- - FASTA files supported for upload
675
- ### ⚙️ Parameters
676
- - **Similarity Threshold**: Minimum % similarity for tree analysis (1-99%)
677
- - **Build ML Tree**: Enable phylogenetic placement (requires MAFFT/IQ-TREE)
678
- ### 📊 Output Files
679
- - Alignment files (.fa format)
680
- - Tree files (.treefile format)
681
- - Interactive HTML visualizations
682
- """)
 
 
 
 
 
 
 
683
  return iface
 
684
  except Exception as e:
685
  logger.error(f"Failed to create Gradio interface: {e}")
686
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
687
 
688
  # --- Application Startup ---
689
- def mount_gradio_app():
690
  try:
691
- gradio_app = create_gradio_interface()
692
- if gradio_app:
693
- app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
694
- logger.info("✅ Gradio interface mounted at /gradio")
695
- else:
696
- logger.error("❌ Failed to create Gradio interface")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  except Exception as e:
698
- logger.error(f"❌ Failed to mount Gradio app: {e}")
699
-
700
- # Initialize Gradio
701
- mount_gradio_app()
702
-
703
- # --- Main Application ---
704
- if __name__ == "__main__":
705
- import argparse
706
- parser = argparse.ArgumentParser(description="🧬 Gene Analysis Pipeline")
707
- parser.add_argument("--host", default="0.0.0.0", help="Host address")
708
- parser.add_argument("--port", type=int, default=7860, help="Port number")
709
- parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
710
- parser.add_argument("--gradio-only", action="store_true", help="Run Gradio interface only")
711
- args = parser.parse_args()
712
- if args.gradio_only:
713
- logger.info("🚀 Starting Gradio interface only...")
714
- iface = create_gradio_interface()
715
- if iface:
716
- iface.launch(
717
- server_name=args.host,
718
- server_port=args.port,
719
- share=False,
720
- show_error=True
721
- )
722
- else:
723
- logger.error("❌ Failed to create Gradio interface")
724
- sys.exit(1)
725
- else:
726
- logger.info(f"🚀 Starting Gene Analysis Pipeline on {args.host}:{args.port}")
727
- logger.info("📊 API Documentation: http://localhost:7860/docs")
728
- logger.info("🧬 Gradio Interface: http://localhost:7860/gradio")
729
- try:
730
- uvicorn.run(
731
- "app:app" if args.reload else app,
732
- host=args.host,
733
- port=args.port,
734
- reload=args.reload,
735
- log_level="info"
736
- )
737
- except KeyboardInterrupt:
738
- logger.info("🛑 Application stopped by user")
739
- except Exception as e:
740
- logger.error(f"❌ Application failed: {e}")
741
- sys.exit(1)
 
22
  import stat
23
  import time
24
  import asyncio
25
+
26
+ # FastAPI imports
27
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
28
  from fastapi.responses import HTMLResponse
29
  from pydantic import BaseModel
 
43
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
44
  log_handler = logging.StreamHandler()
45
  log_handler.setFormatter(log_formatter)
46
+
47
+ # File handler with error handling
48
  try:
49
  file_handler = logging.FileHandler('/tmp/app.log')
50
  file_handler.setFormatter(log_formatter)
51
  logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
52
  except Exception:
53
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
54
+
55
  logger = logging.getLogger(__name__)
56
 
57
  # --- Global Variables ---
58
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
59
+ MODELS_DIR = os.path.join(BASE_DIR, "models") # Local models directory
60
  MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
61
  IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
62
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
 
64
  QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
65
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
66
 
67
+ # --- Hugging Face Repositories and Data Paths ---
68
+ boundary_model_repo = "GGproject10/best_boundary_aware_model"
69
+ other_models_repo = "GGproject10/simplified_tree_AI"
70
+ csv_path = os.path.join(BASE_DIR, "f_cleaned.csv")
71
+ hf_token = os.getenv("HF_TOKEN")
72
 
73
  # Initialize models as None
74
  boundary_model = None
 
76
  kmer_to_index = None
77
  analyzer = None
78
 
79
+ # --- Enhanced Model Loading with Correct Paths ---
80
  def load_models_safely():
81
  global boundary_model, keras_model, kmer_to_index, analyzer
 
82
 
83
+ logger.info(f"🔍 Looking for models in: {MODELS_DIR}")
84
+ logger.info(f"📁 Models directory exists: {os.path.exists(MODELS_DIR)}")
85
+
86
+ if os.path.exists(MODELS_DIR):
87
+ logger.info(f"📂 Contents of models directory: {os.listdir(MODELS_DIR)}")
88
 
89
+ # Load Boundary Model - Try local first, then HF from correct repo
90
  try:
91
+ # Local model paths
92
+ local_boundary_path = os.path.join(MODELS_DIR, "best_boundary_aware_model.pth")
93
+
94
+ if os.path.exists(local_boundary_path):
95
+ logger.info(f"✅ Loading boundary model from local path: {local_boundary_path}")
96
+ boundary_model = EnhancedGenePredictor(local_boundary_path)
97
+ logger.info("✅ Boundary model loaded successfully from local directory")
98
+ elif hf_token:
99
+ logger.info("🌐 Attempting to load boundary model from Hugging Face...")
100
+ boundary_path = hf_hub_download(
101
+ repo_id=boundary_model_repo, # Correct repo for boundary model
102
+ filename="best_boundary_aware_model.pth",
103
+ token=hf_token,
104
+ cache_dir="/tmp/hf_cache"
105
+ )
106
+ if os.path.exists(boundary_path):
107
+ boundary_model = EnhancedGenePredictor(boundary_path)
108
+ logger.info("✅ Boundary model loaded successfully from HF")
109
+ else:
110
+ logger.warning("❌ Boundary model file not found after HF download")
111
  else:
112
+ logger.warning("❌ No local boundary model found and no HF_TOKEN available")
113
  except Exception as e:
114
  logger.error(f"❌ Failed to load boundary model: {e}")
115
  boundary_model = None
116
 
117
+ # Load Keras Model - Try local first, then HF from correct repo
118
  try:
119
+ # Local model paths
120
+ local_keras_path = os.path.join(MODELS_DIR, "best_model.keras")
121
+ local_kmer_path = os.path.join(MODELS_DIR, "kmer_to_index.pkl")
122
+
123
+ if os.path.exists(local_keras_path) and os.path.exists(local_kmer_path):
124
+ logger.info(f"✅ Loading Keras model from local paths:")
125
+ logger.info(f" - Keras model: {local_keras_path}")
126
+ logger.info(f" - K-mer index: {local_kmer_path}")
127
+
128
+ keras_model = load_model(local_keras_path)
129
+ with open(local_kmer_path, "rb") as f:
 
 
 
 
 
 
 
130
  kmer_to_index = pickle.load(f)
131
+ logger.info("✅ Keras model loaded successfully from local directory")
132
+
133
+ elif hf_token:
134
+ logger.info("🌐 Attempting to load Keras model from Hugging Face...")
135
+ keras_path = hf_hub_download(
136
+ repo_id=other_models_repo, # Correct repo for other models
137
+ filename="best_model.keras",
138
+ token=hf_token,
139
+ cache_dir="/tmp/hf_cache"
140
+ )
141
+ kmer_path = hf_hub_download(
142
+ repo_id=other_models_repo, # Correct repo for other models
143
+ filename="kmer_to_index.pkl",
144
+ token=hf_token,
145
+ cache_dir="/tmp/hf_cache"
146
+ )
147
+
148
+ if os.path.exists(keras_path) and os.path.exists(kmer_path):
149
+ keras_model = load_model(keras_path)
150
+ with open(kmer_path, "rb") as f:
151
+ kmer_to_index = pickle.load(f)
152
+ logger.info("✅ Keras model loaded successfully from HF")
153
+ else:
154
+ logger.warning("❌ Keras model files not found after HF download")
155
  else:
156
+ logger.warning("❌ No local Keras model found and no HF_TOKEN available")
157
  except Exception as e:
158
  logger.error(f"❌ Failed to load Keras model: {e}")
159
  keras_model = None
 
163
  try:
164
  logger.info("🌳 Initializing tree analyzer...")
165
  analyzer = PhylogeneticTreeAnalyzer()
166
+
167
+ # Try multiple CSV locations
168
+ csv_candidates = [
169
+ csv_path,
170
+ os.path.join(BASE_DIR, "f cleaned.csv"),
171
+ "f_cleaned.csv",
172
+ os.path.join(BASE_DIR, "data", "f_cleaned.csv"),
173
+ os.path.join(MODELS_DIR, "f_cleaned.csv") # Also check models directory
174
+ ]
175
+
176
+ csv_loaded = False
177
+ for csv_candidate in csv_candidates:
178
+ if os.path.exists(csv_candidate):
179
+ try:
180
+ logger.info(f"📊 Trying to load CSV from: {csv_candidate}")
181
+ if analyzer.load_data(csv_candidate):
182
+ logger.info(f"✅ Tree analyzer loaded CSV from: {csv_candidate}")
183
+ csv_loaded = True
184
+ break
185
+ except Exception as e:
186
+ logger.warning(f"Failed to load CSV from {csv_candidate}: {e}")
187
+ continue
188
+
189
+ if not csv_loaded:
190
+ logger.error("❌ Failed to load CSV data from any location")
191
+ logger.info("📂 Available files in base directory:")
192
+ try:
193
+ for file in os.listdir(BASE_DIR):
194
+ if file.endswith('.csv'):
195
+ logger.info(f" - {file}")
196
+ except:
197
+ pass
198
  analyzer = None
199
  except Exception as e:
200
  logger.error(f"❌ Failed to initialize tree analyzer: {e}")
 
215
 
216
  def check_tool_availability():
217
  setup_binary_permissions()
218
+
219
+ # Check MAFFT
220
  mafft_available = False
221
  mafft_cmd = None
222
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
223
+
224
  for candidate in mafft_candidates:
225
  if shutil.which(candidate) or os.path.exists(candidate):
226
  try:
 
237
  break
238
  except Exception as e:
239
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
240
+
241
+ # Check IQ-TREE
242
  iqtree_available = False
243
  iqtree_cmd = None
244
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
245
+
246
  for candidate in iqtree_candidates:
247
  if shutil.which(candidate) or os.path.exists(candidate):
248
  try:
 
259
  break
260
  except Exception as e:
261
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
262
+
263
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
264
 
265
+ # --- Pipeline Functions ---
266
def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
    """Add a query sequence to the reference alignment and place it on the reference tree.

    The query is written as a single-record FASTA file under QUERY_OUTPUT_DIR,
    added to ALIGNMENT_PATH with ``mafft --add ... --reorder``, and the combined
    alignment is then passed to IQ-TREE with TREE_PATH supplied through ``-g``
    and the ``GTR+G`` model.

    Args:
        sequence: Query nucleotide sequence. All whitespace (including line
            breaks inside a pasted sequence) is removed and the result is
            upper-cased before validation and before it is written to disk.
        mafft_cmd: Executable name or path used to invoke MAFFT.
        iqtree_cmd: Executable name or path used to invoke IQ-TREE.

    Returns:
        A 4-tuple ``(success, message, aligned_path, tree_path)``. On failure
        ``success`` is False and ``tree_path`` is None; ``aligned_path`` is
        only populated on failure when the alignment was produced but IQ-TREE
        exited without writing a tree file.
    """
    try:
        # Count real bases only: whitespace must neither help a sequence pass
        # the length check nor end up inside the FASTA record.
        cleaned = "".join(sequence.split()).upper()
        if len(cleaned) < 100:
            return False, "Sequence too short (<100 bp).", None, None

        query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
        query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
        aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
        output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")

        if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
            return False, "Reference alignment or tree not found.", None, None

        query_record = SeqRecord(Seq(cleaned), id=query_id, description="")
        SeqIO.write([query_record], query_fasta, "fasta")

        # MAFFT prints the alignment on stdout, so stdout goes straight to the
        # output file while stderr is captured for error reporting.
        with open(aligned_with_query, "w") as output_file:
            subprocess.run(
                [mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH],
                stdout=output_file,
                stderr=subprocess.PIPE,
                text=True,
                timeout=600,
                check=True,
            )

        if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
            return False, "MAFFT alignment failed.", None, None

        subprocess.run(
            [
                iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
                "-m", "GTR+G", "-pre", output_prefix, "-redo",
            ],
            capture_output=True,
            text=True,
            timeout=1200,
            check=True,
        )

        treefile = f"{output_prefix}.treefile"
        if not os.path.exists(treefile):
            return False, "IQ-TREE placement failed.", aligned_with_query, None

        success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
        return True, success_msg, aligned_with_query, treefile

    except subprocess.CalledProcessError as e:
        # Surface the tool's own stderr (tail only) instead of the generic
        # "returned non-zero exit status" text.
        cmd = e.cmd if isinstance(e.cmd, (list, tuple)) else [e.cmd]
        tool = os.path.basename(str(cmd[0]))
        stderr_text = (e.stderr or "").strip()
        detail = stderr_text[-500:] if stderr_text else "no error output"
        logger.error(f"Phylogenetic placement failed: {tool} exited with code {e.returncode}: {detail}")
        return False, f"Error: {tool} exited with code {e.returncode}: {detail}", None, None
    except subprocess.TimeoutExpired as e:
        cmd = e.cmd if isinstance(e.cmd, (list, tuple)) else [e.cmd]
        tool = os.path.basename(str(cmd[0]))
        logger.error(f"Phylogenetic placement failed: {tool} timed out after {e.timeout} seconds")
        return False, f"Error: {tool} timed out after {e.timeout} seconds", None, None
    except Exception as e:
        logger.error(f"Phylogenetic placement failed: {e}")
        return False, f"Error: {str(e)}", None, None
 
313
  try:
314
  if not keras_model or not kmer_to_index:
315
  return "❌ Keras model not available."
316
+
317
  if len(sequence) < 6:
318
  return "❌ Sequence too short (<6 bp)."
319
+
320
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
321
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
322
  input_arr = np.array([indices])
323
+
324
  prediction = keras_model.predict(input_arr, verbose=0)[0]
325
  f_gene_prob = prediction[-1]
326
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
327
+
328
  return f"✅ {percentage}% F gene confidence"
329
  except Exception as e:
330
  logger.error(f"Keras prediction failed: {e}")
 
335
  dna_input = dna_input.upper().strip()
336
  if not dna_input:
337
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
338
+
339
+ # Clean sequence
340
  if not re.match('^[ACTGN]+$', dna_input):
341
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
342
+
343
  processed_sequence = dna_input
344
+
345
+ # Boundary prediction
346
  boundary_output = ""
347
  if boundary_model:
348
  try:
 
359
  processed_sequence = dna_input
360
  else:
361
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
362
+
363
+ # Keras prediction
364
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
365
+
366
+ # ML Tree (keeping your original logic)
367
  aligned_file = None
368
  phy_file = None
369
  ml_tree_output = ""
370
+
371
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
372
  try:
373
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
 
384
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
385
  else:
386
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
387
+
388
+ # Tree analysis
389
  tree_html_content = "No tree generated."
390
  report_html_content = "No report generated."
391
  simplified_ml_output = ""
392
+
393
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
394
  try:
395
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
396
  simplified_ml_output = tree_result
397
+
398
  if tree_html_path and os.path.exists(tree_html_path):
399
  with open(tree_html_path, 'r', encoding='utf-8') as f:
400
  tree_html_content = f.read()
401
  else:
402
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
403
+
404
  if report_html_path and os.path.exists(report_html_path):
405
  with open(report_html_path, 'r', encoding='utf-8') as f:
406
  report_html_content = f.read()
407
  else:
408
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
409
+
410
  except Exception as e:
411
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
412
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
 
415
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
416
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
417
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
418
+
419
+ # Summary
420
  summary_output = f"""
421
  📊 ANALYSIS SUMMARY:
422
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
427
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
428
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
429
  """
430
+
431
  return (
432
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
433
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
434
  )
435
+
436
  except Exception as e:
437
  logger.error(f"Pipeline error: {e}")
438
  error_msg = f"❌ Pipeline Error: {str(e)}"
439
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
440
 
441
+ # --- Tree Analysis Helpers (analyze_sequence_for_tree, build_maximum_likelihood_tree, etc.) ---
442
def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
    """Run the similarity-based tree analysis for one query sequence.

    Uses the module-level ``analyzer`` to register the query, look up similar
    sequences at the requested threshold, build the tree structure, and write
    the interactive tree as HTML under ``/tmp``.

    Args:
        sequence: Query sequence; must be at least 10 characters once stripped.
        matching_percentage: Similarity threshold; must satisfy 1 <= value <= 99.

    Returns:
        A 3-tuple ``(message, tree_html_path, report_html_path)``. Both paths
        are None on any failure; ``report_html_path`` is also None when the
        analyzer reports that the detailed report was not generated.
    """
    try:
        if not analyzer:
            return "❌ Tree analyzer not initialized.", None, None

        has_enough_bases = bool(sequence) and len(sequence.strip()) >= 10
        if not has_enough_bases:
            return "❌ Invalid sequence.", None, None

        if not (1 <= matching_percentage <= 99):
            return "❌ Matching percentage must be 1-99.", None, None

        accepted = analyzer.find_query_sequence(sequence)
        if not accepted:
            return "❌ Sequence not accepted.", None, None

        hits, achieved_pct = analyzer.find_similar_sequences(matching_percentage)
        if not hits:
            return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None

        # The analyzer is stateful, so the call order below is kept as-is.
        analyzer.build_tree_structure_with_ml_safe(hits)
        figure = analyzer.create_interactive_tree(hits, achieved_pct)

        tag = analyzer.query_id or f"query_{int(time.time())}"
        tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{tag}.html')
        figure.write_html(tree_html_path)

        analyzer.matching_percentage = matching_percentage
        if analyzer.generate_detailed_report(hits, achieved_pct):
            # NOTE(review): assumes the analyzer writes its report to this
            # exact path - confirm against generate_detailed_report.
            report_html_path = os.path.join("/tmp", f"detailed_report_{tag}.html")
        else:
            report_html_path = None

        return f"✅ Found {len(hits)} sequences at {achieved_pct:.2f}% similarity.", tree_html_path, report_html_path

    except Exception as e:
        logger.error(f"Tree analysis failed: {e}")
        return f"❌ Error: {str(e)}", None, None
 
478
  try:
479
  if file_obj is None:
480
  return ""
481
+
482
  if isinstance(file_obj, str):
483
  with open(file_obj, "r") as f:
484
  content = f.read()
485
  else:
486
  content = file_obj.read().decode("utf-8")
487
+
488
  lines = content.strip().split("\n")
489
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
490
  return ''.join(seq_lines)
491
+
492
  except Exception as e:
493
  logger.error(f"Failed to read FASTA file: {e}")
494
  return ""
 
549
  },
550
  "paths": {
551
  "base_dir": BASE_DIR,
552
+ "models_dir": MODELS_DIR,
553
+ "models_dir_exists": os.path.exists(MODELS_DIR),
554
+ "csv_path": csv_path,
555
+ "csv_exists": os.path.exists(csv_path)
556
+ },
557
+ "model_repos": {
558
+ "boundary_model": boundary_model_repo,
559
+ "other_models": other_models_repo
560
  },
561
  "recommendations": {
562
+ "models": "Models loaded from local directory" if (boundary_model and keras_model) else "Check models directory",
563
  "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
564
  }
565
  }
 
599
  content = await file.read()
600
  temp_file.write(content)
601
  temp_file_path = temp_file.name
602
+
603
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
604
+
605
  return AnalysisResponse(
606
  boundary_output=result[0] or "",
607
  keras_output=result[1] or "",
 
624
  except:
625
  pass
626
 
627
+ # --- Fixed Gradio Interface ---
628
  def create_gradio_interface():
629
  try:
630
  with gr.Blocks(
 
638
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
639
  """
640
  ) as iface:
641
+
642
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
643
+
644
+ # Status display
645
  with gr.Row():
646
  with gr.Column():
647
  status_display = gr.HTML(value=f"""
 
649
  <h3>🔧 System Status</h3>
650
  <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
651
  <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
652
+ <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}
653
+ <p>🔬 MAFFT/IQ-TREE: {'✅ Available' if check_tool_availability()[0] and check_tool_availability()[1] else '❌ Missing'}</p>
 
654
  """)
655
+
656
+ # Input tabs
657
+ with gr.Tabs():
658
  with gr.TabItem("📝 Text Input"):
659
+ dna_input = gr.Textbox(
660
+ label="🧬 DNA Sequence",
661
+ placeholder="Enter DNA sequence (ATCG format)...",
662
+ lines=5,
663
+ max_lines=10
664
+ )
665
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  with gr.TabItem("📁 File Upload"):
667
+ fasta_file = gr.File(
668
+ label="📄 Upload FASTA File",
669
+ file_types=[".fasta", ".fa", ".txt"],
670
+ file_count="single"
671
+ )
672
+
673
+ # Analysis options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
  with gr.Row():
675
  with gr.Column():
676
+ similarity_slider = gr.Slider(
677
+ minimum=1,
678
+ maximum=99,
679
+ value=95,
680
+ step=1,
681
+ label="🎯 Similarity Threshold (%)",
682
+ info="Minimum similarity for phylogenetic analysis"
683
+ )
684
+
685
+ with gr.Column():
686
+ ml_tree_checkbox = gr.Checkbox(
687
+ label="🌲 Build ML Tree",
688
+ value=False,
689
+ info="Perform phylogenetic placement (slower)"
690
+ )
691
+
692
+ # Action buttons
693
+ with gr.Row():
694
+ analyze_text_btn = gr.Button("🔍 Analyze Text", variant="primary", size="lg")
695
+ analyze_file_btn = gr.Button("📁 Analyze File", variant="secondary", size="lg")
696
+ clear_btn = gr.Button("🗑️ Clear", variant="stop")
697
+
698
+ # Results section
699
+ gr.Markdown("## 📊 Analysis Results")
700
+
701
+ with gr.Tabs():
702
+ with gr.TabItem("🎯 Boundary Prediction"):
703
  boundary_output = gr.Textbox(
704
+ label="🔍 F Gene Boundary Detection",
705
+ lines=3,
706
+ interactive=False
707
  )
708
+
709
+ with gr.TabItem("🧠 Keras Validation"):
710
  keras_output = gr.Textbox(
711
+ label="🤖 Neural Network Validation",
712
+ lines=3,
713
+ interactive=False
714
  )
715
+
716
+ with gr.TabItem("🌲 ML Tree Placement"):
717
  ml_tree_output = gr.Textbox(
718
+ label="🌳 Maximum Likelihood Tree",
719
+ lines=5,
720
+ interactive=False
721
  )
722
+
723
+ with gr.TabItem("📈 Tree Analysis"):
724
  tree_analysis_output = gr.Textbox(
725
+ label="📊 Phylogenetic Analysis",
726
+ lines=5,
727
+ interactive=False
728
  )
729
+
730
+ with gr.TabItem("📋 Summary"):
731
+ summary_output = gr.Textbox(
732
+ label="📝 Analysis Summary",
733
+ lines=10,
734
+ interactive=False
735
+ )
736
+
737
+ # Visualization section
738
  with gr.Tabs():
739
  with gr.TabItem("🌳 Interactive Tree"):
740
  tree_html = gr.HTML(
741
+ label="Phylogenetic Tree Visualization",
742
+ value="<div style='text-align: center; padding: 20px; color: #666;'>Tree visualization will appear here after analysis</div>"
743
  )
744
+
745
  with gr.TabItem("📊 Detailed Report"):
746
  report_html = gr.HTML(
747
  label="Analysis Report",
748
+ value="<div style='text-align: center; padding: 20px; color: #666;'>Detailed report will appear here after analysis</div>"
749
  )
750
+
751
+ # File downloads
752
+ gr.Markdown("## 📥 Download Results")
753
+ with gr.Row():
754
+ aligned_file = gr.File(
755
+ label="📄 Aligned Sequences",
756
+ interactive=False
757
+ )
758
+ tree_file = gr.File(
759
+ label="🌳 Tree File",
760
+ interactive=False
761
+ )
762
+
763
+ # Event handlers
764
+ def clear_all():
765
+ return (
766
+ "", # dna_input
767
+ None, # fasta_file
768
+ "", # boundary_output
769
+ "", # keras_output
770
+ "", # ml_tree_output
771
+ "", # tree_analysis_output
772
+ "", # summary_output
773
+ "<div style='text-align: center; padding: 20px; color: #666;'>Tree visualization will appear here after analysis</div>", # tree_html
774
+ "<div style='text-align: center; padding: 20px; color: #666;'>Detailed report will appear here after analysis</div>", # report_html
775
+ None, # aligned_file
776
+ None # tree_file
777
+ )
778
+
779
+ # Text analysis
780
+ analyze_text_btn.click(
781
  fn=run_pipeline,
782
+ inputs=[dna_input, similarity_slider, ml_tree_checkbox],
783
  outputs=[
784
+ boundary_output,
785
+ keras_output,
786
+ ml_tree_output,
787
+ tree_analysis_output,
788
+ summary_output,
789
+ aligned_file,
790
+ tree_file,
791
+ gr.State(), # placeholder for additional outputs
792
+ gr.State(), # placeholder for additional outputs
793
+ tree_html,
794
+ report_html
795
  ]
796
  )
797
+
798
+ # File analysis
799
  analyze_file_btn.click(
800
  fn=run_pipeline_from_file,
801
+ inputs=[fasta_file, similarity_slider, ml_tree_checkbox],
802
  outputs=[
803
+ boundary_output,
804
+ keras_output,
805
+ ml_tree_output,
806
+ tree_analysis_output,
807
+ summary_output,
808
+ aligned_file,
809
+ tree_file,
810
+ gr.State(), # placeholder for additional outputs
811
+ gr.State(), # placeholder for additional outputs
812
+ tree_html,
813
+ report_html
814
  ]
815
  )
816
+
817
+ # Clear button
818
+ clear_btn.click(
819
+ fn=clear_all,
820
+ outputs=[
821
+ dna_input,
822
+ fasta_file,
823
+ boundary_output,
824
+ keras_output,
825
+ ml_tree_output,
826
+ tree_analysis_output,
827
+ summary_output,
828
+ tree_html,
829
+ report_html,
830
+ aligned_file,
831
+ tree_file
832
+ ]
833
+ )
834
+
835
+ # Examples
836
+ gr.Markdown("## 🧪 Example Sequences")
837
  gr.Examples(
838
+ examples=[
839
+ ["ATGAAACTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAGTGCAGCAGCAGCAGCAGCAGCAGCAGCAGC", 95.0, False],
840
+ ["ATGAAACTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAGTGCAGCAGCAGCAGCAGCAGCAGCAGCAGC", 85.0, True],
841
+ ["ATGGAGCTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAG", 90.0, False]
842
+ ],
843
+ inputs=[dna_input, similarity_slider, ml_tree_checkbox],
844
  label="Click to load example sequences"
845
  )
846
+
847
+ # Footer
848
+ gr.Markdown("""
849
+ ---
850
+
851
+ ### 🔬 About This Pipeline
852
+
853
+ This tool performs comprehensive analysis of DNA sequences using multiple approaches:
854
+
855
+ - **🎯 Boundary Detection**: Identifies F gene regions using ML models
856
+ - **🧠 Keras Validation**: Neural network-based sequence validation
857
+ - **🌲 ML Tree Placement**: Phylogenetic placement using MAFFT + IQ-TREE
858
+ - **📈 Tree Analysis**: Interactive phylogenetic analysis and visualization
859
+
860
+ ### 📝 Usage Notes
861
+
862
+ - Sequences should be in ATCG format (other characters will be converted to N)
863
+ - Minimum 100 bp recommended for phylogenetic placement
864
+ - Higher similarity thresholds = fewer but more similar sequences
865
+ - ML tree building requires MAFFT and IQ-TREE (slower but more accurate)
866
+
867
+ ### ⚠️ System Requirements
868
+
869
+ - Python packages: gradio, torch, tensorflow, biopython, plotly
870
+ - Bioinformatics tools: MAFFT, IQ-TREE (optional for ML placement)
871
+ - Pre-trained models: boundary detection + keras validation models
872
+ """)
873
+
874
  return iface
875
+
876
  except Exception as e:
877
  logger.error(f"Failed to create Gradio interface: {e}")
878
+ # Fallback simple interface
879
+ with gr.Blocks() as fallback_iface:
880
+ gr.Markdown("# 🧬 Gene Analysis Pipeline (Fallback Mode)")
881
+ gr.Markdown(f"⚠️ Error creating full interface: {str(e)}")
882
+
883
+ dna_input = gr.Textbox(label="DNA Sequence", lines=5)
884
+ analyze_btn = gr.Button("Analyze")
885
+ output = gr.Textbox(label="Results", lines=10)
886
+
887
+ analyze_btn.click(
888
+ fn=lambda seq: run_pipeline(seq, 95.0, False)[4], # Just return summary
889
+ inputs=[dna_input],
890
+ outputs=[output]
891
+ )
892
+
893
+ return fallback_iface
894
 
895
  # --- Application Startup ---
896
if __name__ == "__main__":
    # Script entry point: build the Gradio UI, mount it on the FastAPI app
    # under /gradio, log a short status summary, then serve on port 7860.
    try:
        gr_interface = create_gradio_interface()
        gr_app = gr.mount_gradio_app(app, gr_interface, path="/gradio")

        # Startup diagnostics: model, analyzer and external-tool status.
        logger.info("🚀 Starting Gene Analysis Pipeline...")
        logger.info(f"📁 Base directory: {BASE_DIR}")
        logger.info(f"🤖 Models loaded: Boundary={boundary_model is not None}, Keras={keras_model is not None}")
        logger.info(f"🌳 Tree analyzer: {analyzer is not None}")

        mafft_available, iqtree_available, _, _ = check_tool_availability()
        logger.info(f"🔬 Tools available: MAFFT={mafft_available}, IQ-TREE={iqtree_available}")

        # Advertise the URLs that will be served once uvicorn is running.
        logger.info("🌐 Starting server on http://0.0.0.0:7860")
        logger.info("📊 FastAPI docs: http://0.0.0.0:7860/docs")
        logger.info("🎮 Gradio interface: http://0.0.0.0:7860/gradio")

        uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info", access_log=True)

    except Exception as e:
        # Report through both the logger and stdout, then exit non-zero.
        logger.error(f"❌ Startup failed: {e}")
        print(f"❌ Failed to start application: {e}")
        sys.exit(1)