re-type commited on
Commit
956abb6
·
verified ·
1 Parent(s): 4d41dd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +461 -334
app.py CHANGED
@@ -1,14 +1,9 @@
1
- import os
2
- # Disable GPU to avoid CUDA errors
3
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
4
- # Suppress TensorFlow warnings
5
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
6
-
7
  import gradio as gr
8
  import torch
9
  import pickle
10
  import subprocess
11
  import pandas as pd
 
12
  import re
13
  import logging
14
  import numpy as np
@@ -27,16 +22,14 @@ from Bio.SeqRecord import SeqRecord
27
  import stat
28
  import time
29
  import asyncio
 
 
30
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
31
  from fastapi.responses import HTMLResponse
32
  from pydantic import BaseModel
33
  from typing import Optional
34
  import uvicorn
35
 
36
- # Log Gradio version
37
- logger = logging.getLogger(__name__)
38
- logger.info(f"Gradio version: {gr.__version__}")
39
-
40
  # Set event loop policy for Spaces
41
  try:
42
  asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@@ -50,6 +43,8 @@ app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
50
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
51
  log_handler = logging.StreamHandler()
52
  log_handler.setFormatter(log_formatter)
 
 
53
  try:
54
  file_handler = logging.FileHandler('/tmp/app.log')
55
  file_handler.setFormatter(log_formatter)
@@ -57,18 +52,23 @@ try:
57
  except Exception:
58
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
59
 
 
 
60
  # --- Global Variables ---
61
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
62
- MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
63
- IQTREE_PATH = os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
 
64
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
65
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
66
- QUERY_OUTPUT_DIR = os.path.join(BASE_DIR, "queries")
67
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
68
 
69
- # --- Model Configuration ---
70
- MODEL_REPO = "GGproject10/best_boundary_aware_model"
71
- CSV_PATH = "f cleaned.csv"
 
 
72
 
73
  # Initialize models as None
74
  boundary_model = None
@@ -76,85 +76,128 @@ keras_model = None
76
  kmer_to_index = None
77
  analyzer = None
78
 
79
- # --- Model Loading ---
80
  def load_models_safely():
81
  global boundary_model, keras_model, kmer_to_index, analyzer
82
- logger.info("🔍 Loading models...")
83
-
84
- # Load Boundary Model
 
 
 
 
 
85
  try:
86
- boundary_path = hf_hub_download(
87
- repo_id=MODEL_REPO,
88
- filename="best_boundary_aware_model.pth",
89
- token=None
90
- )
91
- if os.path.exists(boundary_path):
92
- boundary_model = EnhancedGenePredictor(boundary_path)
93
- logger.info("✅ Boundary model loaded successfully from Hugging Face Hub.")
 
 
 
 
 
 
 
 
 
 
 
 
94
  else:
95
- logger.error(f"❌ Boundary model file not found after download from {MODEL_REPO}")
96
  except Exception as e:
97
- logger.error(f"❌ Failed to load boundary model from HF Hub: {e}. Ensure {MODEL_REPO} is public and accessible.")
 
98
 
99
- # Load Keras Model
100
  try:
101
- keras_path = hf_hub_download(
102
- repo_id=MODEL_REPO,
103
- filename="best_model.keras",
104
- token=None
105
- )
106
- kmer_path = hf_hub_download(
107
- repo_id=MODEL_REPO,
108
- filename="kmer_to_index.pkl",
109
- token=None
110
- )
111
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
112
- keras_model = load_model(keras_path)
113
- with open(kmer_path, "rb") as f:
114
  kmer_to_index = pickle.load(f)
115
- logger.info("✅ Keras model and k-mer index loaded successfully from Hugging Face Hub.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  else:
117
- logger.error(f"❌ Keras model or kmer files not found after download from {MODEL_REPO}")
118
  except Exception as e:
119
- logger.error(f"❌ Failed to load Keras model from HF Hub: {e}. Ensure {MODEL_REPO} is public and accessible.")
 
 
120
 
121
  # Initialize Tree Analyzer
122
  try:
123
  logger.info("🌳 Initializing tree analyzer...")
124
  analyzer = PhylogeneticTreeAnalyzer()
 
 
125
  csv_candidates = [
126
- CSV_PATH,
127
- os.path.join(BASE_DIR, CSV_PATH),
128
- os.path.join(BASE_DIR, "app", CSV_PATH),
129
- os.path.join(os.path.dirname(__file__), CSV_PATH),
130
  "f_cleaned.csv",
131
- os.path.join(BASE_DIR, "f_cleaned.csv")
 
132
  ]
 
133
  csv_loaded = False
134
  for csv_candidate in csv_candidates:
135
  if os.path.exists(csv_candidate):
136
- logger.info(f"📊 Trying CSV: {csv_candidate}")
137
  try:
 
138
  if analyzer.load_data(csv_candidate):
139
- logger.info(f"✅ CSV loaded from: {csv_candidate}")
140
  csv_loaded = True
141
  break
142
  except Exception as e:
143
- logger.warning(f"CSV load failed for {csv_candidate}: {e}")
144
  continue
 
145
  if not csv_loaded:
146
- logger.error("❌ Failed to load CSV data from any candidate location. Place 'f cleaned.csv' in project root.")
147
- analyzer = None
148
- else:
149
  try:
150
- if analyzer.train_ai_model():
151
- logger.info("✅ AI model training completed successfully")
152
- else:
153
- logger.warning("⚠️ AI model training failed; proceeding with basic analysis.")
154
- except Exception as e:
155
- logger.warning(f"⚠️ AI model training failed: {e}")
156
  except Exception as e:
157
- logger.error(f"❌ Tree analyzer initialization failed: {e}")
158
  analyzer = None
159
 
160
  # Load models at startup
@@ -172,16 +215,19 @@ def setup_binary_permissions():
172
 
173
  def check_tool_availability():
174
  setup_binary_permissions()
 
 
175
  mafft_available = False
176
  mafft_cmd = None
177
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
 
178
  for candidate in mafft_candidates:
179
  if shutil.which(candidate) or os.path.exists(candidate):
180
  try:
181
  result = subprocess.run(
182
- [candidate, "--help"],
183
- capture_output=True,
184
- text=True,
185
  timeout=5
186
  )
187
  if result.returncode == 0 or "mafft" in result.stderr.lower():
@@ -191,16 +237,19 @@ def check_tool_availability():
191
  break
192
  except Exception as e:
193
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
 
 
194
  iqtree_available = False
195
  iqtree_cmd = None
196
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
 
197
  for candidate in iqtree_candidates:
198
  if shutil.which(candidate) or os.path.exists(candidate):
199
  try:
200
  result = subprocess.run(
201
- [candidate, "--help"],
202
- capture_output=True,
203
- text=True,
204
  timeout=5
205
  )
206
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
@@ -210,36 +259,46 @@ def check_tool_availability():
210
  break
211
  except Exception as e:
212
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
 
213
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
214
 
215
- # --- Pipeline Functions ---
216
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
217
  try:
218
  if len(sequence.strip()) < 100:
219
  return False, "Sequence too short (<100 bp).", None, None
 
220
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
221
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
222
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
223
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
 
224
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
225
  return False, "Reference alignment or tree not found.", None, None
 
226
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
227
  SeqIO.write([query_record], query_fasta, "fasta")
 
228
  with open(aligned_with_query, "w") as output_file:
229
  subprocess.run([
230
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
231
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
 
232
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
233
  return False, "MAFFT alignment failed.", None, None
 
234
  subprocess.run([
235
- iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
236
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
237
  ], capture_output=True, text=True, timeout=1200, check=True)
 
238
  treefile = f"{output_prefix}.treefile"
239
  if not os.path.exists(treefile):
240
  return False, "IQ-TREE placement failed.", aligned_with_query, None
 
241
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
242
  return True, success_msg, aligned_with_query, treefile
 
243
  except Exception as e:
244
  logger.error(f"Phylogenetic placement failed: {e}")
245
  return False, f"Error: {str(e)}", None, None
@@ -250,73 +309,40 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
250
  except:
251
  pass
252
 
253
- def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
254
- try:
255
- if not analyzer:
256
- return "❌ Tree analyzer not initialized.", None, None
257
- if not sequence or len(sequence.strip()) < 10:
258
- return "❌ Invalid sequence.", None, None
259
- if not (1 <= matching_percentage <= 99):
260
- return "❌ Matching percentage must be 1-99.", None, None
261
- if not analyzer.find_query_sequence(sequence):
262
- return "❌ Sequence not accepted.", None, None
263
- matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
264
- if not matched_ids:
265
- return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
266
- analyzer.build_tree_structure_with_ml_safe(matched_ids)
267
- fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
268
- query_id = analyzer.query_id or f"query_{int(time.time())}"
269
- tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
270
- fig.write_html(tree_html_path)
271
- analyzer.matching_percentage = matching_percentage
272
- report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
273
- report_html_path = os.path.join("/tmp", f'detailed_report_{query_id}.html') if report_success else None
274
- return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
275
- except Exception as e:
276
- logger.error(f"Tree analysis failed: {e}")
277
- return f"❌ Error: {str(e)}", None, None
278
-
279
  def predict_with_keras(sequence):
280
  try:
281
  if not keras_model or not kmer_to_index:
282
  return "❌ Keras model not available."
 
283
  if len(sequence) < 6:
284
  return "❌ Sequence too short (<6 bp)."
 
285
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
286
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
287
  input_arr = np.array([indices])
 
288
  prediction = keras_model.predict(input_arr, verbose=0)[0]
289
  f_gene_prob = prediction[-1]
290
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
 
291
  return f"✅ {percentage}% F gene confidence"
292
  except Exception as e:
293
  logger.error(f"Keras prediction failed: {e}")
294
  return f"❌ Error: {str(e)}"
295
 
296
- def read_fasta_file(file_obj):
297
- try:
298
- if file_obj is None:
299
- return ""
300
- if isinstance(file_obj, str):
301
- with open(file_obj, "r") as f:
302
- content = f.read()
303
- else:
304
- content = file_obj.read().decode("utf-8")
305
- lines = content.strip().split("\n")
306
- seq_lines = [line.strip() for line in lines if not line.startswith(">")]
307
- return ''.join(seq_lines)
308
- except Exception as e:
309
- logger.error(f"Failed to read FASTA file: {e}")
310
- return ""
311
-
312
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
313
  try:
314
  dna_input = dna_input.upper().strip()
315
  if not dna_input:
316
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
 
 
317
  if not re.match('^[ACTGN]+$', dna_input):
318
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
 
319
  processed_sequence = dna_input
 
 
320
  boundary_output = ""
321
  if boundary_model:
322
  try:
@@ -333,10 +359,15 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
333
  processed_sequence = dna_input
334
  else:
335
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
 
 
336
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
 
 
337
  aligned_file = None
338
  phy_file = None
339
  ml_tree_output = ""
 
340
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
341
  try:
342
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
@@ -353,23 +384,29 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
353
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
354
  else:
355
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
 
 
356
  tree_html_content = "No tree generated."
357
  report_html_content = "No report generated."
358
  simplified_ml_output = ""
 
359
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
360
  try:
361
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
362
  simplified_ml_output = tree_result
 
363
  if tree_html_path and os.path.exists(tree_html_path):
364
  with open(tree_html_path, 'r', encoding='utf-8') as f:
365
  tree_html_content = f.read()
366
  else:
367
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
368
  if report_html_path and os.path.exists(report_html_path):
369
  with open(report_html_path, 'r', encoding='utf-8') as f:
370
  report_html_content = f.read()
371
  else:
372
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
373
  except Exception as e:
374
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
375
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
@@ -378,6 +415,8 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
378
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
379
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
380
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
 
 
381
  summary_output = f"""
382
  📊 ANALYSIS SUMMARY:
383
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -388,15 +427,72 @@ Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skippe
388
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
389
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
390
  """
 
391
  return (
392
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
393
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
394
  )
 
395
  except Exception as e:
396
  logger.error(f"Pipeline error: {e}")
397
  error_msg = f"❌ Pipeline Error: {str(e)}"
398
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
401
  try:
402
  dna_input = read_fasta_file(fasta_file_obj)
@@ -453,7 +549,18 @@ async def health_check():
453
  },
454
  "paths": {
455
  "base_dir": BASE_DIR,
456
- "query_output_dir": QUERY_OUTPUT_DIR
 
 
 
 
 
 
 
 
 
 
 
457
  }
458
  }
459
  except Exception as e:
@@ -475,15 +582,15 @@ async def analyze_sequence(request: AnalysisRequest):
475
  except Exception as e:
476
  logger.error(f"Analyze error: {e}")
477
  return AnalysisResponse(
478
- boundary_output="", keras_output="", ml_tree_output="",
479
  tree_analysis_output="", summary_output="",
480
  success=False, error_message=str(e)
481
  )
482
 
483
  @app.post("/analyze-file")
484
  async def analyze_file(
485
- file: UploadFile = File(...),
486
- similarity_score: float = Form(95.0),
487
  build_ml_tree: bool = Form(False)
488
  ):
489
  temp_file_path = None
@@ -492,7 +599,9 @@ async def analyze_file(
492
  content = await file.read()
493
  temp_file.write(content)
494
  temp_file_path = temp_file.name
 
495
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
 
496
  return AnalysisResponse(
497
  boundary_output=result[0] or "",
498
  keras_output=result[1] or "",
@@ -504,7 +613,7 @@ async def analyze_file(
504
  except Exception as e:
505
  logger.error(f"Analyze-file error: {e}")
506
  return AnalysisResponse(
507
- boundary_output="", keras_output="", ml_tree_output="",
508
  tree_analysis_output="", summary_output="",
509
  success=False, error_message=str(e)
510
  )
@@ -515,7 +624,7 @@ async def analyze_file(
515
  except:
516
  pass
517
 
518
- # --- Gradio Interface ---
519
  def create_gradio_interface():
520
  try:
521
  with gr.Blocks(
@@ -529,7 +638,10 @@ def create_gradio_interface():
529
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
530
  """
531
  ) as iface:
 
532
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
 
 
533
  with gr.Row():
534
  with gr.Column():
535
  status_display = gr.HTML(value=f"""
@@ -537,266 +649,281 @@ def create_gradio_interface():
537
  <h3>🔧 System Status</h3>
538
  <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
539
  <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
540
- <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}</p>
541
- <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
542
- <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
543
- </div>
544
  """)
 
 
545
  with gr.Tabs():
546
  with gr.TabItem("📝 Text Input"):
547
- with gr.Row():
548
- with gr.Column(scale=2):
549
- dna_input = gr.Textbox(
550
- label="🧬 DNA Sequence",
551
- placeholder="Enter DNA sequence (ATCG format)...",
552
- lines=5,
553
- info="Paste your DNA sequence here"
554
- )
555
- with gr.Column(scale=1):
556
- similarity_score = gr.Slider(
557
- minimum=1,
558
- maximum=99,
559
- value=95.0,
560
- step=1.0,
561
- label="🎯 Similarity Threshold (%)",
562
- info="Minimum similarity for tree analysis"
563
- )
564
- build_ml_tree = gr.Checkbox(
565
- label="🌲 Build ML Tree",
566
- value=False,
567
- info="Generate phylogenetic placement (slower)"
568
- )
569
- analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
570
  with gr.TabItem("📁 File Upload"):
571
- with gr.Row():
572
- with gr.Column(scale=2):
573
- file_input = gr.File(
574
- label="📄 Upload FASTA File",
575
- file_types=[".fasta", ".fa", ".fas", ".txt"],
576
- info="Upload a FASTA file containing your sequence"
577
- )
578
- with gr.Column(scale=1):
579
- file_similarity_score = gr.Slider(
580
- minimum=1,
581
- maximum=99,
582
- value=95.0,
583
- step=1.0,
584
- label="🎯 Similarity Threshold (%)",
585
- info="Minimum similarity for tree analysis"
586
- )
587
- file_build_ml_tree = gr.Checkbox(
588
- label="🌲 Build ML Tree",
589
- value=False,
590
- info="Generate phylogenetic placement (slower)"
591
- )
592
- analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
593
- gr.Markdown("## 📊 Analysis Results")
594
  with gr.Row():
595
  with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  boundary_output = gr.Textbox(
597
- label="🎯 Boundary Detection",
598
- interactive=False,
599
- lines=2
600
  )
 
 
601
  keras_output = gr.Textbox(
602
- label="🧠 F Gene Validation",
603
- interactive=False,
604
- lines=2
605
  )
606
- with gr.Column():
 
607
  ml_tree_output = gr.Textbox(
608
- label="🌲 Phylogenetic Placement",
609
- interactive=False,
610
- lines=2
611
  )
 
 
612
  tree_analysis_output = gr.Textbox(
613
- label="🌳 Tree Analysis",
614
- interactive=False,
615
- lines=2
616
  )
617
- summary_output = gr.Textbox(
618
- label="📋 Summary",
619
- interactive=False,
620
- lines=8
621
- )
622
- with gr.Row():
623
- aligned_file = gr.File(label="📄 Alignment File", visible=False)
624
- tree_file = gr.File(label="🌲 Tree File", visible=False)
 
625
  with gr.Tabs():
626
  with gr.TabItem("🌳 Interactive Tree"):
627
  tree_html = gr.HTML(
628
  label="Phylogenetic Tree Visualization",
629
- value="<div style='text-align: center; padding: 20px; color: #666;'>Run analysis to see interactive tree</div>"
630
  )
 
631
  with gr.TabItem("📊 Detailed Report"):
632
  report_html = gr.HTML(
633
  label="Analysis Report",
634
- value="<div style='text-align: center; padding: 20px; color: #666;'>Run analysis to see detailed report</div>"
635
  )
636
-
 
 
 
 
 
 
 
 
 
 
 
 
637
  # Event handlers
638
- analyze_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
  fn=run_pipeline,
640
- inputs=[dna_input, similarity_score, build_ml_tree],
641
  outputs=[
642
- boundary_output, keras_output, ml_tree_output,
643
- tree_analysis_output, summary_output,
644
- aligned_file, tree_file, gr.State(), gr.State(),
645
- tree_html, report_html
 
 
 
 
 
 
 
646
  ]
647
  )
648
-
 
649
  analyze_file_btn.click(
650
  fn=run_pipeline_from_file,
651
- inputs=[file_input, file_similarity_score, file_build_ml_tree],
652
  outputs=[
653
- boundary_output, keras_output, ml_tree_output,
654
- tree_analysis_output, summary_output,
655
- aligned_file, tree_file, gr.State(), gr.State(),
656
- tree_html, report_html
 
 
 
 
 
 
 
657
  ]
658
  )
659
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
660
  # Examples
 
661
  gr.Examples(
662
  examples=[
663
- [
664
- "ATGAAACTGCAGATGAAGTCCAAGTTGAAGAAGCTGGTGAAGATCCTGAAGGAGAACCTGGTGAAGCCCGAGAAGTGA",
665
- 95.0,
666
- False
667
- ],
668
- [
669
- "ATGCCCACCATGAAACTGCAGATGAAGTCCAAGTTGAAGAAGCTGGTGAAGATCCTGAAGGAGAACCTGGTGAAGCCCGAGAAGTGATCCGGGCGGTACATCCTGCTGCCCACCATGAAG",
670
- 90.0,
671
- True
672
- ]
673
  ],
674
- inputs=[dna_input, similarity_score, build_ml_tree],
675
- label="🧪 Example Sequences"
676
  )
677
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
678
  return iface
679
-
680
  except Exception as e:
681
- logger.error(f"Gradio interface creation failed: {e}")
682
  # Fallback simple interface
683
- def simple_interface():
684
- return gr.Interface(
685
- fn=lambda x: f"Error creating interface: {e}",
686
- inputs=gr.Textbox(label="Input"),
687
- outputs=gr.Textbox(label="Output"),
688
- title="Gene Analysis Pipeline - Error Mode"
 
 
 
 
 
 
689
  )
690
- return simple_interface()
 
691
 
692
- # --- Main Application Launcher ---
693
- def launch_app():
694
- """Launch the application with both FastAPI and Gradio"""
695
  try:
696
  # Create Gradio interface
697
- gradio_app = create_gradio_interface()
698
 
699
  # Mount Gradio app to FastAPI
700
- app.mount("/gradio", gradio_app.mount_to_fastapi())
701
-
702
- # Add root redirect to Gradio
703
- @app.get("/")
704
- async def redirect_to_gradio():
705
- return HTMLResponse("""
706
- <html>
707
- <head>
708
- <title>🧬 Gene Analysis Pipeline</title>
709
- <meta http-equiv="refresh" content="0; url=/gradio">
710
- <style>
711
- body {
712
- font-family: Arial, sans-serif;
713
- text-align: center;
714
- padding: 50px;
715
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
716
- color: white;
717
- }
718
- .container {
719
- background: rgba(255,255,255,0.1);
720
- padding: 30px;
721
- border-radius: 15px;
722
- backdrop-filter: blur(10px);
723
- max-width: 500px;
724
- margin: 0 auto;
725
- }
726
- a {
727
- color: #fff;
728
- text-decoration: none;
729
- font-weight: bold;
730
- }
731
- </style>
732
- </head>
733
- <body>
734
- <div class="container">
735
- <h1>🧬 Gene Analysis Pipeline</h1>
736
- <p>Redirecting to the analysis interface...</p>
737
- <p><a href="/gradio">Click here if not redirected automatically</a></p>
738
- <br>
739
- <p><small>
740
- API Documentation: <a href="/docs">/docs</a><br>
741
- Health Check: <a href="/health">/health</a>
742
- </small></p>
743
- </div>
744
- </body>
745
- </html>
746
- """)
747
 
748
- logger.info("🚀 Application setup complete!")
749
- return app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
 
751
  except Exception as e:
752
- logger.error(f"Application launch failed: {e}")
753
- raise
754
-
755
- # --- Entry Point ---
756
- if __name__ == "__main__":
757
- try:
758
- # Check if running in Hugging Face Spaces
759
- if os.getenv("SPACE_ID"):
760
- logger.info("🤗 Running in Hugging Face Spaces")
761
- # In Spaces, just create and launch Gradio
762
- gradio_app = create_gradio_interface()
763
- gradio_app.launch(
764
- server_name="0.0.0.0",
765
- server_port=7860,
766
- share=False,
767
- show_error=True,
768
- quiet=False
769
- )
770
- else:
771
- # Local development - run full FastAPI + Gradio
772
- logger.info("💻 Running in local development mode")
773
- app = launch_app()
774
-
775
- # Determine port
776
- port = int(os.getenv("PORT", 8000))
777
-
778
- # Launch with uvicorn
779
- uvicorn.run(
780
- app,
781
- host="0.0.0.0",
782
- port=port,
783
- log_level="info",
784
- access_log=True
785
- )
786
-
787
- except KeyboardInterrupt:
788
- logger.info("🛑 Application stopped by user")
789
- except Exception as e:
790
- logger.error(f"💥 Application failed to start: {e}")
791
- # Last resort - simple Gradio interface
792
- try:
793
- simple_gradio = gr.Interface(
794
- fn=lambda x: f"Application Error: {e}",
795
- inputs=gr.Textbox(label="Error Mode"),
796
- outputs=gr.Textbox(label="Status"),
797
- title="🧬 Gene Analysis Pipeline - Error Mode"
798
- )
799
- simple_gradio.launch(server_name="0.0.0.0", server_port=7860)
800
- except:
801
- logger.critical("💀 Complete application failure")
802
- sys.exit(1)
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
  import pickle
4
  import subprocess
5
  import pandas as pd
6
+ import os
7
  import re
8
  import logging
9
  import numpy as np
 
22
  import stat
23
  import time
24
  import asyncio
25
+
26
+ # FastAPI imports
27
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
28
  from fastapi.responses import HTMLResponse
29
  from pydantic import BaseModel
30
  from typing import Optional
31
  import uvicorn
32
 
 
 
 
 
33
  # Set event loop policy for Spaces
34
  try:
35
  asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
 
43
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
44
  log_handler = logging.StreamHandler()
45
  log_handler.setFormatter(log_formatter)
46
+
47
+ # File handler with error handling
48
  try:
49
  file_handler = logging.FileHandler('/tmp/app.log')
50
  file_handler.setFormatter(log_formatter)
 
52
  except Exception:
53
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
54
 
55
+ logger = logging.getLogger(__name__)
56
+
57
  # --- Global Variables ---
58
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
59
+ MODELS_DIR = os.path.join(BASE_DIR, "models") # Local models directory
60
+ MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
61
+ IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
62
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
63
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
64
+ QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
65
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
66
 
67
+ # --- Corrected Paths ---
68
+ boundary_model_repo = "GGproject10/best_boundary_aware_model"
69
+ other_models_repo = "GGproject10/simplified_tree_AI"
70
+ csv_path = os.path.join(BASE_DIR, "f_cleaned.csv")
71
+ hf_token = os.getenv("HF_TOKEN")
72
 
73
  # Initialize models as None
74
  boundary_model = None
 
76
  kmer_to_index = None
77
  analyzer = None
78
 
79
+ # --- Enhanced Model Loading with Correct Paths ---
80
  def load_models_safely():
81
  global boundary_model, keras_model, kmer_to_index, analyzer
82
+
83
+ logger.info(f"🔍 Looking for models in: {MODELS_DIR}")
84
+ logger.info(f"📁 Models directory exists: {os.path.exists(MODELS_DIR)}")
85
+
86
+ if os.path.exists(MODELS_DIR):
87
+ logger.info(f"📂 Contents of models directory: {os.listdir(MODELS_DIR)}")
88
+
89
+ # Load Boundary Model - Try local first, then HF from correct repo
90
  try:
91
+ # Local model paths
92
+ local_boundary_path = os.path.join(MODELS_DIR, "best_boundary_aware_model.pth")
93
+
94
+ if os.path.exists(local_boundary_path):
95
+ logger.info(f"✅ Loading boundary model from local path: {local_boundary_path}")
96
+ boundary_model = EnhancedGenePredictor(local_boundary_path)
97
+ logger.info("✅ Boundary model loaded successfully from local directory")
98
+ elif hf_token:
99
+ logger.info("🌐 Attempting to load boundary model from Hugging Face...")
100
+ boundary_path = hf_hub_download(
101
+ repo_id=boundary_model_repo, # Correct repo for boundary model
102
+ filename="best_boundary_aware_model.pth",
103
+ token=hf_token,
104
+ cache_dir="/tmp/hf_cache"
105
+ )
106
+ if os.path.exists(boundary_path):
107
+ boundary_model = EnhancedGenePredictor(boundary_path)
108
+ logger.info("✅ Boundary model loaded successfully from HF")
109
+ else:
110
+ logger.warning("❌ Boundary model file not found after HF download")
111
  else:
112
+ logger.warning("❌ No local boundary model found and no HF_TOKEN available")
113
  except Exception as e:
114
+ logger.error(f"❌ Failed to load boundary model: {e}")
115
+ boundary_model = None
116
 
117
+ # Load Keras Model - Try local first, then HF from correct repo
118
  try:
119
+ # Local model paths
120
+ local_keras_path = os.path.join(MODELS_DIR, "best_model.keras")
121
+ local_kmer_path = os.path.join(MODELS_DIR, "kmer_to_index.pkl")
122
+
123
+ if os.path.exists(local_keras_path) and os.path.exists(local_kmer_path):
124
+ logger.info(f"✅ Loading Keras model from local paths:")
125
+ logger.info(f" - Keras model: {local_keras_path}")
126
+ logger.info(f" - K-mer index: {local_kmer_path}")
127
+
128
+ keras_model = load_model(local_keras_path)
129
+ with open(local_kmer_path, "rb") as f:
 
 
130
  kmer_to_index = pickle.load(f)
131
+ logger.info("✅ Keras model loaded successfully from local directory")
132
+
133
+ elif hf_token:
134
+ logger.info("🌐 Attempting to load Keras model from Hugging Face...")
135
+ keras_path = hf_hub_download(
136
+ repo_id=other_models_repo, # Correct repo for other models
137
+ filename="best_model.keras",
138
+ token=hf_token,
139
+ cache_dir="/tmp/hf_cache"
140
+ )
141
+ kmer_path = hf_hub_download(
142
+ repo_id=other_models_repo, # Correct repo for other models
143
+ filename="kmer_to_index.pkl",
144
+ token=hf_token,
145
+ cache_dir="/tmp/hf_cache"
146
+ )
147
+
148
+ if os.path.exists(keras_path) and os.path.exists(kmer_path):
149
+ keras_model = load_model(keras_path)
150
+ with open(kmer_path, "rb") as f:
151
+ kmer_to_index = pickle.load(f)
152
+ logger.info("✅ Keras model loaded successfully from HF")
153
+ else:
154
+ logger.warning("❌ Keras model files not found after HF download")
155
  else:
156
+ logger.warning("❌ No local Keras model found and no HF_TOKEN available")
157
  except Exception as e:
158
+ logger.error(f"❌ Failed to load Keras model: {e}")
159
+ keras_model = None
160
+ kmer_to_index = None
161
 
162
  # Initialize Tree Analyzer
163
  try:
164
  logger.info("🌳 Initializing tree analyzer...")
165
  analyzer = PhylogeneticTreeAnalyzer()
166
+
167
+ # Try multiple CSV locations
168
  csv_candidates = [
169
+ csv_path,
170
+ os.path.join(BASE_DIR, "f cleaned.csv"),
 
 
171
  "f_cleaned.csv",
172
+ os.path.join(BASE_DIR, "data", "f_cleaned.csv"),
173
+ os.path.join(MODELS_DIR, "f_cleaned.csv") # Also check models directory
174
  ]
175
+
176
  csv_loaded = False
177
  for csv_candidate in csv_candidates:
178
  if os.path.exists(csv_candidate):
 
179
  try:
180
+ logger.info(f"📊 Trying to load CSV from: {csv_candidate}")
181
  if analyzer.load_data(csv_candidate):
182
+ logger.info(f"✅ Tree analyzer loaded CSV from: {csv_candidate}")
183
  csv_loaded = True
184
  break
185
  except Exception as e:
186
+ logger.warning(f"Failed to load CSV from {csv_candidate}: {e}")
187
  continue
188
+
189
  if not csv_loaded:
190
+ logger.error("❌ Failed to load CSV data from any location")
191
+ logger.info("📂 Available files in base directory:")
 
192
  try:
193
+ for file in os.listdir(BASE_DIR):
194
+ if file.endswith('.csv'):
195
+ logger.info(f" - {file}")
196
+ except:
197
+ pass
198
+ analyzer = None
199
  except Exception as e:
200
+ logger.error(f"❌ Failed to initialize tree analyzer: {e}")
201
  analyzer = None
202
 
203
  # Load models at startup
 
215
 
216
  def check_tool_availability():
217
  setup_binary_permissions()
218
+
219
+ # Check MAFFT
220
  mafft_available = False
221
  mafft_cmd = None
222
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
223
+
224
  for candidate in mafft_candidates:
225
  if shutil.which(candidate) or os.path.exists(candidate):
226
  try:
227
  result = subprocess.run(
228
+ [candidate, "--help"],
229
+ capture_output=True,
230
+ text=True,
231
  timeout=5
232
  )
233
  if result.returncode == 0 or "mafft" in result.stderr.lower():
 
237
  break
238
  except Exception as e:
239
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
240
+
241
+ # Check IQ-TREE
242
  iqtree_available = False
243
  iqtree_cmd = None
244
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
245
+
246
  for candidate in iqtree_candidates:
247
  if shutil.which(candidate) or os.path.exists(candidate):
248
  try:
249
  result = subprocess.run(
250
+ [candidate, "--help"],
251
+ capture_output=True,
252
+ text=True,
253
  timeout=5
254
  )
255
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
 
259
  break
260
  except Exception as e:
261
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
262
+
263
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
264
 
265
+ # --- Pipeline Functions (keeping your original logic) ---
266
def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
    """Place a query sequence onto the reference tree using MAFFT and IQ-TREE.

    The query is written to a FASTA file in ``QUERY_OUTPUT_DIR``, added to the
    reference alignment (``ALIGNMENT_PATH``) with ``mafft --add``, and then
    IQ-TREE is run on the combined alignment with ``TREE_PATH`` passed via
    ``-g``.

    Args:
        sequence: Query nucleotide sequence. Surrounding whitespace is
            ignored; at least 100 characters must remain.
        mafft_cmd: MAFFT executable name or path.
        iqtree_cmd: IQ-TREE executable name or path.

    Returns:
        A 4-tuple ``(success, message, aligned_path, tree_path)``. The paths
        are ``None`` when the corresponding step did not produce a file.
        Failures never raise; they are reported through ``success=False``
        and a message.
    """
    try:
        # Validate and write the *same* text: the original checked the length
        # of the stripped sequence but wrote the unstripped one to disk.
        cleaned = sequence.strip()
        if len(cleaned) < 100:
            return False, "Sequence too short (<100 bp).", None, None

        query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
        query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
        aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
        output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")

        if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
            return False, "Reference alignment or tree not found.", None, None

        # Writing the query fails with a confusing FileNotFoundError if the
        # output directory was never created, so make sure it exists.
        os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)

        query_record = SeqRecord(Seq(cleaned.upper()), id=query_id, description="")
        SeqIO.write([query_record], query_fasta, "fasta")

        # MAFFT prints the alignment on stdout; redirect it straight to disk.
        with open(aligned_with_query, "w") as output_file:
            subprocess.run(
                [mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH],
                stdout=output_file,
                stderr=subprocess.PIPE,
                text=True,
                timeout=600,
                check=True,
            )

        if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
            return False, "MAFFT alignment failed.", None, None

        subprocess.run(
            [
                iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
                "-m", "GTR+G", "-pre", output_prefix, "-redo",
            ],
            capture_output=True,
            text=True,
            timeout=1200,
            check=True,
        )

        treefile = f"{output_prefix}.treefile"
        if not os.path.exists(treefile):
            return False, "IQ-TREE placement failed.", aligned_with_query, None

        success_msg = (
            f"Placement completed!\nQuery ID: {query_id}\n"
            f"Alignment: {os.path.basename(aligned_with_query)}\n"
            f"Tree: {os.path.basename(treefile)}"
        )
        return True, success_msg, aligned_with_query, treefile

    except subprocess.CalledProcessError as e:
        # str(e) only says "returned non-zero exit status N"; the captured
        # tool output is what actually explains the failure.
        tool = e.cmd[0] if isinstance(e.cmd, (list, tuple)) and e.cmd else e.cmd
        detail = (e.stderr or e.stdout or "").strip()
        message = f"{os.path.basename(str(tool))} exited with status {e.returncode}"
        if detail:
            # Keep only the tail so the UI message stays readable.
            message = f"{message}: {detail[-500:]}"
        logger.error(f"Phylogenetic placement failed: {message}")
        return False, f"Error: {message}", None, None
    except subprocess.TimeoutExpired as e:
        tool = e.cmd[0] if isinstance(e.cmd, (list, tuple)) and e.cmd else e.cmd
        message = f"{os.path.basename(str(tool))} timed out after {e.timeout} seconds"
        logger.error(f"Phylogenetic placement failed: {message}")
        return False, f"Error: {message}", None, None
    except Exception as e:
        logger.error(f"Phylogenetic placement failed: {e}")
        return False, f"Error: {str(e)}", None, None
 
309
  except:
310
  pass
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
def predict_with_keras(sequence):
    """Score a sequence with the Keras model and format the result as text.

    The sequence is cut into overlapping 6-mers, each 6-mer is mapped to its
    integer index through ``kmer_to_index`` (unknown 6-mers map to 0), and the
    resulting index row is fed to ``keras_model.predict``. The last element of
    the first prediction row is reported as a percentage.

    Returns:
        A human-readable status string; errors are reported in the string
        rather than raised.
    """
    try:
        if not (keras_model and kmer_to_index):
            return "❌ Keras model not available."

        if len(sequence) < 6:
            return "❌ Sequence too short (<6 bp)."

        # One index per sliding 6-mer window; 0 stands in for unseen k-mers.
        encoded = [
            kmer_to_index.get(sequence[start:start + 6], 0)
            for start in range(len(sequence) - 5)
        ]
        batch = np.array([encoded])

        scores = keras_model.predict(batch, verbose=0)[0]
        f_gene_prob = scores[-1]

        # NOTE(review): the "+ 5" adds five percentage points before
        # truncation; it looks like a deliberate offset rather than rounding
        # (which would be + 0.5) -- confirm the intent with the author.
        percentage = int(f_gene_prob * 100 + 5)
        if percentage < 0:
            percentage = 0
        elif percentage > 100:
            percentage = 100

        return f"✅ {percentage}% F gene confidence"
    except Exception as e:
        logger.error(f"Keras prediction failed: {e}")
        return f"❌ Error: {str(e)}"
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
334
  try:
335
  dna_input = dna_input.upper().strip()
336
  if not dna_input:
337
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
338
+
339
+ # Clean sequence
340
  if not re.match('^[ACTGN]+$', dna_input):
341
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
342
+
343
  processed_sequence = dna_input
344
+
345
+ # Boundary prediction
346
  boundary_output = ""
347
  if boundary_model:
348
  try:
 
359
  processed_sequence = dna_input
360
  else:
361
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
362
+
363
+ # Keras prediction
364
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
365
+
366
+ # ML Tree (keeping your original logic)
367
  aligned_file = None
368
  phy_file = None
369
  ml_tree_output = ""
370
+
371
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
372
  try:
373
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
 
384
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
385
  else:
386
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
387
+
388
+ # Tree analysis
389
  tree_html_content = "No tree generated."
390
  report_html_content = "No report generated."
391
  simplified_ml_output = ""
392
+
393
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
394
  try:
395
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
396
  simplified_ml_output = tree_result
397
+
398
  if tree_html_path and os.path.exists(tree_html_path):
399
  with open(tree_html_path, 'r', encoding='utf-8') as f:
400
  tree_html_content = f.read()
401
  else:
402
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
403
+
404
  if report_html_path and os.path.exists(report_html_path):
405
  with open(report_html_path, 'r', encoding='utf-8') as f:
406
  report_html_content = f.read()
407
  else:
408
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
409
+
410
  except Exception as e:
411
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
412
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
 
415
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
416
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
417
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
418
+
419
+ # Summary
420
  summary_output = f"""
421
  📊 ANALYSIS SUMMARY:
422
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
427
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
428
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
429
  """
430
+
431
  return (
432
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
433
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
434
  )
435
+
436
  except Exception as e:
437
  logger.error(f"Pipeline error: {e}")
438
  error_msg = f"❌ Pipeline Error: {str(e)}"
439
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
440
 
441
+ # Keep your other functions (analyze_sequence_for_tree, build_maximum_likelihood_tree, etc.)
442
def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
    """Run the similarity-based tree analysis for one query sequence.

    Drives the module-level ``analyzer``: registers the query, looks up
    similar sequences at ``matching_percentage``, builds the tree structure,
    writes the interactive tree figure as HTML under ``/tmp`` and asks the
    analyzer to generate its detailed report.

    Args:
        sequence: Query sequence; at least 10 characters after stripping.
        matching_percentage: Similarity threshold, must lie in 1..99.

    Returns:
        ``(message, tree_html_path, report_html_path)``. Both paths are
        ``None`` on any failure; ``report_html_path`` is also ``None`` when
        the analyzer reports that no report was generated.
    """
    try:
        if not analyzer:
            return "❌ Tree analyzer not initialized.", None, None

        sequence_usable = bool(sequence) and len(sequence.strip()) >= 10
        if not sequence_usable:
            return "❌ Invalid sequence.", None, None

        if not (1 <= matching_percentage <= 99):
            return "❌ Matching percentage must be 1-99.", None, None

        accepted = analyzer.find_query_sequence(sequence)
        if not accepted:
            return "❌ Sequence not accepted.", None, None

        matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
        if not matched_ids:
            return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None

        # Order matters: the tree structure must exist before it is drawn.
        analyzer.build_tree_structure_with_ml_safe(matched_ids)
        fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)

        # Fall back to a timestamp when the analyzer did not assign an id.
        query_id = analyzer.query_id or f"query_{int(time.time())}"
        tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
        fig.write_html(tree_html_path)

        analyzer.matching_percentage = matching_percentage
        report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)

        # Presumably this is where generate_detailed_report writes its
        # output -- the path is only constructed here, never written.
        report_html_path = None
        if report_success:
            report_html_path = os.path.join("/tmp", f"detailed_report_{query_id}.html")

        summary = f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity."
        return summary, tree_html_path, report_html_path

    except Exception as e:
        logger.error(f"Tree analysis failed: {e}")
        return f"❌ Error: {str(e)}", None, None
476
+
477
def read_fasta_file(file_obj):
    """Return the sequence text contained in a FASTA upload.

    Args:
        file_obj: ``None``, a filesystem path (``str`` or ``os.PathLike``),
            or a file-like object whose ``read()`` returns ``bytes`` or
            ``str``.

    Returns:
        All non-header lines joined into one string, with surrounding
        whitespace removed from each line. Header lines (starting with
        ``>``) and blank lines are dropped. If the file holds several
        records their sequences are concatenated. Returns ``""`` for
        ``None`` input or when the file cannot be read or decoded.
    """
    try:
        if file_obj is None:
            return ""

        if isinstance(file_obj, (str, os.PathLike)):
            # Explicit encoding keeps path input consistent with the UTF-8
            # decoding applied to byte streams below.
            with open(file_obj, "r", encoding="utf-8") as handle:
                content = handle.read()
        else:
            content = file_obj.read()
            # Binary handles yield bytes; text-mode handles already yield str
            # and previously crashed on .decode (silently returning "").
            if isinstance(content, (bytes, bytearray)):
                content = bytes(content).decode("utf-8")

        # Strip before testing for ">" so an indented header line is not
        # mistaken for sequence data.
        stripped_lines = (line.strip() for line in content.splitlines())
        return "".join(
            line for line in stripped_lines if line and not line.startswith(">")
        )

    except Exception as e:
        logger.error(f"Failed to read FASTA file: {e}")
        return ""
495
+
496
  async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
497
  try:
498
  dna_input = read_fasta_file(fasta_file_obj)
 
549
  },
550
  "paths": {
551
  "base_dir": BASE_DIR,
552
+ "models_dir": MODELS_DIR,
553
+ "models_dir_exists": os.path.exists(MODELS_DIR),
554
+ "csv_path": csv_path,
555
+ "csv_exists": os.path.exists(csv_path)
556
+ },
557
+ "model_repos": {
558
+ "boundary_model": boundary_model_repo,
559
+ "other_models": other_models_repo
560
+ },
561
+ "recommendations": {
562
+ "models": "Models loaded from local directory" if (boundary_model and keras_model) else "Check models directory",
563
+ "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
564
  }
565
  }
566
  except Exception as e:
 
582
  except Exception as e:
583
  logger.error(f"Analyze error: {e}")
584
  return AnalysisResponse(
585
+ boundary_output="", keras_output="", ml_tree_output="",
586
  tree_analysis_output="", summary_output="",
587
  success=False, error_message=str(e)
588
  )
589
 
590
  @app.post("/analyze-file")
591
  async def analyze_file(
592
+ file: UploadFile = File(...),
593
+ similarity_score: float = Form(95.0),
594
  build_ml_tree: bool = Form(False)
595
  ):
596
  temp_file_path = None
 
599
  content = await file.read()
600
  temp_file.write(content)
601
  temp_file_path = temp_file.name
602
+
603
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
604
+
605
  return AnalysisResponse(
606
  boundary_output=result[0] or "",
607
  keras_output=result[1] or "",
 
613
  except Exception as e:
614
  logger.error(f"Analyze-file error: {e}")
615
  return AnalysisResponse(
616
+ boundary_output="", keras_output="", ml_tree_output="",
617
  tree_analysis_output="", summary_output="",
618
  success=False, error_message=str(e)
619
  )
 
624
  except:
625
  pass
626
 
627
+ # --- Fixed Gradio Interface ---
628
  def create_gradio_interface():
629
  try:
630
  with gr.Blocks(
 
638
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
639
  """
640
  ) as iface:
641
+
642
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
643
+
644
+ # Status display
645
  with gr.Row():
646
  with gr.Column():
647
  status_display = gr.HTML(value=f"""
 
649
  <h3>🔧 System Status</h3>
650
  <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
651
  <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
652
+ <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}
653
+ <p>🔬 MAFFT/IQ-TREE: {'✅ Available' if check_tool_availability()[0] and check_tool_availability()[1] else '❌ Missing'}</p>
 
 
654
  """)
655
+
656
+ # Input tabs
657
  with gr.Tabs():
658
  with gr.TabItem("📝 Text Input"):
659
+ dna_input = gr.Textbox(
660
+ label="🧬 DNA Sequence",
661
+ placeholder="Enter DNA sequence (ATCG format)...",
662
+ lines=5,
663
+ max_lines=10
664
+ )
665
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  with gr.TabItem("📁 File Upload"):
667
+ fasta_file = gr.File(
668
+ label="📄 Upload FASTA File",
669
+ file_types=[".fasta", ".fa", ".txt"],
670
+ file_count="single"
671
+ )
672
+
673
+ # Analysis options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
  with gr.Row():
675
  with gr.Column():
676
+ similarity_slider = gr.Slider(
677
+ minimum=1,
678
+ maximum=99,
679
+ value=95,
680
+ step=1,
681
+ label="🎯 Similarity Threshold (%)",
682
+ info="Minimum similarity for phylogenetic analysis"
683
+ )
684
+
685
+ with gr.Column():
686
+ ml_tree_checkbox = gr.Checkbox(
687
+ label="🌲 Build ML Tree",
688
+ value=False,
689
+ info="Perform phylogenetic placement (slower)"
690
+ )
691
+
692
+ # Action buttons
693
+ with gr.Row():
694
+ analyze_text_btn = gr.Button("🔍 Analyze Text", variant="primary", size="lg")
695
+ analyze_file_btn = gr.Button("📁 Analyze File", variant="secondary", size="lg")
696
+ clear_btn = gr.Button("🗑️ Clear", variant="stop")
697
+
698
+ # Results section
699
+ gr.Markdown("## 📊 Analysis Results")
700
+
701
+ with gr.Tabs():
702
+ with gr.TabItem("🎯 Boundary Prediction"):
703
  boundary_output = gr.Textbox(
704
+ label="🔍 F Gene Boundary Detection",
705
+ lines=3,
706
+ interactive=False
707
  )
708
+
709
+ with gr.TabItem("🧠 Keras Validation"):
710
  keras_output = gr.Textbox(
711
+ label="🤖 Neural Network Validation",
712
+ lines=3,
713
+ interactive=False
714
  )
715
+
716
+ with gr.TabItem("🌲 ML Tree Placement"):
717
  ml_tree_output = gr.Textbox(
718
+ label="🌳 Maximum Likelihood Tree",
719
+ lines=5,
720
+ interactive=False
721
  )
722
+
723
+ with gr.TabItem("📈 Tree Analysis"):
724
  tree_analysis_output = gr.Textbox(
725
+ label="📊 Phylogenetic Analysis",
726
+ lines=5,
727
+ interactive=False
728
  )
729
+
730
+ with gr.TabItem("📋 Summary"):
731
+ summary_output = gr.Textbox(
732
+ label="📝 Analysis Summary",
733
+ lines=10,
734
+ interactive=False
735
+ )
736
+
737
+ # Visualization section
738
  with gr.Tabs():
739
  with gr.TabItem("🌳 Interactive Tree"):
740
  tree_html = gr.HTML(
741
  label="Phylogenetic Tree Visualization",
742
+ value="<div style='text-align: center; padding: 20px; color: #666;'>Tree visualization will appear here after analysis</div>"
743
  )
744
+
745
  with gr.TabItem("📊 Detailed Report"):
746
  report_html = gr.HTML(
747
  label="Analysis Report",
748
+ value="<div style='text-align: center; padding: 20px; color: #666;'>Detailed report will appear here after analysis</div>"
749
  )
750
+
751
+ # File downloads
752
+ gr.Markdown("## 📥 Download Results")
753
+ with gr.Row():
754
+ aligned_file = gr.File(
755
+ label="📄 Aligned Sequences",
756
+ interactive=False
757
+ )
758
+ tree_file = gr.File(
759
+ label="🌳 Tree File",
760
+ interactive=False
761
+ )
762
+
763
  # Event handlers
764
+ def clear_all():
765
+ return (
766
+ "", # dna_input
767
+ None, # fasta_file
768
+ "", # boundary_output
769
+ "", # keras_output
770
+ "", # ml_tree_output
771
+ "", # tree_analysis_output
772
+ "", # summary_output
773
+ "<div style='text-align: center; padding: 20px; color: #666;'>Tree visualization will appear here after analysis</div>", # tree_html
774
+ "<div style='text-align: center; padding: 20px; color: #666;'>Detailed report will appear here after analysis</div>", # report_html
775
+ None, # aligned_file
776
+ None # tree_file
777
+ )
778
+
779
+ # Text analysis
780
+ analyze_text_btn.click(
781
  fn=run_pipeline,
782
+ inputs=[dna_input, similarity_slider, ml_tree_checkbox],
783
  outputs=[
784
+ boundary_output,
785
+ keras_output,
786
+ ml_tree_output,
787
+ tree_analysis_output,
788
+ summary_output,
789
+ aligned_file,
790
+ tree_file,
791
+ gr.State(), # placeholder for additional outputs
792
+ gr.State(), # placeholder for additional outputs
793
+ tree_html,
794
+ report_html
795
  ]
796
  )
797
+
798
+ # File analysis
799
  analyze_file_btn.click(
800
  fn=run_pipeline_from_file,
801
+ inputs=[fasta_file, similarity_slider, ml_tree_checkbox],
802
  outputs=[
803
+ boundary_output,
804
+ keras_output,
805
+ ml_tree_output,
806
+ tree_analysis_output,
807
+ summary_output,
808
+ aligned_file,
809
+ tree_file,
810
+ gr.State(), # placeholder for additional outputs
811
+ gr.State(), # placeholder for additional outputs
812
+ tree_html,
813
+ report_html
814
  ]
815
  )
816
+
817
+ # Clear button
818
+ clear_btn.click(
819
+ fn=clear_all,
820
+ outputs=[
821
+ dna_input,
822
+ fasta_file,
823
+ boundary_output,
824
+ keras_output,
825
+ ml_tree_output,
826
+ tree_analysis_output,
827
+ summary_output,
828
+ tree_html,
829
+ report_html,
830
+ aligned_file,
831
+ tree_file
832
+ ]
833
+ )
834
+
835
  # Examples
836
+ gr.Markdown("## 🧪 Example Sequences")
837
  gr.Examples(
838
  examples=[
839
+ ["ATGAAACTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAGTGCAGCAGCAGCAGCAGCAGCAGCAGCAGC", 95.0, False],
840
+ ["ATGAAACTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAGTGCAGCAGCAGCAGCAGCAGCAGCAGCAGC", 85.0, True],
841
+ ["ATGGAGCTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAG", 90.0, False]
 
 
 
 
 
 
 
842
  ],
843
+ inputs=[dna_input, similarity_slider, ml_tree_checkbox],
844
+ label="Click to load example sequences"
845
  )
846
+
847
+ # Footer
848
+ gr.Markdown("""
849
+ ---
850
+
851
+ ### 🔬 About This Pipeline
852
+
853
+ This tool performs comprehensive analysis of DNA sequences using multiple approaches:
854
+
855
+ - **🎯 Boundary Detection**: Identifies F gene regions using ML models
856
+ - **🧠 Keras Validation**: Neural network-based sequence validation
857
+ - **🌲 ML Tree Placement**: Phylogenetic placement using MAFFT + IQ-TREE
858
+ - **📈 Tree Analysis**: Interactive phylogenetic analysis and visualization
859
+
860
+ ### 📝 Usage Notes
861
+
862
+ - Sequences should be in ATCG format (other characters will be converted to N)
863
+ - Minimum 100 bp recommended for phylogenetic placement
864
+ - Higher similarity thresholds = fewer but more similar sequences
865
+ - ML tree building requires MAFFT and IQ-TREE (slower but more accurate)
866
+
867
+ ### ⚠️ System Requirements
868
+
869
+ - Python packages: gradio, torch, tensorflow, biopython, plotly
870
+ - Bioinformatics tools: MAFFT, IQ-TREE (optional for ML placement)
871
+ - Pre-trained models: boundary detection + keras validation models
872
+ """)
873
+
874
  return iface
875
+
876
  except Exception as e:
877
+ logger.error(f"Failed to create Gradio interface: {e}")
878
  # Fallback simple interface
879
+ with gr.Blocks() as fallback_iface:
880
+ gr.Markdown("# 🧬 Gene Analysis Pipeline (Fallback Mode)")
881
+ gr.Markdown(f"⚠️ Error creating full interface: {str(e)}")
882
+
883
+ dna_input = gr.Textbox(label="DNA Sequence", lines=5)
884
+ analyze_btn = gr.Button("Analyze")
885
+ output = gr.Textbox(label="Results", lines=10)
886
+
887
+ analyze_btn.click(
888
+ fn=lambda seq: run_pipeline(seq, 95.0, False)[4], # Just return summary
889
+ inputs=[dna_input],
890
+ outputs=[output]
891
  )
892
+
893
+ return fallback_iface
894
 
895
+ # --- Application Startup ---
896
if __name__ == "__main__":
    # Script entry point: build the Gradio UI, mount it on the FastAPI app,
    # log a startup summary and serve everything with uvicorn on port 7860.
    try:
        # The UI is served under /gradio; the FastAPI routes keep the root.
        gr_interface = create_gradio_interface()
        gr_app = gr.mount_gradio_app(app, gr_interface, path="/gradio")

        # Startup summary: what was loaded and which tools were found.
        logger.info("🚀 Starting Gene Analysis Pipeline...")
        logger.info(f"📁 Base directory: {BASE_DIR}")
        logger.info(f"🤖 Models loaded: Boundary={boundary_model is not None}, Keras={keras_model is not None}")
        logger.info(f"🌳 Tree analyzer: {analyzer is not None}")

        mafft_available, iqtree_available, _, _ = check_tool_availability()
        logger.info(f"🔬 Tools available: MAFFT={mafft_available}, IQ-TREE={iqtree_available}")

        # Announce the endpoints before handing control to the server.
        logger.info("🌐 Starting server on http://0.0.0.0:7860")
        logger.info("📊 FastAPI docs: http://0.0.0.0:7860/docs")
        logger.info("🎮 Gradio interface: http://0.0.0.0:7860/gradio")

        # Blocks until the server is shut down.
        uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info", access_log=True)

    except Exception as e:
        # Any failure during setup or serving is fatal: report it on both
        # the log and stdout, then exit with a non-zero status.
        logger.error(f" Startup failed: {e}")
        print(f"❌ Failed to start application: {e}")
        sys.exit(1)