re-type committed on
Commit
1d23751
·
verified ·
1 Parent(s): 956abb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -470
app.py CHANGED
@@ -1,9 +1,14 @@
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
  import pickle
4
  import subprocess
5
  import pandas as pd
6
- import os
7
  import re
8
  import logging
9
  import numpy as np
@@ -22,14 +27,16 @@ from Bio.SeqRecord import SeqRecord
22
  import stat
23
  import time
24
  import asyncio
25
-
26
- # FastAPI imports
27
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
28
  from fastapi.responses import HTMLResponse
29
  from pydantic import BaseModel
30
  from typing import Optional
31
  import uvicorn
32
 
 
 
 
 
33
  # Set event loop policy for Spaces
34
  try:
35
  asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@@ -43,8 +50,6 @@ app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
43
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
44
  log_handler = logging.StreamHandler()
45
  log_handler.setFormatter(log_formatter)
46
-
47
- # File handler with error handling
48
  try:
49
  file_handler = logging.FileHandler('/tmp/app.log')
50
  file_handler.setFormatter(log_formatter)
@@ -52,23 +57,18 @@ try:
52
  except Exception:
53
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
54
 
55
- logger = logging.getLogger(__name__)
56
-
57
  # --- Global Variables ---
58
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
59
- MODELS_DIR = os.path.join(BASE_DIR, "models") # Local models directory
60
- MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
61
- IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
62
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
63
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
64
- QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
65
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
66
 
67
- # --- Corrected Paths ---
68
- boundary_model_repo = "GGproject10/best_boundary_aware_model"
69
- other_models_repo = "GGproject10/simplified_tree_AI"
70
- csv_path = os.path.join(BASE_DIR, "f_cleaned.csv")
71
- hf_token = os.getenv("HF_TOKEN")
72
 
73
  # Initialize models as None
74
  boundary_model = None
@@ -76,128 +76,85 @@ keras_model = None
76
  kmer_to_index = None
77
  analyzer = None
78
 
79
- # --- Enhanced Model Loading with Correct Paths ---
def load_models_safely():
    """Populate the module-level model globals, tolerating individual failures.

    Three independent steps run in order, each inside its own ``try`` so a
    failure in one does not prevent the others:

    1. Boundary model: loaded from ``MODELS_DIR`` when the ``.pth`` file is
       present, otherwise downloaded with ``hf_hub_download`` when
       ``hf_token`` is set.
    2. Keras model plus k-mer index: same local-then-remote order; both files
       must be available, and on any error both globals are reset to ``None``.
    3. Tree analyzer: a ``PhylogeneticTreeAnalyzer`` is created and the first
       existing CSV candidate that ``load_data`` accepts is used; if none
       loads, ``analyzer`` is reset to ``None``.

    Returns nothing; results are published through the globals
    ``boundary_model``, ``keras_model``, ``kmer_to_index`` and ``analyzer``.
    """
    global boundary_model, keras_model, kmer_to_index, analyzer

    logger.info(f"🔍 Looking for models in: {MODELS_DIR}")
    logger.info(f"📁 Models directory exists: {os.path.exists(MODELS_DIR)}")

    # Purely diagnostic listing: it must never abort start-up, so require a
    # real directory and tolerate permission/IO errors while listing it.
    if os.path.isdir(MODELS_DIR):
        try:
            logger.info(f"📂 Contents of models directory: {os.listdir(MODELS_DIR)}")
        except OSError as e:
            logger.warning(f"Could not list models directory: {e}")

    # --- Boundary model: local file first, then Hugging Face ---
    try:
        local_boundary_path = os.path.join(MODELS_DIR, "best_boundary_aware_model.pth")

        if os.path.exists(local_boundary_path):
            logger.info(f"✅ Loading boundary model from local path: {local_boundary_path}")
            boundary_model = EnhancedGenePredictor(local_boundary_path)
            logger.info("✅ Boundary model loaded successfully from local directory")
        elif hf_token:
            logger.info("🌐 Attempting to load boundary model from Hugging Face...")
            boundary_path = hf_hub_download(
                repo_id=boundary_model_repo,
                filename="best_boundary_aware_model.pth",
                token=hf_token,
                cache_dir="/tmp/hf_cache",
            )
            if os.path.exists(boundary_path):
                boundary_model = EnhancedGenePredictor(boundary_path)
                logger.info("✅ Boundary model loaded successfully from HF")
            else:
                logger.warning("❌ Boundary model file not found after HF download")
        else:
            logger.warning("❌ No local boundary model found and no HF_TOKEN available")
    except Exception as e:
        logger.error(f"❌ Failed to load boundary model: {e}")
        boundary_model = None

    # --- Keras model + k-mer index: local files first, then Hugging Face ---
    try:
        local_keras_path = os.path.join(MODELS_DIR, "best_model.keras")
        local_kmer_path = os.path.join(MODELS_DIR, "kmer_to_index.pkl")

        if os.path.exists(local_keras_path) and os.path.exists(local_kmer_path):
            logger.info(f"✅ Loading Keras model from local paths:")
            logger.info(f" - Keras model: {local_keras_path}")
            logger.info(f" - K-mer index: {local_kmer_path}")

            keras_model = load_model(local_keras_path)
            with open(local_kmer_path, "rb") as f:
                kmer_to_index = pickle.load(f)
            logger.info("✅ Keras model loaded successfully from local directory")
        elif hf_token:
            logger.info("🌐 Attempting to load Keras model from Hugging Face...")
            keras_path = hf_hub_download(
                repo_id=other_models_repo,
                filename="best_model.keras",
                token=hf_token,
                cache_dir="/tmp/hf_cache",
            )
            kmer_path = hf_hub_download(
                repo_id=other_models_repo,
                filename="kmer_to_index.pkl",
                token=hf_token,
                cache_dir="/tmp/hf_cache",
            )

            if os.path.exists(keras_path) and os.path.exists(kmer_path):
                keras_model = load_model(keras_path)
                # SECURITY NOTE(review): unpickling a downloaded file executes
                # arbitrary code if the repository is ever compromised; only
                # safe while the repo is fully trusted.
                with open(kmer_path, "rb") as f:
                    kmer_to_index = pickle.load(f)
                logger.info("✅ Keras model loaded successfully from HF")
            else:
                logger.warning("❌ Keras model files not found after HF download")
        else:
            logger.warning("❌ No local Keras model found and no HF_TOKEN available")
    except Exception as e:
        logger.error(f"❌ Failed to load Keras model: {e}")
        # Reset both so callers never see a model without its k-mer index.
        keras_model = None
        kmer_to_index = None

    # --- Tree analyzer: needs one of the candidate CSV files ---
    try:
        logger.info("🌳 Initializing tree analyzer...")
        analyzer = PhylogeneticTreeAnalyzer()

        csv_candidates = [
            csv_path,
            os.path.join(BASE_DIR, "f cleaned.csv"),
            "f_cleaned.csv",
            os.path.join(BASE_DIR, "data", "f_cleaned.csv"),
            os.path.join(MODELS_DIR, "f_cleaned.csv"),
        ]

        csv_loaded = False
        for csv_candidate in csv_candidates:
            if not os.path.exists(csv_candidate):
                continue
            try:
                logger.info(f"📊 Trying to load CSV from: {csv_candidate}")
                if analyzer.load_data(csv_candidate):
                    logger.info(f"✅ Tree analyzer loaded CSV from: {csv_candidate}")
                    csv_loaded = True
                    break
            except Exception as e:
                logger.warning(f"Failed to load CSV from {csv_candidate}: {e}")

        if not csv_loaded:
            logger.error("❌ Failed to load CSV data from any location")
            logger.info("📂 Available files in base directory:")
            try:
                for file in os.listdir(BASE_DIR):
                    if file.endswith('.csv'):
                        logger.info(f" - {file}")
            except OSError:
                # Best-effort diagnostic only; a narrow except keeps
                # KeyboardInterrupt/SystemExit propagating.
                pass
            analyzer = None
    except Exception as e:
        logger.error(f"❌ Failed to initialize tree analyzer: {e}")
        analyzer = None
202
 
203
  # Load models at startup
@@ -215,19 +172,16 @@ def setup_binary_permissions():
215
 
216
  def check_tool_availability():
217
  setup_binary_permissions()
218
-
219
- # Check MAFFT
220
  mafft_available = False
221
  mafft_cmd = None
222
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
223
-
224
  for candidate in mafft_candidates:
225
  if shutil.which(candidate) or os.path.exists(candidate):
226
  try:
227
  result = subprocess.run(
228
- [candidate, "--help"],
229
- capture_output=True,
230
- text=True,
231
  timeout=5
232
  )
233
  if result.returncode == 0 or "mafft" in result.stderr.lower():
@@ -237,19 +191,16 @@ def check_tool_availability():
237
  break
238
  except Exception as e:
239
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
240
-
241
- # Check IQ-TREE
242
  iqtree_available = False
243
  iqtree_cmd = None
244
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
245
-
246
  for candidate in iqtree_candidates:
247
  if shutil.which(candidate) or os.path.exists(candidate):
248
  try:
249
  result = subprocess.run(
250
- [candidate, "--help"],
251
- capture_output=True,
252
- text=True,
253
  timeout=5
254
  )
255
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
@@ -259,46 +210,36 @@ def check_tool_availability():
259
  break
260
  except Exception as e:
261
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
262
-
263
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
264
 
265
- # --- Pipeline Functions (keeping your original logic) ---
266
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
267
  try:
268
  if len(sequence.strip()) < 100:
269
  return False, "Sequence too short (<100 bp).", None, None
270
-
271
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
272
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
273
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
274
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
275
-
276
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
277
  return False, "Reference alignment or tree not found.", None, None
278
-
279
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
280
  SeqIO.write([query_record], query_fasta, "fasta")
281
-
282
  with open(aligned_with_query, "w") as output_file:
283
  subprocess.run([
284
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
285
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
286
-
287
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
288
  return False, "MAFFT alignment failed.", None, None
289
-
290
  subprocess.run([
291
- iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
292
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
293
  ], capture_output=True, text=True, timeout=1200, check=True)
294
-
295
  treefile = f"{output_prefix}.treefile"
296
  if not os.path.exists(treefile):
297
  return False, "IQ-TREE placement failed.", aligned_with_query, None
298
-
299
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
300
  return True, success_msg, aligned_with_query, treefile
301
-
302
  except Exception as e:
303
  logger.error(f"Phylogenetic placement failed: {e}")
304
  return False, f"Error: {str(e)}", None, None
@@ -309,40 +250,73 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
309
  except:
310
  pass
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def predict_with_keras(sequence):
    """Score *sequence* with the global Keras model and return a status string.

    The sequence is cut into overlapping 6-mers, each 6-mer is mapped through
    the global ``kmer_to_index`` (unknown k-mers map to 0), and the resulting
    index list is passed to ``keras_model.predict`` as a batch of one. The
    last element of the first prediction row is turned into a percentage
    clamped to the range 0-100.

    Returns a message starting with "✅" on success or "❌" when the model is
    unavailable, the sequence is shorter than 6 characters, or an exception
    occurs (the exception is logged).
    """
    try:
        if not (keras_model and kmer_to_index):
            return "❌ Keras model not available."

        seq_len = len(sequence)
        if seq_len < 6:
            return "❌ Sequence too short (<6 bp)."

        encoded = []
        for start in range(seq_len - 5):
            window = sequence[start:start + 6]
            encoded.append(kmer_to_index.get(window, 0))
        batch = np.array([encoded])

        scores = keras_model.predict(batch, verbose=0)[0]
        # NOTE(review): the "+ 5" offset comes from the original code; its
        # rationale is not visible here — confirm before changing it.
        raw_percent = int(scores[-1] * 100 + 5)
        percentage = min(100, max(0, raw_percent))

        return f"✅ {percentage}% F gene confidence"
    except Exception as e:
        logger.error(f"Keras prediction failed: {e}")
        return f"❌ Error: {str(e)}"
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
334
  try:
335
  dna_input = dna_input.upper().strip()
336
  if not dna_input:
337
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
338
-
339
- # Clean sequence
340
  if not re.match('^[ACTGN]+$', dna_input):
341
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
342
-
343
  processed_sequence = dna_input
344
-
345
- # Boundary prediction
346
  boundary_output = ""
347
  if boundary_model:
348
  try:
@@ -359,15 +333,10 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
359
  processed_sequence = dna_input
360
  else:
361
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
362
-
363
- # Keras prediction
364
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
365
-
366
- # ML Tree (keeping your original logic)
367
  aligned_file = None
368
  phy_file = None
369
  ml_tree_output = ""
370
-
371
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
372
  try:
373
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
@@ -384,29 +353,23 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
384
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
385
  else:
386
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
387
-
388
- # Tree analysis
389
  tree_html_content = "No tree generated."
390
  report_html_content = "No report generated."
391
  simplified_ml_output = ""
392
-
393
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
394
  try:
395
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
396
  simplified_ml_output = tree_result
397
-
398
  if tree_html_path and os.path.exists(tree_html_path):
399
  with open(tree_html_path, 'r', encoding='utf-8') as f:
400
  tree_html_content = f.read()
401
  else:
402
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
403
-
404
  if report_html_path and os.path.exists(report_html_path):
405
  with open(report_html_path, 'r', encoding='utf-8') as f:
406
  report_html_content = f.read()
407
  else:
408
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
409
-
410
  except Exception as e:
411
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
412
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
@@ -415,8 +378,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
415
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
416
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
417
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
418
-
419
- # Summary
420
  summary_output = f"""
421
  📊 ANALYSIS SUMMARY:
422
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -427,72 +388,15 @@ Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skippe
427
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
428
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
429
  """
430
-
431
  return (
432
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
433
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
434
  )
435
-
436
  except Exception as e:
437
  logger.error(f"Pipeline error: {e}")
438
  error_msg = f"❌ Pipeline Error: {str(e)}"
439
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
440
 
441
- # Keep your other functions (analyze_sequence_for_tree, build_maximum_likelihood_tree, etc.)
def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
    """Run the similarity-based tree analysis for *sequence*.

    Uses the module-level ``analyzer``: registers the query, finds sequences
    at the requested similarity, builds the tree, and writes the interactive
    figure to an HTML file under ``/tmp``. A detailed report is also requested
    from the analyzer.

    Returns a 3-tuple ``(message, tree_html_path, report_html_path)``. On any
    validation failure or exception the message starts with "❌" and both
    paths are ``None``. ``report_html_path`` is ``None`` when
    ``generate_detailed_report`` reports failure.
    """

    def _failure(message):
        # Every error path shares the same (message, None, None) shape.
        return message, None, None

    try:
        if not analyzer:
            return _failure("❌ Tree analyzer not initialized.")

        if not sequence or len(sequence.strip()) < 10:
            return _failure("❌ Invalid sequence.")

        if not (1 <= matching_percentage <= 99):
            return _failure("❌ Matching percentage must be 1-99.")

        if not analyzer.find_query_sequence(sequence):
            return _failure("❌ Sequence not accepted.")

        matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
        if not matched_ids:
            return _failure(f"❌ No similar sequences at {matching_percentage}% threshold.")

        analyzer.build_tree_structure_with_ml_safe(matched_ids)
        figure = analyzer.create_interactive_tree(matched_ids, actual_percentage)

        # Fall back to a timestamp-based id when the analyzer did not set one.
        query_id = analyzer.query_id or f"query_{int(time.time())}"
        tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
        figure.write_html(tree_html_path)

        analyzer.matching_percentage = matching_percentage
        if analyzer.generate_detailed_report(matched_ids, actual_percentage):
            # presumably the analyzer writes its report to this path — verify
            report_html_path = os.path.join("/tmp", f"detailed_report_{query_id}.html")
        else:
            report_html_path = None

        summary = f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity."
        return summary, tree_html_path, report_html_path

    except Exception as e:
        logger.error(f"Tree analysis failed: {e}")
        return _failure(f"❌ Error: {str(e)}")
476
-
def read_fasta_file(file_obj):
    """Return all sequence lines of a FASTA input joined into one string.

    Args:
        file_obj: ``None``, a filesystem path given as ``str``, or an object
            with a ``read()`` method returning either ``bytes`` or ``str``.

    Returns:
        The concatenation of every non-header line with surrounding
        whitespace removed. Header lines (those whose first non-blank
        character is ``>``) are dropped; sequences from multiple records are
        concatenated. Returns ``""`` for ``None`` input or when reading
        fails (the failure is logged).
    """
    if file_obj is None:
        return ""

    try:
        if isinstance(file_obj, str):
            # Explicit encoding keeps the path branch consistent with the
            # UTF-8 decoding used for binary file objects below.
            with open(file_obj, "r", encoding="utf-8") as handle:
                content = handle.read()
        else:
            content = file_obj.read()
            # Text-mode file objects already yield str; only decode bytes.
            if isinstance(content, (bytes, bytearray)):
                content = content.decode("utf-8")

        # splitlines() copes with \n, \r\n and \r line endings alike, and the
        # header test runs on the stripped line so indented headers are
        # recognised too.
        stripped_lines = (line.strip() for line in content.splitlines())
        return "".join(
            line for line in stripped_lines if line and not line.startswith(">")
        )

    except Exception as e:
        logger.error(f"Failed to read FASTA file: {e}")
        return ""
495
-
496
  async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
497
  try:
498
  dna_input = read_fasta_file(fasta_file_obj)
@@ -549,18 +453,7 @@ async def health_check():
549
  },
550
  "paths": {
551
  "base_dir": BASE_DIR,
552
- "models_dir": MODELS_DIR,
553
- "models_dir_exists": os.path.exists(MODELS_DIR),
554
- "csv_path": csv_path,
555
- "csv_exists": os.path.exists(csv_path)
556
- },
557
- "model_repos": {
558
- "boundary_model": boundary_model_repo,
559
- "other_models": other_models_repo
560
- },
561
- "recommendations": {
562
- "models": "Models loaded from local directory" if (boundary_model and keras_model) else "Check models directory",
563
- "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
564
  }
565
  }
566
  except Exception as e:
@@ -582,15 +475,15 @@ async def analyze_sequence(request: AnalysisRequest):
582
  except Exception as e:
583
  logger.error(f"Analyze error: {e}")
584
  return AnalysisResponse(
585
- boundary_output="", keras_output="", ml_tree_output="",
586
  tree_analysis_output="", summary_output="",
587
  success=False, error_message=str(e)
588
  )
589
 
590
  @app.post("/analyze-file")
591
  async def analyze_file(
592
- file: UploadFile = File(...),
593
- similarity_score: float = Form(95.0),
594
  build_ml_tree: bool = Form(False)
595
  ):
596
  temp_file_path = None
@@ -599,9 +492,7 @@ async def analyze_file(
599
  content = await file.read()
600
  temp_file.write(content)
601
  temp_file_path = temp_file.name
602
-
603
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
604
-
605
  return AnalysisResponse(
606
  boundary_output=result[0] or "",
607
  keras_output=result[1] or "",
@@ -613,7 +504,7 @@ async def analyze_file(
613
  except Exception as e:
614
  logger.error(f"Analyze-file error: {e}")
615
  return AnalysisResponse(
616
- boundary_output="", keras_output="", ml_tree_output="",
617
  tree_analysis_output="", summary_output="",
618
  success=False, error_message=str(e)
619
  )
@@ -624,7 +515,7 @@ async def analyze_file(
624
  except:
625
  pass
626
 
627
- # --- Fixed Gradio Interface ---
628
  def create_gradio_interface():
629
  try:
630
  with gr.Blocks(
@@ -638,10 +529,7 @@ def create_gradio_interface():
638
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
639
  """
640
  ) as iface:
641
-
642
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
643
-
644
- # Status display
645
  with gr.Row():
646
  with gr.Column():
647
  status_display = gr.HTML(value=f"""
@@ -649,281 +537,211 @@ def create_gradio_interface():
649
  <h3>🔧 System Status</h3>
650
  <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
651
  <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
652
- <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}
653
- <p>🔬 MAFFT/IQ-TREE: {'✅ Available' if check_tool_availability()[0] and check_tool_availability()[1] else '❌ Missing'}</p>
 
 
654
  """)
655
-
656
- # Input tabs
657
  with gr.Tabs():
658
  with gr.TabItem("📝 Text Input"):
659
- dna_input = gr.Textbox(
660
- label="🧬 DNA Sequence",
661
- placeholder="Enter DNA sequence (ATCG format)...",
662
- lines=5,
663
- max_lines=10
664
- )
665
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  with gr.TabItem("📁 File Upload"):
667
- fasta_file = gr.File(
668
- label="📄 Upload FASTA File",
669
- file_types=[".fasta", ".fa", ".txt"],
670
- file_count="single"
671
- )
672
-
673
- # Analysis options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
  with gr.Row():
675
  with gr.Column():
676
- similarity_slider = gr.Slider(
677
- minimum=1,
678
- maximum=99,
679
- value=95,
680
- step=1,
681
- label="🎯 Similarity Threshold (%)",
682
- info="Minimum similarity for phylogenetic analysis"
683
- )
684
-
685
- with gr.Column():
686
- ml_tree_checkbox = gr.Checkbox(
687
- label="🌲 Build ML Tree",
688
- value=False,
689
- info="Perform phylogenetic placement (slower)"
690
- )
691
-
692
- # Action buttons
693
- with gr.Row():
694
- analyze_text_btn = gr.Button("🔍 Analyze Text", variant="primary", size="lg")
695
- analyze_file_btn = gr.Button("📁 Analyze File", variant="secondary", size="lg")
696
- clear_btn = gr.Button("🗑️ Clear", variant="stop")
697
-
698
- # Results section
699
- gr.Markdown("## 📊 Analysis Results")
700
-
701
- with gr.Tabs():
702
- with gr.TabItem("🎯 Boundary Prediction"):
703
  boundary_output = gr.Textbox(
704
- label="🔍 F Gene Boundary Detection",
705
- lines=3,
706
- interactive=False
707
  )
708
-
709
- with gr.TabItem("🧠 Keras Validation"):
710
  keras_output = gr.Textbox(
711
- label="🤖 Neural Network Validation",
712
- lines=3,
713
- interactive=False
714
  )
715
-
716
- with gr.TabItem("🌲 ML Tree Placement"):
717
  ml_tree_output = gr.Textbox(
718
- label="🌳 Maximum Likelihood Tree",
719
- lines=5,
720
- interactive=False
721
  )
722
-
723
- with gr.TabItem("📈 Tree Analysis"):
724
  tree_analysis_output = gr.Textbox(
725
- label="📊 Phylogenetic Analysis",
726
- lines=5,
727
- interactive=False
728
- )
729
-
730
- with gr.TabItem("📋 Summary"):
731
- summary_output = gr.Textbox(
732
- label="📝 Analysis Summary",
733
- lines=10,
734
- interactive=False
735
  )
736
-
737
- # Visualization section
 
 
 
 
 
 
738
  with gr.Tabs():
739
  with gr.TabItem("🌳 Interactive Tree"):
740
  tree_html = gr.HTML(
741
- label="Phylogenetic Tree Visualization",
742
- value="<div style='text-align: center; padding: 20px; color: #666;'>Tree visualization will appear here after analysis</div>"
743
  )
744
-
745
  with gr.TabItem("📊 Detailed Report"):
746
  report_html = gr.HTML(
747
  label="Analysis Report",
748
- value="<div style='text-align: center; padding: 20px; color: #666;'>Detailed report will appear here after analysis</div>"
749
  )
750
-
751
- # File downloads
752
- gr.Markdown("## 📥 Download Results")
753
- with gr.Row():
754
- aligned_file = gr.File(
755
- label="📄 Aligned Sequences",
756
- interactive=False
757
- )
758
- tree_file = gr.File(
759
- label="🌳 Tree File",
760
- interactive=False
761
- )
762
-
763
- # Event handlers
764
- def clear_all():
765
- return (
766
- "", # dna_input
767
- None, # fasta_file
768
- "", # boundary_output
769
- "", # keras_output
770
- "", # ml_tree_output
771
- "", # tree_analysis_output
772
- "", # summary_output
773
- "<div style='text-align: center; padding: 20px; color: #666;'>Tree visualization will appear here after analysis</div>", # tree_html
774
- "<div style='text-align: center; padding: 20px; color: #666;'>Detailed report will appear here after analysis</div>", # report_html
775
- None, # aligned_file
776
- None # tree_file
777
- )
778
-
779
- # Text analysis
780
- analyze_text_btn.click(
781
  fn=run_pipeline,
782
- inputs=[dna_input, similarity_slider, ml_tree_checkbox],
783
  outputs=[
784
- boundary_output,
785
- keras_output,
786
- ml_tree_output,
787
- tree_analysis_output,
788
- summary_output,
789
- aligned_file,
790
- tree_file,
791
- gr.State(), # placeholder for additional outputs
792
- gr.State(), # placeholder for additional outputs
793
- tree_html,
794
- report_html
795
  ]
796
  )
797
-
798
- # File analysis
799
  analyze_file_btn.click(
800
  fn=run_pipeline_from_file,
801
- inputs=[fasta_file, similarity_slider, ml_tree_checkbox],
802
  outputs=[
803
- boundary_output,
804
- keras_output,
805
- ml_tree_output,
806
- tree_analysis_output,
807
- summary_output,
808
- aligned_file,
809
- tree_file,
810
- gr.State(), # placeholder for additional outputs
811
- gr.State(), # placeholder for additional outputs
812
- tree_html,
813
- report_html
814
  ]
815
  )
816
-
817
- # Clear button
818
- clear_btn.click(
819
- fn=clear_all,
820
- outputs=[
821
- dna_input,
822
- fasta_file,
823
- boundary_output,
824
- keras_output,
825
- ml_tree_output,
826
- tree_analysis_output,
827
- summary_output,
828
- tree_html,
829
- report_html,
830
- aligned_file,
831
- tree_file
832
- ]
833
- )
834
-
835
- # Examples
836
- gr.Markdown("## 🧪 Example Sequences")
837
  gr.Examples(
838
- examples=[
839
- ["ATGAAACTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAGTGCAGCAGCAGCAGCAGCAGCAGCAGCAGC", 95.0, False],
840
- ["ATGAAACTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAGTGCAGCAGCAGCAGCAGCAGCAGCAGCAGC", 85.0, True],
841
- ["ATGGAGCTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAG", 90.0, False]
842
- ],
843
- inputs=[dna_input, similarity_slider, ml_tree_checkbox],
844
  label="Click to load example sequences"
845
  )
846
-
847
- # Footer
848
- gr.Markdown("""
849
- ---
850
-
851
- ### 🔬 About This Pipeline
852
-
853
- This tool performs comprehensive analysis of DNA sequences using multiple approaches:
854
-
855
- - **🎯 Boundary Detection**: Identifies F gene regions using ML models
856
- - **🧠 Keras Validation**: Neural network-based sequence validation
857
- - **🌲 ML Tree Placement**: Phylogenetic placement using MAFFT + IQ-TREE
858
- - **📈 Tree Analysis**: Interactive phylogenetic analysis and visualization
859
-
860
- ### 📝 Usage Notes
861
-
862
- - Sequences should be in ATCG format (other characters will be converted to N)
863
- - Minimum 100 bp recommended for phylogenetic placement
864
- - Higher similarity thresholds = fewer but more similar sequences
865
- - ML tree building requires MAFFT and IQ-TREE (slower but more accurate)
866
-
867
- ### ⚠️ System Requirements
868
-
869
- - Python packages: gradio, torch, tensorflow, biopython, plotly
870
- - Bioinformatics tools: MAFFT, IQ-TREE (optional for ML placement)
871
- - Pre-trained models: boundary detection + keras validation models
872
- """)
873
-
874
  return iface
875
-
876
  except Exception as e:
877
  logger.error(f"Failed to create Gradio interface: {e}")
878
- # Fallback simple interface
879
- with gr.Blocks() as fallback_iface:
880
- gr.Markdown("# 🧬 Gene Analysis Pipeline (Fallback Mode)")
881
- gr.Markdown(f"⚠️ Error creating full interface: {str(e)}")
882
-
883
- dna_input = gr.Textbox(label="DNA Sequence", lines=5)
884
- analyze_btn = gr.Button("Analyze")
885
- output = gr.Textbox(label="Results", lines=10)
886
-
887
- analyze_btn.click(
888
- fn=lambda seq: run_pipeline(seq, 95.0, False)[4], # Just return summary
889
- inputs=[dna_input],
890
- outputs=[output]
891
- )
892
-
893
- return fallback_iface
894
 
895
  # --- Application Startup ---
896
- if __name__ == "__main__":
897
  try:
898
- # Create Gradio interface
899
- gr_interface = create_gradio_interface()
900
-
901
- # Mount Gradio app to FastAPI
902
- gr_app = gr.mount_gradio_app(app, gr_interface, path="/gradio")
903
-
904
- # Log startup info
905
- logger.info("🚀 Starting Gene Analysis Pipeline...")
906
- logger.info(f"📁 Base directory: {BASE_DIR}")
907
- logger.info(f"🤖 Models loaded: Boundary={boundary_model is not None}, Keras={keras_model is not None}")
908
- logger.info(f"🌳 Tree analyzer: {analyzer is not None}")
909
-
910
- mafft_available, iqtree_available, _, _ = check_tool_availability()
911
- logger.info(f"🔬 Tools available: MAFFT={mafft_available}, IQ-TREE={iqtree_available}")
912
-
913
- # Start server
914
- logger.info("🌐 Starting server on http://0.0.0.0:7860")
915
- logger.info("📊 FastAPI docs: http://0.0.0.0:7860/docs")
916
- logger.info("🎮 Gradio interface: http://0.0.0.0:7860/gradio")
917
-
918
- uvicorn.run(
919
- app,
920
- host="0.0.0.0",
921
- port=7860,
922
- log_level="info",
923
- access_log=True
924
- )
925
-
926
  except Exception as e:
927
- logger.error(f"❌ Startup failed: {e}")
928
- print(f"❌ Failed to start application: {e}")
929
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # Disable GPU to avoid CUDA errors
3
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
4
+ # Suppress TensorFlow warnings
5
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
6
+
7
  import gradio as gr
8
  import torch
9
  import pickle
10
  import subprocess
11
  import pandas as pd
 
12
  import re
13
  import logging
14
  import numpy as np
 
27
  import stat
28
  import time
29
  import asyncio
 
 
30
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
31
  from fastapi.responses import HTMLResponse
32
  from pydantic import BaseModel
33
  from typing import Optional
34
  import uvicorn
35
 
36
+ # Log Gradio version
37
+ logger = logging.getLogger(__name__)
38
+ logger.info(f"Gradio version: {gr.__version__}")
39
+
40
  # Set event loop policy for Spaces
41
  try:
42
  asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
 
50
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
51
  log_handler = logging.StreamHandler()
52
  log_handler.setFormatter(log_formatter)
 
 
53
  try:
54
  file_handler = logging.FileHandler('/tmp/app.log')
55
  file_handler.setFormatter(log_formatter)
 
57
  except Exception:
58
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
59
 
 
 
60
  # --- Global Variables ---
61
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
62
+ MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
63
+ IQTREE_PATH = os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
 
64
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
65
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
66
+ QUERY_OUTPUT_DIR = os.path.join(BASE_DIR, "queries")
67
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
68
 
69
+ # --- Model Configuration ---
70
+ MODEL_REPO = "GGproject10/best_boundary_aware_model"
71
+ CSV_PATH = "f cleaned.csv"
 
 
72
 
73
  # Initialize models as None
74
  boundary_model = None
 
76
  kmer_to_index = None
77
  analyzer = None
78
 
79
+ # --- Model Loading ---
80
def load_models_safely():
    """Populate the module-level model globals, tolerating individual failures.

    Three independent stages run in order, each inside its own ``try`` so
    that one failure does not stop the others:

    1. Download the boundary model weights from ``MODEL_REPO`` and wrap them
       in ``EnhancedGenePredictor``.
    2. Download the Keras model and the pickled k-mer index from
       ``MODEL_REPO`` and load both.
    3. Create a ``PhylogeneticTreeAnalyzer``, load the first CSV candidate
       that exists and is accepted by ``load_data``, then call
       ``train_ai_model``.

    Nothing is returned. Failures are logged; ``analyzer`` is reset to
    ``None`` when no CSV could be loaded or initialization raised.
    """
    global boundary_model, keras_model, kmer_to_index, analyzer
    logger.info("🔍 Loading models...")

    # --- Stage 1: boundary model ---
    try:
        weights_file = hf_hub_download(
            repo_id=MODEL_REPO,
            filename="best_boundary_aware_model.pth",
            token=None,
        )
        if not os.path.exists(weights_file):
            logger.error(f"❌ Boundary model file not found after download from {MODEL_REPO}")
        else:
            boundary_model = EnhancedGenePredictor(weights_file)
            logger.info("✅ Boundary model loaded successfully from Hugging Face Hub.")
    except Exception as exc:
        logger.error(f"❌ Failed to load boundary model from HF Hub: {exc}. Ensure {MODEL_REPO} is public and accessible.")

    # --- Stage 2: Keras model + k-mer vocabulary ---
    try:
        keras_file = hf_hub_download(
            repo_id=MODEL_REPO,
            filename="best_model.keras",
            token=None,
        )
        vocab_file = hf_hub_download(
            repo_id=MODEL_REPO,
            filename="kmer_to_index.pkl",
            token=None,
        )
        both_present = os.path.exists(keras_file) and os.path.exists(vocab_file)
        if not both_present:
            logger.error(f"❌ Keras model or kmer files not found after download from {MODEL_REPO}")
        else:
            keras_model = load_model(keras_file)
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # this is only safe while MODEL_REPO is a trusted repository.
            with open(vocab_file, "rb") as vocab_handle:
                kmer_to_index = pickle.load(vocab_handle)
            logger.info("✅ Keras model and k-mer index loaded successfully from Hugging Face Hub.")
    except Exception as exc:
        logger.error(f"❌ Failed to load Keras model from HF Hub: {exc}. Ensure {MODEL_REPO} is public and accessible.")

    # --- Stage 3: tree analyzer ---
    try:
        logger.info("🌳 Initializing tree analyzer...")
        analyzer = PhylogeneticTreeAnalyzer()

        # Candidate locations are probed in this order; the first one that
        # exists and that load_data() accepts wins.
        candidate_paths = [
            CSV_PATH,
            os.path.join(BASE_DIR, CSV_PATH),
            os.path.join(BASE_DIR, "app", CSV_PATH),
            os.path.join(os.path.dirname(__file__), CSV_PATH),
            "f_cleaned.csv",
            os.path.join(BASE_DIR, "f_cleaned.csv"),
        ]

        data_ready = False
        for candidate in candidate_paths:
            if not os.path.exists(candidate):
                continue
            logger.info(f"📊 Trying CSV: {candidate}")
            try:
                if analyzer.load_data(candidate):
                    logger.info(f"✅ CSV loaded from: {candidate}")
                    data_ready = True
                    break
            except Exception as exc:
                logger.warning(f"CSV load failed for {candidate}: {exc}")
                continue

        if data_ready:
            # Training failure is non-fatal: the analyzer stays available.
            try:
                if analyzer.train_ai_model():
                    logger.info("✅ AI model training completed successfully")
                else:
                    logger.warning("⚠️ AI model training failed; proceeding with basic analysis.")
            except Exception as exc:
                logger.warning(f"⚠️ AI model training failed: {exc}")
        else:
            logger.error("❌ Failed to load CSV data from any candidate location. Place 'f cleaned.csv' in project root.")
            analyzer = None
    except Exception as exc:
        logger.error(f"❌ Tree analyzer initialization failed: {exc}")
        analyzer = None
159
 
160
  # Load models at startup
 
172
 
173
  def check_tool_availability():
174
  setup_binary_permissions()
 
 
175
  mafft_available = False
176
  mafft_cmd = None
177
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
 
178
  for candidate in mafft_candidates:
179
  if shutil.which(candidate) or os.path.exists(candidate):
180
  try:
181
  result = subprocess.run(
182
+ [candidate, "--help"],
183
+ capture_output=True,
184
+ text=True,
185
  timeout=5
186
  )
187
  if result.returncode == 0 or "mafft" in result.stderr.lower():
 
191
  break
192
  except Exception as e:
193
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
 
 
194
  iqtree_available = False
195
  iqtree_cmd = None
196
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
 
197
  for candidate in iqtree_candidates:
198
  if shutil.which(candidate) or os.path.exists(candidate):
199
  try:
200
  result = subprocess.run(
201
+ [candidate, "--help"],
202
+ capture_output=True,
203
+ text=True,
204
  timeout=5
205
  )
206
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
 
210
  break
211
  except Exception as e:
212
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
 
213
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
214
 
215
+ # --- Pipeline Functions ---
216
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
217
  try:
218
  if len(sequence.strip()) < 100:
219
  return False, "Sequence too short (<100 bp).", None, None
 
220
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
221
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
222
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
223
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
 
224
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
225
  return False, "Reference alignment or tree not found.", None, None
 
226
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
227
  SeqIO.write([query_record], query_fasta, "fasta")
 
228
  with open(aligned_with_query, "w") as output_file:
229
  subprocess.run([
230
  mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
231
  ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
 
232
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
233
  return False, "MAFFT alignment failed.", None, None
 
234
  subprocess.run([
235
+ iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
236
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
237
  ], capture_output=True, text=True, timeout=1200, check=True)
 
238
  treefile = f"{output_prefix}.treefile"
239
  if not os.path.exists(treefile):
240
  return False, "IQ-TREE placement failed.", aligned_with_query, None
 
241
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
242
  return True, success_msg, aligned_with_query, treefile
 
243
  except Exception as e:
244
  logger.error(f"Phylogenetic placement failed: {e}")
245
  return False, f"Error: {str(e)}", None, None
 
250
  except:
251
  pass
252
 
253
def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
    """Run the similarity-based tree analysis for a single query sequence.

    Args:
        sequence: Query DNA sequence; must be at least 10 characters after
            stripping whitespace.
        matching_percentage: Similarity threshold, accepted in the range 1-99.

    Returns:
        A 3-tuple ``(message, tree_html_path, report_html_path)``. On any
        validation failure or exception the two paths are ``None`` and the
        message starts with "❌". ``report_html_path`` is also ``None`` when
        ``generate_detailed_report`` reports failure.
    """
    try:
        # Validation guards, cheapest first.
        if not analyzer:
            return "❌ Tree analyzer not initialized.", None, None
        if not sequence or len(sequence.strip()) < 10:
            return "❌ Invalid sequence.", None, None
        if not (1 <= matching_percentage <= 99):
            return "❌ Matching percentage must be 1-99.", None, None
        if not analyzer.find_query_sequence(sequence):
            return "❌ Sequence not accepted.", None, None

        similar_ids, achieved_pct = analyzer.find_similar_sequences(matching_percentage)
        if not similar_ids:
            return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None

        # Build the tree and render it to a standalone HTML file under /tmp.
        analyzer.build_tree_structure_with_ml_safe(similar_ids)
        figure = analyzer.create_interactive_tree(similar_ids, achieved_pct)
        run_label = analyzer.query_id or f"query_{int(time.time())}"
        tree_html_path = os.path.join("/tmp", f"phylogenetic_tree_{run_label}.html")
        figure.write_html(tree_html_path)

        # The requested threshold is stored on the analyzer before the report
        # is generated.
        analyzer.matching_percentage = matching_percentage
        if analyzer.generate_detailed_report(similar_ids, achieved_pct):
            # NOTE(review): assumes the report is written to this /tmp path —
            # confirm against PhylogeneticTreeAnalyzer.generate_detailed_report.
            report_html_path = os.path.join("/tmp", f"detailed_report_{run_label}.html")
        else:
            report_html_path = None

        status = f"✅ Found {len(similar_ids)} sequences at {achieved_pct:.2f}% similarity."
        return status, tree_html_path, report_html_path
    except Exception as exc:
        logger.error(f"Tree analysis failed: {exc}")
        return f"❌ Error: {str(exc)}", None, None
278
+
279
def predict_with_keras(sequence):
    """Score a sequence with the Keras model and format the result.

    The sequence is cut into overlapping 6-mers, each mapped through
    ``kmer_to_index`` (unknown k-mers map to 0), and the resulting index row
    is passed to ``keras_model.predict``. The last element of the first
    prediction is read as the F gene probability.

    Returns:
        A status string: "✅ N% F gene confidence" on success, or a "❌ ..."
        message when the model is unavailable, the sequence is shorter than
        6 characters, or prediction raises.
    """
    try:
        if not (keras_model and kmer_to_index):
            return "❌ Keras model not available."
        if len(sequence) < 6:
            return "❌ Sequence too short (<6 bp)."

        # One vocabulary index per overlapping 6-mer window.
        window_count = len(sequence) - 5
        encoded = [
            kmer_to_index.get(sequence[start:start + 6], 0)
            for start in range(window_count)
        ]
        batch = np.array([encoded])

        scores = keras_model.predict(batch, verbose=0)[0]
        f_gene_prob = scores[-1]

        # The displayed percentage carries a +5 offset and is clamped to 0-100.
        raw_percentage = int(f_gene_prob * 100 + 5)
        percentage = min(100, max(0, raw_percentage))

        return f"✅ {percentage}% F gene confidence"
    except Exception as exc:
        logger.error(f"Keras prediction failed: {exc}")
        return f"❌ Error: {str(exc)}"
295
 
296
def read_fasta_file(file_obj):
    """Return the sequence text contained in a FASTA upload.

    Args:
        file_obj: ``None``, a filesystem path (``str`` or ``os.PathLike``), or
            a file-like object whose ``read()`` returns ``bytes`` or ``str``.

    Returns:
        All non-header lines, stripped and concatenated into one string.
        Header lines (starting with ">") and blank lines are dropped. If the
        file holds several records, their sequences are concatenated, as
        before. Returns "" for ``None`` input or on any read/decode failure
        (the failure is logged).
    """
    if file_obj is None:
        return ""
    try:
        if isinstance(file_obj, (str, os.PathLike)):
            # Explicit encoding keeps path input consistent with the
            # bytes-decoding branch below instead of using the locale default.
            with open(file_obj, "r", encoding="utf-8") as handle:
                content = handle.read()
        else:
            content = file_obj.read()
            # Binary uploads yield bytes; text-mode handles already yield str.
            if isinstance(content, (bytes, bytearray)):
                content = content.decode("utf-8")

        # Strip before testing for ">" so an indented header is not mistaken
        # for sequence data; splitlines() also copes with CR-only endings.
        stripped_lines = (line.strip() for line in content.splitlines())
        return "".join(
            line for line in stripped_lines
            if line and not line.startswith(">")
        )
    except Exception as e:
        logger.error(f"Failed to read FASTA file: {e}")
        return ""
311
+
312
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
313
  try:
314
  dna_input = dna_input.upper().strip()
315
  if not dna_input:
316
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
 
 
317
  if not re.match('^[ACTGN]+$', dna_input):
318
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
 
319
  processed_sequence = dna_input
 
 
320
  boundary_output = ""
321
  if boundary_model:
322
  try:
 
333
  processed_sequence = dna_input
334
  else:
335
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
 
 
336
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
 
 
337
  aligned_file = None
338
  phy_file = None
339
  ml_tree_output = ""
 
340
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
341
  try:
342
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
 
353
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
354
  else:
355
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
 
 
356
  tree_html_content = "No tree generated."
357
  report_html_content = "No report generated."
358
  simplified_ml_output = ""
 
359
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
360
  try:
361
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
362
  simplified_ml_output = tree_result
 
363
  if tree_html_path and os.path.exists(tree_html_path):
364
  with open(tree_html_path, 'r', encoding='utf-8') as f:
365
  tree_html_content = f.read()
366
  else:
367
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
368
  if report_html_path and os.path.exists(report_html_path):
369
  with open(report_html_path, 'r', encoding='utf-8') as f:
370
  report_html_content = f.read()
371
  else:
372
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
373
  except Exception as e:
374
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
375
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
 
378
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
379
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
380
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
 
 
381
  summary_output = f"""
382
  📊 ANALYSIS SUMMARY:
383
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
388
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
389
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
390
  """
 
391
  return (
392
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
393
  aligned_file, phy_file, None, None, tree_html_content, report_html_content
394
  )
 
395
  except Exception as e:
396
  logger.error(f"Pipeline error: {e}")
397
  error_msg = f"❌ Pipeline Error: {str(e)}"
398
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
401
  try:
402
  dna_input = read_fasta_file(fasta_file_obj)
 
453
  },
454
  "paths": {
455
  "base_dir": BASE_DIR,
456
+ "query_output_dir": QUERY_OUTPUT_DIR
 
 
 
 
 
 
 
 
 
 
 
457
  }
458
  }
459
  except Exception as e:
 
475
  except Exception as e:
476
  logger.error(f"Analyze error: {e}")
477
  return AnalysisResponse(
478
+ boundary_output="", keras_output="", ml_tree_output="",
479
  tree_analysis_output="", summary_output="",
480
  success=False, error_message=str(e)
481
  )
482
 
483
  @app.post("/analyze-file")
484
  async def analyze_file(
485
+ file: UploadFile = File(...),
486
+ similarity_score: float = Form(95.0),
487
  build_ml_tree: bool = Form(False)
488
  ):
489
  temp_file_path = None
 
492
  content = await file.read()
493
  temp_file.write(content)
494
  temp_file_path = temp_file.name
 
495
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
 
496
  return AnalysisResponse(
497
  boundary_output=result[0] or "",
498
  keras_output=result[1] or "",
 
504
  except Exception as e:
505
  logger.error(f"Analyze-file error: {e}")
506
  return AnalysisResponse(
507
+ boundary_output="", keras_output="", ml_tree_output="",
508
  tree_analysis_output="", summary_output="",
509
  success=False, error_message=str(e)
510
  )
 
515
  except:
516
  pass
517
 
518
+ # --- Gradio Interface ---
519
  def create_gradio_interface():
520
  try:
521
  with gr.Blocks(
 
529
  .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
530
  """
531
  ) as iface:
 
532
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
 
 
533
  with gr.Row():
534
  with gr.Column():
535
  status_display = gr.HTML(value=f"""
 
537
  <h3>🔧 System Status</h3>
538
  <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
539
  <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
540
+ <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}</p>
541
+ <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
542
+ <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
543
+ </div>
544
  """)
 
 
545
  with gr.Tabs():
546
  with gr.TabItem("📝 Text Input"):
547
+ with gr.Row():
548
+ with gr.Column(scale=2):
549
+ dna_input = gr.Textbox(
550
+ label="🧬 DNA Sequence",
551
+ placeholder="Enter DNA sequence (ATCG format)...",
552
+ lines=5,
553
+ description="Paste your DNA sequence here"
554
+ )
555
+ with gr.Column(scale=1):
556
+ similarity_score = gr.Slider(
557
+ minimum=1,
558
+ maximum=99,
559
+ value=95.0,
560
+ step=1.0,
561
+ label="🎯 Similarity Threshold (%)",
562
+ description="Minimum similarity for tree analysis"
563
+ )
564
+ build_ml_tree = gr.Checkbox(
565
+ label="🌲 Build ML Tree",
566
+ value=False,
567
+ description="Generate phylogenetic placement (slower)"
568
+ )
569
+ analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
570
  with gr.TabItem("📁 File Upload"):
571
+ with gr.Row():
572
+ with gr.Column(scale=2):
573
+ file_input = gr.File(
574
+ label="📄 Upload FASTA File",
575
+ file_types=[".fasta", ".fa", ".fas", ".txt"],
576
+ description="Upload a FASTA file containing your sequence"
577
+ )
578
+ with gr.Column(scale=1):
579
+ file_similarity_score = gr.Slider(
580
+ minimum=1,
581
+ maximum=99,
582
+ value=95.0,
583
+ step=1.0,
584
+ label="🎯 Similarity Threshold (%)",
585
+ description="Minimum similarity for tree analysis"
586
+ )
587
+ file_build_ml_tree = gr.Checkbox(
588
+ label="🌲 Build ML Tree",
589
+ value=False,
590
+ description="Generate phylogenetic placement (slower)"
591
+ )
592
+ analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
593
+ gr.Markdown("## 📊 Analysis Results")
594
  with gr.Row():
595
  with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  boundary_output = gr.Textbox(
597
+ label="🎯 Boundary Detection",
598
+ interactive=False,
599
+ lines=2
600
  )
 
 
601
  keras_output = gr.Textbox(
602
+ label="🧠 F Gene Validation",
603
+ interactive=False,
604
+ lines=2
605
  )
606
+ with gr.Column():
 
607
  ml_tree_output = gr.Textbox(
608
+ label="🌲 Phylogenetic Placement",
609
+ interactive=False,
610
+ lines=2
611
  )
 
 
612
  tree_analysis_output = gr.Textbox(
613
+ label="🌳 Tree Analysis",
614
+ interactive=False,
615
+ lines=2
 
 
 
 
 
 
 
616
  )
617
+ summary_output = gr.Textbox(
618
+ label="📋 Summary",
619
+ interactive=False,
620
+ lines=8
621
+ )
622
+ with gr.Row():
623
+ aligned_file = gr.File(label="📄 Alignment File", visible=False)
624
+ tree_file = gr.File(label="🌲 Tree File", visible=False)
625
  with gr.Tabs():
626
  with gr.TabItem("🌳 Interactive Tree"):
627
  tree_html = gr.HTML(
628
+ label="Phylogenetic Tree",
629
+ value="<div style='text-align: center; padding: 20px; color: #666;'>No tree generated yet.</div>"
630
  )
 
631
  with gr.TabItem("📊 Detailed Report"):
632
  report_html = gr.HTML(
633
  label="Analysis Report",
634
+ value="<div style='text-align: center; padding: 20px; color: #666;'>No report generated yet.</div>"
635
  )
636
+ analyze_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
637
  fn=run_pipeline,
638
+ inputs=[dna_input, similarity_score, build_ml_tree],
639
  outputs=[
640
+ boundary_output, keras_output, ml_tree_output,
641
+ tree_analysis_output, summary_output,
642
+ aligned_file, tree_file, gr.State(), gr.State(),
643
+ tree_html, report_html
 
 
 
 
 
 
 
644
  ]
645
  )
 
 
646
  analyze_file_btn.click(
647
  fn=run_pipeline_from_file,
648
+ inputs=[file_input, file_similarity_score, file_build_ml_tree],
649
  outputs=[
650
+ boundary_output, keras_output, ml_tree_output,
651
+ tree_analysis_output, summary_output,
652
+ aligned_file, tree_file, gr.State(), gr.State(),
653
+ tree_html, report_html
 
 
 
 
 
 
 
654
  ]
655
  )
656
+ gr.Markdown("## 🔬 Example Sequences")
657
+ example_sequences = [
658
+ ["ATGGACTTCCAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 90.0, False],
659
+ ["ATGAAACAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 85.0, True]
660
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  gr.Examples(
662
+ examples=example_sequences,
663
+ inputs=[dna_input, similarity_score, build_ml_tree],
 
 
 
 
664
  label="Click to load example sequences"
665
  )
666
+ with gr.Accordion("❓ Help & Information", open=False):
667
+ gr.Markdown("""
668
+ ### 🧬 Gene Analysis Pipeline
669
+ This tool performs comprehensive analysis of F gene sequences:
670
+ **🎯 Boundary Detection**: Identifies F gene regions within your sequence
671
+ **🧠 F Gene Validation**: Validates sequence as F gene using deep learning
672
+ **🌲 Phylogenetic Placement**: Places sequence in reference phylogeny
673
+ **🌳 Tree Analysis**: Finds similar sequences and builds interactive trees
674
+ ### 📋 Input Requirements
675
+ - DNA sequences in ATCG format
676
+ - Minimum 10 bp for basic analysis
677
+ - Minimum 100 bp for phylogenetic placement
678
+ - FASTA files supported for upload
679
+ ### ⚙️ Parameters
680
+ - **Similarity Threshold**: Minimum % similarity for tree analysis (1-99%)
681
+ - **Build ML Tree**: Enable phylogenetic placement (requires MAFFT/IQ-TREE)
682
+ ### 📊 Output Files
683
+ - Alignment files (.fa format)
684
+ - Tree files (.treefile format)
685
+ - Interactive HTML visualizations
686
+ """)
 
 
 
 
 
 
 
687
  return iface
 
688
  except Exception as e:
689
  logger.error(f"Failed to create Gradio interface: {e}")
690
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
 
692
  # --- Application Startup ---
693
def mount_gradio_app():
    """Build the Gradio UI and mount it on the FastAPI ``app`` at ``/gradio``.

    The module-level ``app`` is rebound to the object returned by
    ``gr.mount_gradio_app``. Nothing is returned; success and failure are
    reported through the logger, and any exception is caught so that a broken
    UI does not prevent the API itself from starting.
    """
    # `app` is assigned below. Without this declaration Python treats it as a
    # local name, so reading it as the first argument raises UnboundLocalError
    # and the interface is never mounted.
    global app
    try:
        gradio_app = create_gradio_interface()
        # create_gradio_interface() returns None when construction failed.
        if gradio_app is not None:
            app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
            logger.info("✅ Gradio interface mounted at /gradio")
        else:
            logger.error("❌ Failed to create Gradio interface")
    except Exception as e:
        logger.error(f"❌ Failed to mount Gradio app: {e}")
703
+
704
+ # Initialize Gradio
705
+ mount_gradio_app()
706
+
707
+
708
+ # --- Main Application ---
709
if __name__ == "__main__":
    # Script entry point: either launch the Gradio UI on its own, or serve the
    # FastAPI app (with Gradio mounted under /gradio) through uvicorn.
    import argparse
    import sys  # imported here so sys.exit() below never depends on a file-level import

    parser = argparse.ArgumentParser(description="🧬 Gene Analysis Pipeline")
    parser.add_argument("--host", default="0.0.0.0", help="Host address")
    parser.add_argument("--port", type=int, default=7860, help="Port number")
    parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
    parser.add_argument("--gradio-only", action="store_true", help="Run Gradio interface only")
    args = parser.parse_args()

    if args.gradio_only:
        logger.info("🚖 Starting Gradio interface only...")
        iface = create_gradio_interface()
        if iface is None:
            logger.error("Failed to create Gradio interface")
            sys.exit(1)
        iface.launch(
            server_name=args.host,
            server_port=args.port,
            share=False,
            show_error=True,
        )
    else:
        # A wildcard bind address is not browsable, so show localhost in the
        # hint URLs; always report the port that was actually requested.
        display_host = "localhost" if args.host in ("0.0.0.0", "::") else args.host
        logger.info(f"🚖 Starting Gene Analysis Pipeline on {args.host}:{args.port}")
        logger.info(f"📖 API Documentation: http://{display_host}:{args.port}/docs")
        logger.info(f"🧬 Gradio Interface: http://{display_host}:{args.port}/gradio")
        try:
            uvicorn.run(
                # Auto-reload needs an import string rather than the app object.
                "app:app" if args.reload else app,
                host=args.host,
                port=args.port,
                reload=args.reload,
                log_level="info",
            )
        except KeyboardInterrupt:
            logger.info("🛑 Application stopped by user")
        except Exception as e:
            logger.error(f"❌ Application failed: {e}")
            sys.exit(1)