re-type committed on
Commit
fd13101
·
verified ·
1 Parent(s): 3c94bd8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +366 -558
app.py CHANGED
@@ -1,19 +1,14 @@
1
  import os
2
- # Disable GPU to avoid CUDA errors
3
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
4
- os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true" # Prevent TensorFlow memory issues
5
- # Suppress TensorFlow warnings
6
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # More aggressive suppression
7
-
8
- import gradio as gr
9
- import torch
10
  import pickle
11
  import subprocess
12
  import pandas as pd
13
  import re
14
- import logging
15
  import numpy as np
16
- from predictor import EnhancedGenePredictor
 
 
 
17
  from tensorflow.keras.models import load_model
18
  from analyzer import PhylogeneticTreeAnalyzer
19
  import tempfile
@@ -27,33 +22,23 @@ from Bio.Seq import Seq
27
  from Bio.SeqRecord import SeqRecord
28
  import stat
29
  import time
30
- import asyncio
31
- from fastapi import FastAPI, File, UploadFile, Form, HTTPException
32
- from fastapi.responses import HTMLResponse, FileResponse
33
- from pydantic import BaseModel
34
- from typing import Optional
35
- import uvicorn
36
 
37
  # --- Logging Setup ---
38
- log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
39
- log_handler = logging.StreamHandler()
40
- log_handler.setFormatter(log_formatter)
41
- try:
42
- file_handler = logging.FileHandler('/tmp/app.log')
43
- file_handler.setFormatter(log_formatter)
44
- logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
45
- except Exception as e:
46
- logging.basicConfig(level=logging.INFO, handlers=[log_handler])
47
- logging.warning(f"Failed to set up file logging: {e}")
48
-
49
  logger = logging.getLogger(__name__)
50
- logger.info(f"Gradio version: {gr.__version__}")
51
 
52
- # Set event loop policy for compatibility with Gradio Spaces
53
- try:
54
- asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
55
- except Exception as e:
56
- logger.warning(f"Failed to set event loop policy: {e}")
57
 
58
  # --- Global Variables ---
59
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -63,59 +48,69 @@ ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
63
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
64
  QUERY_OUTPUT_DIR = os.path.join(BASE_DIR, "queries")
65
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
66
-
67
- # Model repository and file paths
68
  MODEL_REPO = "GGproject10/best_boundary_aware_model"
69
  CSV_PATH = "f cleaned.csv"
 
70
 
71
- # Initialize models as None
72
  boundary_model = None
73
  keras_model = None
74
  kmer_to_index = None
75
  analyzer = None
76
 
77
- # --- Model Loading ---
78
  def load_models_safely():
79
  global boundary_model, keras_model, kmer_to_index, analyzer
80
  logger.info("🔍 Loading models...")
 
 
81
  try:
82
- boundary_path = hf_hub_download(
83
- repo_id=MODEL_REPO,
84
- filename="best_boundary_aware_model.pth",
85
- token=None
86
- )
87
- if os.path.exists(boundary_path):
88
- boundary_model = EnhancedGenePredictor(boundary_path)
89
- logger.info("✅ Boundary model loaded successfully.")
90
- else:
91
- logger.error(f"❌ Boundary model file not found after download.")
 
92
  except Exception as e:
93
  logger.error(f"❌ Failed to load boundary model: {e}")
94
  boundary_model = None
 
 
95
  try:
96
- keras_path = hf_hub_download(
97
- repo_id=MODEL_REPO,
98
- filename="best_model.keras",
99
- token=None
100
- )
101
- kmer_path = hf_hub_download(
102
- repo_id=MODEL_REPO,
103
- filename="kmer_to_index.pkl",
104
- token=None
105
- )
106
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
107
- keras_model = load_model(keras_path)
108
- with open(kmer_path, "rb") as f:
109
- kmer_to_index = pickle.load(f)
110
- logger.info("✅ Keras model and k-mer index loaded successfully.")
111
- else:
112
- logger.error(f"❌ Keras model or k-mer files not found.")
 
 
 
 
 
113
  except Exception as e:
114
  logger.error(f"❌ Failed to load Keras model: {e}")
115
  keras_model = None
116
  kmer_to_index = None
 
 
117
  try:
118
- logger.info("🌳 Initializing tree analyzer...")
119
  analyzer = PhylogeneticTreeAnalyzer()
120
  csv_candidates = [
121
  CSV_PATH,
@@ -128,94 +123,129 @@ def load_models_safely():
128
  csv_loaded = False
129
  for csv_candidate in csv_candidates:
130
  if os.path.exists(csv_candidate):
131
- logger.info(f"📊 Trying CSV: {csv_candidate}")
132
- try:
133
- if analyzer.load_data(csv_candidate):
134
- logger.info(f"✅ CSV loaded from: {csv_candidate}")
135
- csv_loaded = True
136
- break
137
- except Exception as e:
138
- logger.warning(f"CSV load failed for {csv_candidate}: {e}")
139
- continue
140
  if not csv_loaded:
141
- logger.error("❌ Failed to load CSV data from any candidate location.")
142
  analyzer = None
143
  else:
144
- try:
145
- if analyzer.train_ai_model():
146
- logger.info("✅ AI model training completed successfully")
147
- else:
148
- logger.warning("⚠️ AI model training failed; proceeding with basic analysis.")
149
- except Exception as e:
150
- logger.warning(f"⚠️ AI model training failed: {e}")
151
  except Exception as e:
152
- logger.error(f"❌ Tree analyzer initialization failed: {e}")
153
  analyzer = None
154
 
155
- # Load models at startup
156
- load_models_safely()
 
 
 
157
 
158
  # --- Tool Detection ---
159
  def setup_binary_permissions():
160
  for binary in [MAFFT_PATH, IQTREE_PATH]:
161
  if os.path.exists(binary):
162
- try:
163
- os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
164
- logger.info(f"Set executable permission on {binary}")
165
- except Exception as e:
166
- logger.warning(f"Failed to set permission on {binary}: {e}")
167
 
168
  def check_tool_availability():
169
  setup_binary_permissions()
170
  mafft_available = False
171
  mafft_cmd = None
172
- mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
 
 
 
 
 
 
 
 
 
 
 
 
173
  for candidate in mafft_candidates:
174
- if shutil.which(candidate) or os.path.exists(candidate):
175
  try:
176
  result = subprocess.run(
177
  [candidate, "--help"],
178
  capture_output=True,
179
  text=True,
180
- timeout=5
181
  )
182
  if result.returncode == 0 or "mafft" in result.stderr.lower():
183
  mafft_available = True
184
  mafft_cmd = candidate
185
- logger.info(f"✅ MAFFT found at: {candidate}")
186
  break
187
  except Exception as e:
188
- logger.debug(f"MAFFT test failed for {candidate}: {e}")
189
  iqtree_available = False
190
  iqtree_cmd = None
191
- iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  for candidate in iqtree_candidates:
193
- if shutil.which(candidate) or os.path.exists(candidate):
194
  try:
195
  result = subprocess.run(
196
  [candidate, "--help"],
197
  capture_output=True,
198
  text=True,
199
- timeout=5
200
  )
201
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
202
  iqtree_available = True
203
  iqtree_cmd = candidate
204
- logger.info(f"✅ IQ-TREE found at: {candidate}")
205
  break
206
  except Exception as e:
207
- logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
208
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # --- Pipeline Functions ---
211
- def cleanup_file(file_path: str) -> None:
212
- """Utility function to safely delete a file and log errors."""
213
  if file_path and os.path.exists(file_path):
214
  try:
215
  os.unlink(file_path)
216
  logger.debug(f"Cleaned up {file_path}")
217
- except Exception as cleanup_error:
218
- logger.warning(f"Failed to clean up {file_path}: {cleanup_error}")
219
 
220
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
221
  query_fasta = None
@@ -227,8 +257,7 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
227
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
228
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
229
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
230
- cleanup_file(query_fasta)
231
- return False, "Reference alignment or tree not found.", None, None
232
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
233
  SeqIO.write([query_record], query_fasta, "fasta")
234
  with open(aligned_with_query, "w") as output_file:
@@ -257,49 +286,88 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
257
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
258
  cleanup_file(query_fasta)
259
  return True, success_msg, aligned_with_query, treefile
260
- except Exception as main_error:
261
- logger.error(f"Phylogenetic placement failed: {main_error}", exc_info=True)
262
  cleanup_file(query_fasta)
263
- return False, f"Error: {str(main_error)}", None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
266
  try:
267
- logger.debug("Starting tree analysis...")
268
  if not analyzer:
269
- return "❌ Tree analyzer not initialized.", None, None
270
  if not sequence or len(sequence.strip()) < 10:
271
- return "❌ Invalid sequence.", None, None
272
  if not (1 <= matching_percentage <= 99):
273
- return "❌ Matching percentage must be 1-99.", None, None
274
- logger.debug("Finding query sequence...")
275
  if not analyzer.find_query_sequence(sequence):
276
- return "❌ Sequence not accepted.", None, None
277
- logger.debug("Finding similar sequences...")
278
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
279
  if not matched_ids:
280
- return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
281
- logger.debug("Building tree structure...")
282
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
283
- logger.debug("Creating interactive tree...")
284
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
285
  query_id = analyzer.query_id or f"query_{int(time.time())}"
286
- tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
287
- logger.debug(f"Saving tree to {tree_html_path}")
288
- fig.write_html(tree_html_path)
289
- analyzer.matching_percentage = matching_percentage
290
- logger.debug("Generating detailed report...")
291
- report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
292
- report_html_path = os.path.join("/tmp", f'detailed_report_{query_id}.html') if report_success else None
293
- logger.debug(f"Tree analysis completed: {len(matched_ids)} matches")
294
- return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
295
  except Exception as e:
296
- logger.error(f"Tree analysis failed: {e}", exc_info=True)
297
- return f"❌ Error: {str(e)}", None, None
298
 
299
  def predict_with_keras(sequence):
300
  try:
301
  if not keras_model or not kmer_to_index:
302
- return "❌ Keras model not available."
303
  if len(sequence) < 6:
304
  return "❌ Sequence too short (<6 bp)."
305
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
@@ -310,46 +378,54 @@ def predict_with_keras(sequence):
310
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
311
  return f"✅ {percentage}% F gene confidence"
312
  except Exception as e:
313
- logger.error(f"Keras prediction failed: {e}", exc_info=True)
314
  return f"❌ Error: {str(e)}"
315
 
316
- def read_fasta_file(file_obj):
317
  try:
318
- if file_obj is None:
319
  return ""
320
- if isinstance(file_obj, str):
321
- with open(file_obj, "r") as f:
322
- content = f.read()
323
- else:
324
- content = file_obj.read().decode("utf-8")
325
  lines = content.strip().split("\n")
326
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
327
  return ''.join(seq_lines)
328
  except Exception as e:
329
- logger.error(f"Failed to read FASTA file: {e}", exc_info=True)
330
  return ""
331
 
 
 
 
 
 
 
 
 
 
 
332
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
333
  try:
334
  dna_input = dna_input.upper().strip()
335
  if not dna_input:
336
- return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input", None, None
337
  if not re.match('^[ACTGN]+$', dna_input):
338
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
339
  processed_sequence = dna_input
340
  boundary_output = ""
341
  if boundary_model:
342
  try:
343
- result = boundary_model.predict_sequence(dna_input)
344
- regions = result['gene_regions']
345
  if regions:
346
  processed_sequence = regions[0]["sequence"]
347
- boundary_output = f"✅ F gene region found: {len(processed_sequence)} bp"
 
348
  else:
349
  boundary_output = "⚠️ No F gene regions found."
350
  processed_sequence = dna_input
351
  except Exception as e:
352
- boundary_output = f"❌ Boundary prediction error: {str(e)}"
353
  processed_sequence = dna_input
354
  else:
355
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
@@ -358,48 +434,29 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
358
  phy_file = None
359
  ml_tree_output = ""
360
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
361
- try:
362
- mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
363
- if mafft_available and iqtree_available:
364
- ml_success, ml_message, ml_aligned, ml_tree = phylogenetic_placement(processed_sequence, mafft_cmd, iqtree_cmd)
365
- ml_tree_output = ml_message
366
- aligned_file = ml_aligned
367
- phy_file = ml_tree
368
- else:
369
- ml_tree_output = "❌ MAFFT or IQ-TREE not available"
370
- except Exception as e:
371
- ml_tree_output = f"❌ ML tree error: {str(e)}"
372
  elif build_ml_tree:
373
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
374
  else:
375
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
 
376
  tree_html_content = "No tree generated."
377
- report_html_content = "No report generated."
378
- tree_html_path = None
379
- report_html_path = None
380
  simplified_ml_output = ""
381
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
382
- try:
383
- tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
384
- simplified_ml_output = tree_result
385
- if tree_html_path and os.path.exists(tree_html_path):
386
- with open(tree_html_path, 'r', encoding='utf-8') as f:
387
- tree_html_content = f.read()
388
- else:
389
- tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
390
- if report_html_path and os.path.exists(report_html_path):
391
- with open(report_html_path, 'r', encoding='utf-8') as f:
392
- report_html_content = f.read()
393
- else:
394
- report_html_content = f"<div style='color: red;'>{tree_result}</div>"
395
- except Exception as e:
396
- simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
397
- tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
398
- report_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
399
  else:
400
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
401
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
402
- report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
403
  summary_output = f"""
404
  📊 ANALYSIS SUMMARY:
405
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -407,420 +464,171 @@ Input: {len(dna_input)} bp
407
  F Gene: {len(processed_sequence)} bp
408
  Validation: {keras_output.split(':')[-1].strip() if ':' in keras_output else keras_output}
409
  Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skipped' in ml_tree_output else '❌ Failed'}
410
- Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
411
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
412
  """
413
  return (
414
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
415
- aligned_file, phy_file, None, None, tree_html_content, report_html_content,
416
- tree_html_path, report_html_path
417
  )
418
  except Exception as e:
419
- logger.error(f"Pipeline error: {e}", exc_info=True)
420
  error_msg = f"❌ Pipeline Error: {str(e)}"
421
- return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg, None, None
422
-
423
- async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
424
- temp_file_path = None
425
- try:
426
- if fasta_file_obj is None:
427
- return "❌ No file provided", "", "", "", "", None, None, None, None, "No input", "No input", None, None
428
- with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
429
- if isinstance(fasta_file_obj, UploadFile):
430
- content = await fasta_file_obj.read()
431
- temp_file.write(content)
432
- else:
433
- with open(fasta_file_obj, 'rb') as f:
434
- content = f.read()
435
- temp_file.write(content)
436
- temp_file_path = temp_file.name
437
- dna_input = read_fasta_file(temp_file_path)
438
- if not dna_input:
439
- cleanup_file(temp_file_path)
440
- return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input", None, None
441
- result = run_pipeline(dna_input, similarity_score, build_ml_tree)
442
- cleanup_file(temp_file_path)
443
- return result
444
- except Exception as main_error:
445
- logger.error(f"Pipeline from file error: {main_error}", exc_info=True)
446
- cleanup_file(temp_file_path)
447
- error_msg = f"❌ Error: {str(main_error)}"
448
- return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg, None, None
449
 
450
- class AnalysisRequest(BaseModel):
451
- sequence: str
452
- similarity_score: float = 95.0
453
- build_ml_tree: bool = False
454
 
455
- class AnalysisResponse(BaseModel):
456
- boundary_output: str
457
- keras_output: str
458
- ml_tree_output: str
459
- tree_analysis_output: str
460
- summary_output: str
461
- success: bool
462
- error_message: Optional[str] = None
463
- tree_html_path: Optional[str] = None
464
- report_html_path: Optional[str] = None
465
-
466
- # --- FastAPI App Setup ---
467
- app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
468
-
469
- @app.get("/")
470
- async def root():
471
- return {
472
- "message": "🧬 Gene Analysis Pipeline API",
473
- "status": "running",
474
- "endpoints": {
475
- "docs": "/docs",
476
- "health": "/health",
477
- "gradio": "/gradio",
478
- "analyze": "/analyze",
479
- "analyze_file": "/analyze-file",
480
- "download": "/download/{file_type}/{query_id}"
481
- }
482
- }
483
-
484
- @app.get("/health")
485
- async def health_check():
486
  try:
487
  mafft_available, iqtree_available, _, _ = check_tool_availability()
488
- return {
489
- "status": "healthy",
 
 
 
 
 
 
 
 
 
 
 
 
490
  "components": {
491
  "boundary_model": boundary_model is not None,
492
  "keras_model": keras_model is not None,
 
493
  "tree_analyzer": analyzer is not None,
494
  "mafft_available": mafft_available,
495
- "iqtree_available": iqtree_available
 
496
  },
497
  "paths": {
498
  "base_dir": BASE_DIR,
499
- "query_output_dir": QUERY_OUTPUT_DIR
 
 
500
  }
501
- }
502
  except Exception as e:
503
- logger.error(f"Health check error: {e}", exc_info=True)
504
- return {"status": "unhealthy", "error": str(e)}
505
 
506
- @app.post("/analyze", response_model=AnalysisResponse)
507
- async def analyze_sequence(request: AnalysisRequest):
508
  try:
509
- result = run_pipeline(request.sequence, request.similarity_score, request.build_ml_tree)
510
- return AnalysisResponse(
511
- boundary_output=result[0] or "",
512
- keras_output=result[1] or "",
513
- ml_tree_output=result[2] or "",
514
- tree_analysis_output=result[3] or "",
515
- summary_output=result[4] or "",
516
- tree_html_path=result[11],
517
- report_html_path=result[12],
518
- success=True
519
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  except Exception as e:
521
- logger.error(f"Analyze error: {e}", exc_info=True)
522
- return AnalysisResponse(
523
- boundary_output="", keras_output="", ml_tree_output="",
524
- tree_analysis_output="", summary_output="",
525
- tree_html_path=None, report_html_path=None,
526
- success=False, error_message=str(e)
527
- )
528
 
529
- @app.post("/analyze-file", response_model=AnalysisResponse)
530
- async def analyze_file(
531
- file: UploadFile = File(...),
532
- similarity_score: float = Form(95.0),
533
- build_ml_tree: bool = Form(False)
534
- ):
535
- temp_file_path = None
536
  try:
 
 
 
 
 
 
 
 
 
 
 
537
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
538
- content = await file.read()
539
- temp_file.write(content)
540
  temp_file_path = temp_file.name
541
- result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
542
- cleanup_file(temp_file_path)
543
- return AnalysisResponse(
544
- boundary_output=result[0] or "",
545
- keras_output=result[1] or "",
546
- ml_tree_output=result[2] or "",
547
- tree_analysis_output=result[3] or "",
548
- summary_output=result[4] or "",
549
- tree_html_path=result[11],
550
- report_html_path=result[12],
551
- success=True
552
- )
553
- except Exception as main_error:
554
- logger.error(f"Analyze-file error: {main_error}", exc_info=True)
555
  cleanup_file(temp_file_path)
556
- return AnalysisResponse(
557
- boundary_output="", keras_output="", ml_tree_output="",
558
- tree_analysis_output="", summary_output="",
559
- tree_html_path=None, report_html_path=None,
560
- success=False, error_message=str(main_error)
561
- )
 
 
 
 
 
 
 
 
 
 
562
 
563
- @app.get("/download/{file_type}/{query_id}")
564
- async def download_file(file_type: str, query_id: str):
565
  try:
566
- if file_type not in ["tree", "report"]:
567
- raise HTTPException(status_code=400, detail="Invalid file type. Use 'tree' or 'report'.")
568
- file_name = f"phylogenetic_tree_{query_id}.html" if file_type == "tree" else f"detailed_report_{query_id}.html"
569
- file_path = os.path.join("/tmp", file_name)
 
 
 
 
 
 
 
 
570
  if not os.path.exists(file_path):
571
- raise HTTPException(status_code=404, detail="File not found.")
572
- return FileResponse(file_path, filename=file_name, media_type="text/html")
573
  except Exception as e:
574
- logger.error(f"Download error: {e}", exc_info=True)
575
- raise HTTPException(status_code=500, detail=f"Error serving file: {str(e)}")
576
-
577
- # --- Gradio Interface ---
578
- def create_gradio_interface():
579
- try:
580
- with gr.Blocks(
581
- title="🧬 Gene Analysis Pipeline",
582
- theme=gr.themes.Soft(),
583
- css="""
584
- .gradio-container { max-width: 1200px !important; }
585
- .status-box { padding: 10px; border-radius: 5px; margin: 5px 0; }
586
- .success { background-color: #d4edda; border: 1px solid #c3e6cb; color: #155724; }
587
- .warning { background-color: #fff3cd; border: 1px solid #ffeaa7; color: #856404; }
588
- .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
589
- """
590
- ) as iface:
591
- gr.Markdown("# 🧬 Gene Analysis Pipeline")
592
- with gr.Row():
593
- with gr.Column():
594
- status_display = gr.HTML(value=f"""
595
- <div class="status-box">
596
- <h3>🔧 System Status</h3>
597
- <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
598
- <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
599
- <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}</p>
600
- <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
601
- <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
602
- </div>
603
- """)
604
- with gr.Tabs():
605
- with gr.TabItem("📝 Text Input"):
606
- with gr.Row():
607
- with gr.Column(scale=2):
608
- gr.Markdown("Paste your DNA sequence here")
609
- dna_input = gr.Textbox(
610
- label="🧬 DNA Sequence",
611
- placeholder="Enter DNA sequence (ATCG format)...",
612
- lines=5
613
- )
614
- with gr.Column(scale=1):
615
- gr.Markdown("Minimum similarity for tree analysis")
616
- similarity_score = gr.Slider(
617
- minimum=1,
618
- maximum=99,
619
- value=95.0,
620
- step=1.0,
621
- label="🎯 Similarity Threshold (%)"
622
- )
623
- gr.Markdown("Generate phylogenetic placement (slower)")
624
- build_ml_tree = gr.Checkbox(
625
- label="🌲 Build ML Tree",
626
- value=False
627
- )
628
- analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
629
- with gr.TabItem("📁 File Upload"):
630
- with gr.Row():
631
- with gr.Column(scale=2):
632
- gr.Markdown("Upload a FASTA file containing your sequence")
633
- file_input = gr.File(
634
- label="📄 Upload FASTA File",
635
- file_types=[".fasta", ".fa", ".fas", ".txt"]
636
- )
637
- with gr.Column(scale=1):
638
- gr.Markdown("Minimum similarity for tree analysis")
639
- file_similarity_score = gr.Slider(
640
- minimum=1,
641
- maximum=99,
642
- value=95.0,
643
- step=1.0,
644
- label="🎯 Similarity Threshold (%)"
645
- )
646
- gr.Markdown("Generate phylogenetic placement (slower)")
647
- file_build_ml_tree = gr.Checkbox(
648
- label="🌲 Build ML Tree",
649
- value=False
650
- )
651
- analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
652
- gr.Markdown("## 📊 Analysis Results")
653
- with gr.Row():
654
- with gr.Column():
655
- boundary_output = gr.Textbox(
656
- label="🎯 Boundary Detection",
657
- interactive=False,
658
- lines=2
659
- )
660
- keras_output = gr.Textbox(
661
- label="🧠 F Gene Validation",
662
- interactive=False,
663
- lines=2
664
- )
665
- with gr.Column():
666
- ml_tree_output = gr.Textbox(
667
- label="🌲 Phylogenetic Placement",
668
- interactive=False,
669
- lines=2
670
- )
671
- tree_analysis_output = gr.Textbox(
672
- label="🌳 Tree Analysis",
673
- interactive=False,
674
- lines=2
675
- )
676
- summary_output = gr.Textbox(
677
- label="📋 Summary",
678
- interactive=False,
679
- lines=8
680
- )
681
- with gr.Row():
682
- aligned_file = gr.File(label="📄 Alignment File", visible=False)
683
- tree_file = gr.File(label="🌲 Tree File", visible=False)
684
- tree_html_file = gr.File(label="🌳 Simplified Tree HTML", visible=False)
685
- report_html_file = gr.File(label="📊 Detailed Report HTML", visible=False)
686
- with gr.Tabs():
687
- with gr.TabItem("🌳 Interactive Tree"):
688
- tree_html = gr.HTML(
689
- value="<div style='text-align: center; color: #666; padding: 20px;'>No tree generated yet. Run analysis to see results.</div>"
690
- )
691
- with gr.TabItem("📊 Detailed Report"):
692
- report_html = gr.HTML(
693
- label="Analysis Report",
694
- value="<div style='text-align: center; color: #666; padding: 20px;'>No report generated yet. Run analysis to see results.</div>"
695
- )
696
-
697
- # Event handlers
698
- def handle_analysis_output(*outputs):
699
- boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output, aligned_file, phy_file, _, _, tree_html_content, report_html_content, tree_html_path, report_html_path = outputs
700
- return (
701
- boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
702
- gr.File.update(value=aligned_file, visible=aligned_file is not None),
703
- gr.File.update(value=phy_file, visible=phy_file is not None),
704
- gr.File.update(value=tree_html_path, visible=tree_html_path is not None),
705
- gr.File.update(value=report_html_path, visible=report_html_path is not None),
706
- tree_html_content,
707
- report_html_content
708
- )
709
-
710
- analyze_btn.click(
711
- fn=run_pipeline,
712
- inputs=[dna_input, similarity_score, build_ml_tree],
713
- outputs=[
714
- boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
715
- aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
716
- ]
717
- )
718
-
719
- analyze_file_btn.click(
720
- fn=run_pipeline_from_file,
721
- inputs=[file_input, file_similarity_score, file_build_ml_tree],
722
- outputs=[
723
- boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
724
- aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
725
- ]
726
- )
727
-
728
- # Examples
729
- gr.Examples(
730
- examples=[
731
- ["ATCG" * 250, 85.0, False],
732
- ["CGATCG" * 150, 90.0, True]
733
- ],
734
- inputs=[dna_input, similarity_score, build_ml_tree],
735
- label="Example Sequences"
736
- )
737
-
738
- gr.Markdown("""
739
- ## 📚 Instructions
740
- 1. **Input**: Enter a DNA sequence (ATCG format) or upload a FASTA file
741
- 2. **Parameters**:
742
- - Set similarity threshold for phylogenetic analysis (1-99%)
743
- - Choose whether to build ML tree (slower but more accurate)
744
- 3. **Analysis**: Click analyze to run the complete pipeline
745
- 4. **Results**: View results in different tabs - summary, tree visualization, and detailed report
746
- 5. **Downloads**: Download alignment, tree, simplified tree HTML, and detailed report HTML files
747
- ### 🔬 Pipeline Components:
748
- - **Boundary Detection**: Identifies F gene regions
749
- - **F Gene Validation**: Validates F gene using ML
750
- - **Phylogenetic Placement**: Places sequence in reference tree (optional)
751
- - **Tree Analysis**: Builds phylogenetic tree with similar sequences
752
- """)
753
-
754
- return iface
755
- except Exception as main_error:
756
- logger.error(f"Gradio interface creation failed: {main_error}", exc_info=True)
757
- return gr.Interface(
758
- fn=lambda x: f"Error: {str(main_error)}",
759
- inputs=gr.Textbox(label="DNA Sequence"),
760
- outputs=gr.Textbox(label="Error"),
761
- title="🧬 Gene Analysis Pipeline (Error Mode)"
762
- )
763
-
764
- # --- Application Startup ---
765
def run_application():
    """Start the service: FastAPI with Gradio mounted, or Gradio alone as a fallback.

    The primary path builds the Gradio interface, mounts it on the FastAPI
    ``app`` under ``/gradio`` and serves everything with uvicorn on port 7860.
    If anything in that path raises, a fresh Gradio interface is launched on
    its own. If the fallback fails too, the process exits with status 1.
    """
    try:
        logger.info("🧬 Initializing Gene Analysis Pipeline...")
        main_gradio_app = create_gradio_interface()
        if main_gradio_app is None:
            raise RuntimeError("Gradio interface creation returned None")
        logger.info("✅ Gradio interface created successfully")
        # mount_gradio_app attaches the interface to ``app``; its return value
        # is not needed here, so it is no longer rebound to a Gradio-named local.
        gr.mount_gradio_app(app, main_gradio_app, path="/gradio")
        logger.info("✅ Gradio mounted to FastAPI at /gradio")
        logger.info("=" * 50)
        logger.info("🔍 Checking system components...")
        logger.info(f"🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}")
        logger.info(f"🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}")
        logger.info(f"🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}")
        mafft_available, iqtree_available, _, _ = check_tool_availability()
        logger.info(f"🧬 MAFFT: {'✅ Available' if mafft_available else '❌ Missing'}")
        logger.info(f"🌲 IQ-TREE: {'✅ Available' if iqtree_available else '❌ Missing'}")
        logger.info("=" * 50)
        logger.info("🚀 Starting Gene Analysis Pipeline...")
        logger.warning("⚠️ Running without request queuing. Concurrent requests may block.")
        logger.info("📊 FastAPI docs available at: http://localhost:7860/docs")
        logger.info("🧬 Gradio interface available at: http://localhost:7860/gradio")
        uvicorn.run(
            app,
            host="0.0.0.0",
            port=7860,
            log_level="info",
            access_log=True,
            timeout_keep_alive=120,
        )
    except Exception as main_error:
        logger.error(f"Application startup failed: {main_error}", exc_info=True)
        try:
            logger.info("🔄 Falling back to Gradio-only mode...")
            fallback_gradio_app = create_gradio_interface()
            if fallback_gradio_app is None:
                raise RuntimeError("Fallback Gradio interface creation returned None")
            logger.info("✅ Fallback Gradio interface created successfully")
            logger.info("🧬 Gradio interface available at: http://localhost:7860")
            # Block the main thread here. With prevent_thread_lock=True launch()
            # returns immediately, this function returns, and the script ends,
            # taking the fallback server down with it.
            fallback_gradio_app.launch(
                server_name="0.0.0.0",
                server_port=7860,
                prevent_thread_lock=False,
                quiet=True,
            )
        except Exception as fallback_error:
            logger.error(f"Fallback failed: {fallback_error}", exc_info=True)
            print("❌ Application failed to start. Check logs at /tmp/app.log for details.")
            sys.exit(1)
814
 
815
if __name__ == "__main__":
    # Console banner listing which components are present, then hand over
    # to run_application() to start the server.
    divider = "=" * 50
    print("🧬 Gene Analysis Pipeline Starting...")
    print(divider)
    print("🔍 Checking system components...")
    mafft_available, iqtree_available, _, _ = check_tool_availability()
    component_rows = (
        ("🤖 Boundary Model", boundary_model),
        ("🧠 Keras Model", keras_model),
        ("🌳 Tree Analyzer", analyzer),
        ("🧬 MAFFT", mafft_available),
        ("🌲 IQ-TREE", iqtree_available),
    )
    for title, present in component_rows:
        print(f"{title}: {'✅' if present else '❌'}")
    print(divider)
    run_application()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import logging
 
 
 
 
 
 
 
3
  import pickle
4
  import subprocess
5
  import pandas as pd
6
  import re
 
7
  import numpy as np
8
+ import torch
9
+ from flask import Flask, request, jsonify, send_file
10
+ from werkzeug.utils import secure_filename
11
+ from predictor import GenePredictor
12
  from tensorflow.keras.models import load_model
13
  from analyzer import PhylogeneticTreeAnalyzer
14
  import tempfile
 
22
  from Bio.SeqRecord import SeqRecord
23
  import stat
24
  import time
 
 
 
 
 
 
25
 
26
  # --- Logging Setup ---
27
+ os.makedirs('/tmp', exist_ok=True)
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(levelname)s - %(message)s',
31
+ handlers=[
32
+ logging.StreamHandler(),
33
+ logging.FileHandler('/tmp/flask_app.log')
34
+ ]
35
+ )
 
 
36
  logger = logging.getLogger(__name__)
 
37
 
38
# Disable GPU to avoid CUDA errors and quiet TensorFlow's native logging.
# NOTE(review): these are assigned after the torch / tensorflow imports at the
# top of the file; TF_CPP_MIN_LOG_LEVEL is normally read when TensorFlow is
# loaded - confirm whether this block should move above those imports.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "",
    "TF_CPP_MIN_LOG_LEVEL": "3",
    "TF_FORCE_GPU_ALLOW_GROWTH": "true",
})
 
42
 
43
  # --- Global Variables ---
44
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
48
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
49
  QUERY_OUTPUT_DIR = os.path.join(BASE_DIR, "queries")
50
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
 
 
51
  MODEL_REPO = "GGproject10/best_boundary_aware_model"
52
  CSV_PATH = "f cleaned.csv"
53
+ HF_TOKEN = os.getenv("HF_TOKEN")
54
 
55
+ # Initialize models
56
  boundary_model = None
57
  keras_model = None
58
  kmer_to_index = None
59
  analyzer = None
60
 
61
+ # --- Load Models ---
62
  def load_models_safely():
63
  global boundary_model, keras_model, kmer_to_index, analyzer
64
  logger.info("🔍 Loading models...")
65
+
66
+ # Boundary model
67
  try:
68
+ boundary_path = os.path.join(BASE_DIR, "models", "best_boundary_aware_model.pth")
69
+ if not os.path.exists(boundary_path):
70
+ logger.info(f"Downloading boundary model from {MODEL_REPO}...")
71
+ boundary_path = hf_hub_download(
72
+ repo_id=MODEL_REPO,
73
+ filename="best_boundary_aware_model.pth",
74
+ token=HF_TOKEN,
75
+ local_dir=os.path.join(BASE_DIR, "models")
76
+ )
77
+ boundary_model = GenePredictor(boundary_path)
78
+ logger.info("✅ Boundary model loaded")
79
  except Exception as e:
80
  logger.error(f"❌ Failed to load boundary model: {e}")
81
  boundary_model = None
82
+
83
+ # Keras model
84
  try:
85
+ keras_path = os.path.join(BASE_DIR, "models", "best_model.keras")
86
+ kmer_path = os.path.join(BASE_DIR, "models", "kmer_to_index.pkl")
87
+ if not os.path.exists(keras_path):
88
+ logger.info(f"Downloading Keras model from {MODEL_REPO}...")
89
+ keras_path = hf_hub_download(
90
+ repo_id=MODEL_REPO,
91
+ filename="best_model.keras",
92
+ token=HF_TOKEN,
93
+ local_dir=os.path.join(BASE_DIR, "models")
94
+ )
95
+ if not os.path.exists(kmer_path):
96
+ logger.info(f"Downloading k-mer index from {MODEL_REPO}...")
97
+ kmer_path = hf_hub_download(
98
+ repo_id=MODEL_REPO,
99
+ filename="kmer_to_index.pkl",
100
+ token=HF_TOKEN,
101
+ local_dir=os.path.join(BASE_DIR, "models")
102
+ )
103
+ keras_model = load_model(keras_path)
104
+ with open(kmer_path, "rb") as f:
105
+ kmer_to_index = pickle.load(f)
106
+ logger.info("✅ Keras model and k-mer index loaded")
107
  except Exception as e:
108
  logger.error(f"❌ Failed to load Keras model: {e}")
109
  keras_model = None
110
  kmer_to_index = None
111
+
112
+ # Tree analyzer
113
  try:
 
114
  analyzer = PhylogeneticTreeAnalyzer()
115
  csv_candidates = [
116
  CSV_PATH,
 
123
  csv_loaded = False
124
  for csv_candidate in csv_candidates:
125
  if os.path.exists(csv_candidate):
126
+ if analyzer.load_data(csv_candidate):
127
+ logger.info(f"✅ CSV loaded: {csv_candidate}")
128
+ csv_loaded = True
129
+ break
 
 
 
 
 
130
  if not csv_loaded:
131
+ logger.error("❌ Failed to load CSV")
132
  analyzer = None
133
  else:
134
+ if analyzer.train_ai_model():
135
+ logger.info("✅ AI model trained")
 
 
 
 
 
136
  except Exception as e:
137
+ logger.error(f"❌ Tree analyzer failed: {e}")
138
  analyzer = None
139
 
140
# Populate the module-level model globals once, at import time, so the request
# handlers below can read them directly.
# The visible sections of load_models_safely() catch their own errors and leave
# the affected global as None, so this handler is a last-resort guard for
# anything unexpected; in that case the process exits with status 1.
try:
    load_models_safely()
except Exception as e:
    logger.critical(f"Model loading failed: {e}")
    sys.exit(1)
145
 
146
  # --- Tool Detection ---
147
def setup_binary_permissions():
    """Best-effort: add the owner-execute bit to the bundled MAFFT and IQ-TREE binaries.

    A missing binary is logged as a warning. A failing ``chmod`` (for example
    on a read-only file system) is also only logged, so callers such as
    ``check_tool_availability`` can still fall back to system-wide installs
    instead of failing outright.
    """
    for binary in [MAFFT_PATH, IQTREE_PATH]:
        if not os.path.exists(binary):
            logger.warning(f"⚠️ Binary not found: {binary}")
            continue
        try:
            os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
            logger.info(f"✅ Set permission: {binary}")
        except OSError as e:
            logger.warning(f"⚠️ Failed to set permission on {binary}: {e}")
 
154
 
155
def _probe_tool(candidates, marker, label):
    """Return the first candidate that runs and identifies as the tool, else None.

    A candidate is tried only if it exists as a path or resolves on PATH. It is
    accepted when ``<candidate> --help`` exits with 0 or mentions ``marker``
    (case-insensitive) on stderr.
    """
    for candidate in candidates:
        if not (os.path.exists(candidate) or shutil.which(candidate)):
            continue
        try:
            result = subprocess.run(
                [candidate, "--help"],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode == 0 or marker in result.stderr.lower():
                logger.info(f"✅ {label}: {candidate}")
                return candidate
        except Exception as e:
            # Deliberately broad: a broken candidate must not stop the search.
            logger.debug(f"{label} test failed: {candidate}: {e}")
    return None


def check_tool_availability():
    """Locate working MAFFT and IQ-TREE executables.

    Returns:
        ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)`` where
        the ``*_cmd`` values are the accepted command/path, or ``None`` when
        no candidate worked.
    """
    setup_binary_permissions()
    mafft_candidates = [
        MAFFT_PATH,
        os.path.join(BASE_DIR, "binaries", "mafft", "mafft"),
        os.path.join(BASE_DIR, "binaries", "mafft", "mafft.bat"),
        'mafft',
        '/usr/bin/mafft',
        '/usr/local/bin/mafft',
        os.path.join(BASE_DIR, "binaries", "mafft", "mafftdir", "bin", "mafft"),
        os.path.expanduser("~/anaconda3/bin/mafft"),
        os.path.expanduser("~/miniconda3/bin/mafft"),
        "/opt/conda/bin/mafft",
        "/usr/local/miniconda3/bin/mafft"
    ]
    iqtree_candidates = [
        IQTREE_PATH,
        'iqtree',
        'iqtree2',
        'iqtree3',
        '/usr/bin/iqtree',
        '/usr/local/bin/iqtree',
        'iqtree.exe',
        'iqtree2.exe',
        'iqtree3.exe',
        os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree2"),
        os.path.expanduser("~/anaconda3/bin/iqtree2"),
        os.path.expanduser("~/miniconda3/bin/iqtree2"),
        "/opt/conda/bin/iqtree2",
        "/usr/local/miniconda3/bin/iqtree2"
    ]
    mafft_cmd = _probe_tool(mafft_candidates, "mafft", "MAFFT")
    iqtree_cmd = _probe_tool(iqtree_candidates, "iqtree", "IQ-TREE")
    return mafft_cmd is not None, iqtree_cmd is not None, mafft_cmd, iqtree_cmd
223
 
224
def install_dependencies_guide():
    """Return a static, human-readable guide for installing MAFFT and IQ-TREE.

    The text lists one install command or download link per platform
    (Ubuntu/Debian, CentOS/RHEL, macOS, Windows, Conda) for each tool.
    """
    return """
🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
For MAFFT:
- Ubuntu/Debian: sudo apt-get install mafft
- CentOS/RHEL: sudo yum install mafft
- macOS: brew install mafft
- Windows: Download from https://mafft.cbrc.jp/alignment/software/
- Conda: conda install -c bioconda mafft
For IQ-TREE:
- Ubuntu/Debian: sudo apt-get install iqtree
- CentOS/RHEL: sudo yum install iqtree
- macOS: brew install iqtree
- Windows: Download from http://www.iqtree.org/
- Conda: conda install -c bioconda iqtree
"""
240
+
241
  # --- Pipeline Functions ---
242
def cleanup_file(file_path: str):
    """Delete ``file_path`` if it is set and exists; never raise.

    A falsy path or a path that does not exist is ignored. A failed deletion
    is logged as a warning rather than propagated.
    """
    if not file_path or not os.path.exists(file_path):
        return
    try:
        os.unlink(file_path)
        logger.debug(f"Cleaned up {file_path}")
    except Exception as err:
        logger.warning(f"Failed to clean up {file_path}: {err}")
249
 
250
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
251
  query_fasta = None
 
257
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
258
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
259
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
260
+ return False, f"Reference files missing: {ALIGNMENT_PATH}, {TREE_PATH}", None, None
 
261
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
262
  SeqIO.write([query_record], query_fasta, "fasta")
263
  with open(aligned_with_query, "w") as output_file:
 
286
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
287
  cleanup_file(query_fasta)
288
  return True, success_msg, aligned_with_query, treefile
289
+ except Exception as e:
290
+ logger.error(f"Phylogenetic placement failed: {e}")
291
  cleanup_file(query_fasta)
292
+ return False, f"Error: {str(e)}", None, None
293
+
294
def build_maximum_likelihood_tree(f_gene_sequence):
    """Check prerequisites, then place the sequence on the reference tree.

    Returns:
        ``(success, message, aligned_file, tree_file)``. ``message`` begins
        with a dependency/reference-file report. On success the alignment and
        tree are copied to fixed names inside ``QUERY_OUTPUT_DIR`` and those
        copies are returned.
    """
    try:
        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()

        report = ["🔍 Checking dependencies...\n"]
        report.append(
            f"✅ MAFFT found: {mafft_cmd}\n" if mafft_available else "❌ MAFFT not found\n"
        )
        report.append(
            f"✅ IQ-TREE found: {iqtree_cmd}\n" if iqtree_available else "❌ IQ-TREE not found\n"
        )
        report.append(
            "✅ Reference alignment found\n"
            if os.path.exists(ALIGNMENT_PATH)
            else f"❌ Reference alignment not found: {ALIGNMENT_PATH}\n"
        )
        report.append(
            "✅ Reference tree found\n"
            if os.path.exists(TREE_PATH)
            else f"❌ Reference tree not found: {TREE_PATH}\n"
        )
        status_msg = "".join(report)

        # Missing tools: append the install guide and stop.
        if not (mafft_available and iqtree_available):
            return False, f"{status_msg}\n{install_dependencies_guide()}", None, None

        # Missing reference data: nothing to place the query against.
        if not (os.path.exists(ALIGNMENT_PATH) and os.path.exists(TREE_PATH)):
            return False, status_msg + "\n❌ Missing reference files.\n", None, None

        placed, placement_message, aligned_file, tree_file = phylogenetic_placement(
            f_gene_sequence, mafft_cmd, iqtree_cmd
        )
        combined_message = f"{status_msg}\n{placement_message}"
        if not placed:
            return False, combined_message, aligned_file, tree_file

        # Publish the results under stable names in the query output directory.
        if aligned_file and os.path.exists(aligned_file):
            published_alignment = os.path.join(
                QUERY_OUTPUT_DIR, "query_with_references_aligned.fasta"
            )
            shutil.copy2(aligned_file, published_alignment)
            aligned_file = published_alignment
        if tree_file and os.path.exists(tree_file):
            published_tree = os.path.join(QUERY_OUTPUT_DIR, "query_placement_tree.treefile")
            shutil.copy2(tree_file, published_tree)
            tree_file = published_tree
        return True, combined_message, aligned_file, tree_file
    except Exception as exc:
        logger.error(f"ML tree construction failed: {exc}")
        return False, f"Error: {str(exc)}", None, None
339
 
340
def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
    """Build an interactive similarity tree for ``sequence`` and save it as HTML.

    Returns:
        ``(message, html_path)``. ``html_path`` is ``None`` whenever validation
        fails, no similar sequences are found, or an exception occurs; in
        those cases ``message`` starts with ❌ and explains why.
    """
    try:
        if not analyzer:
            return "❌ Tree analyzer not initialized.", None
        if not sequence or len(sequence.strip()) < 10:
            return "❌ Invalid sequence.", None
        if not (1 <= matching_percentage <= 99):
            return "❌ Matching percentage must be 1-99.", None
        if not analyzer.find_query_sequence(sequence):
            return "❌ Sequence not accepted.", None

        hits, achieved_percentage = analyzer.find_similar_sequences(matching_percentage)
        if not hits:
            return f"❌ No similar sequences at {matching_percentage}% threshold.", None

        analyzer.build_tree_structure_with_ml_safe(hits)
        figure = analyzer.create_interactive_tree(hits, achieved_percentage)

        # Fall back to a timestamp-based id when the analyzer did not set one.
        tree_id = analyzer.query_id or f"query_{int(time.time())}"
        target_dir = os.path.join(BASE_DIR, "output")
        os.makedirs(target_dir, exist_ok=True)
        html_path = os.path.join(target_dir, f"tree_{tree_id}.html")
        figure.write_html(html_path)

        return (
            f"✅ Found {len(hits)} sequences at {achieved_percentage:.2f}% similarity.",
            html_path,
        )
    except Exception as exc:
        logger.error(f"Tree analysis failed: {exc}")
        return f"❌ Error: {str(exc)}", None
366
 
367
  def predict_with_keras(sequence):
368
  try:
369
  if not keras_model or not kmer_to_index:
370
+ return f"❌ Keras model not available."
371
  if len(sequence) < 6:
372
  return "❌ Sequence too short (<6 bp)."
373
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
 
378
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
379
  return f"✅ {percentage}% F gene confidence"
380
  except Exception as e:
381
+ logger.error(f"Keras prediction failed: {e}")
382
  return f"❌ Error: {str(e)}"
383
 
384
def read_fasta_file(file_path):
    """Return the sequence text of a FASTA file as one string.

    Header lines (those starting with ``>`` once surrounding whitespace is
    removed) and blank lines are dropped. The remaining lines are stripped and
    concatenated, so a multi-record file yields all records joined together.

    Args:
        file_path: Path of the file to read. A falsy value yields ``""``.

    Returns:
        The concatenated sequence, or ``""`` if the path is empty or the file
        cannot be read (the error is logged, never raised).
    """
    if not file_path:
        return ""
    try:
        # utf-8-sig swallows a leading BOM that would otherwise hide the first
        # '>' header; errors="replace" keeps a stray byte from voiding the file.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            content = f.read()
    except Exception as e:
        logger.error(f"Failed to read FASTA file: {e}")
        return ""
    # Strip before testing for '>' so indented headers are recognised too.
    stripped_lines = (line.strip() for line in content.splitlines())
    return "".join(line for line in stripped_lines if line and not line.startswith(">"))
396
 
397
def run_pipeline_from_file(fasta_file_path, similarity_score, build_ml_tree):
    """Read a FASTA file and run the analysis pipeline on its sequence.

    Returns the same 9-tuple as ``run_pipeline``. If the file yields no
    sequence, or anything raises, the tuple carries an error message in its
    first and last positions and empty/None placeholders elsewhere.
    """
    try:
        sequence_text = read_fasta_file(fasta_file_path)
        if sequence_text:
            return run_pipeline(sequence_text, similarity_score, build_ml_tree)
        return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, "No input"
    except Exception as exc:
        logger.error(f"Pipeline from file error: {exc}")
        failure = f"❌ Error: {str(exc)}"
        return failure, "", "", "", "", None, None, None, failure
406
+
407
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
408
  try:
409
  dna_input = dna_input.upper().strip()
410
  if not dna_input:
411
+ return "❌ Empty input", "", "", "", "", None, None, None, "No input"
412
  if not re.match('^[ACTGN]+$', dna_input):
413
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
414
  processed_sequence = dna_input
415
  boundary_output = ""
416
  if boundary_model:
417
  try:
418
+ predictions, probs, confidence = boundary_model.predict(dna_input)
419
+ regions = boundary_model.extract_gene_regions(predictions, dna_input)
420
  if regions:
421
  processed_sequence = regions[0]["sequence"]
422
+ boundary_output = processed_sequence
423
+ logger.info(f"F gene extracted: {len(processed_sequence)} bp")
424
  else:
425
  boundary_output = "⚠️ No F gene regions found."
426
  processed_sequence = dna_input
427
  except Exception as e:
428
+ boundary_output = f"❌ Boundary error: {str(e)}"
429
  processed_sequence = dna_input
430
  else:
431
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
 
434
  phy_file = None
435
  ml_tree_output = ""
436
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
437
+ ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
438
+ ml_tree_output = ml_message
439
+ aligned_file = ml_aligned
440
+ phy_file = ml_tree
 
 
 
 
 
 
 
441
  elif build_ml_tree:
442
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
443
  else:
444
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
445
+ html_file = None
446
  tree_html_content = "No tree generated."
 
 
 
447
  simplified_ml_output = ""
448
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
449
+ tree_result, html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
450
+ simplified_ml_output = tree_result
451
+ html_file = html_path
452
+ if html_path and os.path.exists(html_path):
453
+ with open(html_path, 'r', encoding='utf-8') as f:
454
+ tree_html_content = f.read()
455
+ else:
456
+ tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
 
 
 
 
 
 
 
 
457
  else:
458
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
459
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
 
460
  summary_output = f"""
461
  📊 ANALYSIS SUMMARY:
462
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
464
  F Gene: {len(processed_sequence)} bp
465
  Validation: {keras_output.split(':')[-1].strip() if ':' in keras_output else keras_output}
466
  Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skipped' in ml_tree_output else '❌ Failed'}
467
+ Tree Analysis: {'✅ OK' if '' in simplified_ml_output else '❌ Failed'}
468
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
469
  """
470
  return (
471
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
472
+ aligned_file, phy_file, html_file, tree_html_content
 
473
  )
474
  except Exception as e:
475
+ logger.error(f"Pipeline error: {e}")
476
  error_msg = f"❌ Pipeline Error: {str(e)}"
477
+ return error_msg, "", "", "", "", None, None, None, error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
 
479
+ # --- Flask App ---
480
+ app = Flask(__name__)
 
 
481
 
482
@app.route("/health", methods=["GET"])
def health_check():
    """Report which models, tools and reference files are available.

    Responds 200 with ``status`` set to ``"healthy"`` only when every model,
    both external tools and all three reference files are present, otherwise
    ``"unhealthy"``. Responds 500 if the check itself raises.
    """
    try:
        mafft_available, iqtree_available, _, _ = check_tool_availability()

        csv_locations = [
            CSV_PATH,
            os.path.join(BASE_DIR, CSV_PATH),
            os.path.join(BASE_DIR, "app", CSV_PATH),
            os.path.join(os.path.dirname(__file__), CSV_PATH),
            "f_cleaned.csv",
            os.path.join(BASE_DIR, "f_cleaned.csv"),
        ]
        files_exist = {
            "alignment": os.path.exists(ALIGNMENT_PATH),
            "tree": os.path.exists(TREE_PATH),
            "csv": any(os.path.exists(location) for location in csv_locations),
        }

        requirements = (
            boundary_model,
            keras_model,
            analyzer,
            mafft_available,
            iqtree_available,
            files_exist["alignment"],
            files_exist["tree"],
            files_exist["csv"],
        )
        overall = "healthy" if all(requirements) else "unhealthy"

        payload = {
            "status": overall,
            "components": {
                "boundary_model": boundary_model is not None,
                "keras_model": keras_model is not None,
                "kmer_index": kmer_to_index is not None,
                "tree_analyzer": analyzer is not None,
                "mafft_available": mafft_available,
                "iqtree_available": iqtree_available,
                "files": files_exist,
            },
            "paths": {
                "base_dir": BASE_DIR,
                "query_output_dir": QUERY_OUTPUT_DIR,
                "alignment_path": ALIGNMENT_PATH,
                "tree_path": TREE_PATH,
            },
        }
        return jsonify(payload), 200
    except Exception as exc:
        logger.error(f"Health check failed: {exc}")
        return jsonify({"status": "unhealthy", "error": str(exc)}), 500
519
 
520
@app.route("/analyze", methods=["POST"])
def analyze_sequence():
    """Run the analysis pipeline on a sequence supplied as JSON.

    Expected body: ``{"sequence": str, "similarity_score": number (30-99,
    default 95), "build_ml_tree": bool (default false)}``.

    Responds 400 for a missing/malformed body or invalid field, 500 if the
    pipeline call raises, and 200 with the pipeline outputs otherwise.
    """
    try:
        # silent=True: a malformed or non-JSON body yields None instead of an
        # HTTP exception that the blanket handler below would turn into a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or "sequence" not in data:
            return jsonify({"error": "Missing 'sequence' in JSON body"}), 400

        raw_sequence = data["sequence"]
        if not isinstance(raw_sequence, str):
            return jsonify({"error": "'sequence' must be a string"}), 400
        sequence = raw_sequence.upper().strip()

        try:
            similarity_score = float(data.get("similarity_score", 95.0))
        except (TypeError, ValueError):
            return jsonify({"error": "Similarity score must be a number"}), 400

        build_ml_tree = data.get("build_ml_tree", False)
        if isinstance(build_ml_tree, str):
            # Same rule as /analyze-file, so the string "false" is not truthy.
            build_ml_tree = build_ml_tree.strip().lower() == "true"
        else:
            build_ml_tree = bool(build_ml_tree)

        if not sequence:
            return jsonify({"error": "Empty sequence"}), 400
        if not re.match('^[ACTGN]+$', sequence):
            return jsonify({"error": "Invalid sequence (use A, T, C, G, N)"}), 400
        if not 30.0 <= similarity_score <= 99.0:
            return jsonify({"error": "Similarity score must be between 30 and 99"}), 400

        result = run_pipeline(sequence, similarity_score, build_ml_tree)
        return jsonify({
            "status": "success",
            "boundary_output": result[0],
            "keras_output": result[1],
            "ml_tree_output": result[2],
            "tree_analysis_output": result[3],
            "summary_output": result[4],
            # Only base names are exposed; clients fetch files via /download.
            "aligned_file": os.path.basename(result[5]) if result[5] else None,
            "tree_file": os.path.basename(result[6]) if result[6] else None,
            "html_tree_file": os.path.basename(result[7]) if result[7] else None,
            "tree_html_content": result[8]
        }), 200
    except Exception as e:
        logger.error(f"Analyze error: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
 
 
 
 
 
551
 
552
@app.route("/analyze-file", methods=["POST"])
def analyze_file():
    """Run the analysis pipeline on an uploaded FASTA file.

    Expects multipart form data with a ``file`` part (.fasta/.fa/.fas/.txt)
    and optional ``similarity_score`` (30-99, default 95) and
    ``build_ml_tree`` ("true"/"false") fields.

    Responds 400 for a missing/invalid upload or field, 500 if processing
    raises, and 200 with the pipeline outputs otherwise. The temporary copy
    of the upload is removed on every path.
    """
    temp_file_path = None
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file provided"}), 400
        file = request.files['file']
        if not file.filename:
            return jsonify({"error": "Empty filename"}), 400
        # Case-insensitive so "SAMPLE.FASTA" is accepted as well.
        if not file.filename.lower().endswith(('.fasta', '.fa', '.fas', '.txt')):
            return jsonify({"error": "Invalid file type (use .fasta, .fa, .fas, .txt)"}), 400

        try:
            similarity_score = float(request.form.get("similarity_score", 95.0))
        except (TypeError, ValueError):
            return jsonify({"error": "Similarity score must be a number"}), 400
        build_ml_tree = request.form.get("build_ml_tree", "false").lower() == "true"
        if not 30.0 <= similarity_score <= 99.0:
            return jsonify({"error": "Similarity score must be between 30 and 99"}), 400

        with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
            temp_file_path = temp_file.name
            # Write through the open handle rather than re-opening the file by
            # name while this handle is still holding it.
            file.save(temp_file)

        result = run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
        return jsonify({
            "status": "success",
            "boundary_output": result[0],
            "keras_output": result[1],
            "ml_tree_output": result[2],
            "tree_analysis_output": result[3],
            "summary_output": result[4],
            "aligned_file": os.path.basename(result[5]) if result[5] else None,
            "tree_file": os.path.basename(result[6]) if result[6] else None,
            "html_tree_file": os.path.basename(result[7]) if result[7] else None,
            "tree_html_content": result[8]
        }), 200
    except Exception as e:
        logger.error(f"Analyze-file error: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
    finally:
        # cleanup_file ignores None, so this is safe when no temp file was made.
        cleanup_file(temp_file_path)
587
 
588
@app.route("/download/<file_type>/<filename>", methods=["GET"])
def download_file(file_type, filename):
    """Send a previously generated result file as an attachment.

    ``file_type`` selects the directory and the accepted name pattern:
    ``html`` -> ``<BASE_DIR>/output/tree_*.html``; ``alignment`` ->
    ``QUERY_OUTPUT_DIR/*.fasta|*.fa``; ``tree`` -> ``QUERY_OUTPUT_DIR/*.treefile``.

    Responds 400 for an unknown type or a disallowed name, 404 if the file
    does not exist, 500 on unexpected errors.
    """
    try:
        if file_type not in ["alignment", "tree", "html"]:
            return jsonify({"error": "Invalid file type (use alignment, tree, html)"}), 400

        # The name comes from the URL: accept a bare file name only, so it
        # cannot carry directory components that escape the served folder.
        if not filename or filename != os.path.basename(filename) or filename in (".", ".."):
            return jsonify({"error": "Invalid filename"}), 400

        if file_type == "html":
            if not filename.startswith("tree_") or not filename.endswith(".html"):
                return jsonify({"error": "Invalid HTML filename"}), 400
            file_path = os.path.join(BASE_DIR, "output", filename)
        else:
            if file_type == "alignment" and not filename.endswith((".fasta", ".fa")):
                return jsonify({"error": "Invalid alignment filename"}), 400
            if file_type == "tree" and not filename.endswith(".treefile"):
                return jsonify({"error": "Invalid tree filename"}), 400
            file_path = os.path.join(QUERY_OUTPUT_DIR, filename)

        # isfile (not exists): a directory with a matching name is a 404.
        if not os.path.isfile(file_path):
            return jsonify({"error": "File not found"}), 404
        return send_file(file_path, as_attachment=True, download_name=filename)
    except Exception as e:
        logger.error(f"Download error: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
 
610
if __name__ == "__main__":
    # Log a component report, refuse to start without the reference files,
    # then serve the Flask app on port 7860.
    logger.info("🧬 Starting Flask Gene Analysis API...")
    mafft_available, iqtree_available, _, _ = check_tool_availability()

    component_rows = (
        ("🤖 Boundary Model", boundary_model, "Loaded"),
        ("🧠 Keras Model", keras_model, "Loaded"),
        ("🌳 Tree Analyzer", analyzer, "Loaded"),
        ("🧬 MAFFT", mafft_available, "Available"),
        ("🌲 IQ-TREE", iqtree_available, "Available"),
    )
    for title, present, ok_word in component_rows:
        state = f"✅ {ok_word}" if present else "❌ Missing"
        logger.info(f"{title}: {state}")

    csv_locations = [
        CSV_PATH,
        os.path.join(BASE_DIR, CSV_PATH),
        os.path.join(BASE_DIR, "app", CSV_PATH),
        os.path.join(os.path.dirname(__file__), CSV_PATH),
        "f_cleaned.csv",
        os.path.join(BASE_DIR, "f_cleaned.csv"),
    ]
    files_exist = {
        "alignment": os.path.exists(ALIGNMENT_PATH),
        "tree": os.path.exists(TREE_PATH),
        "csv": any(os.path.exists(location) for location in csv_locations),
    }
    marks = {key: ('✅' if found else '❌') for key, found in files_exist.items()}
    logger.info(
        f"📂 Files: Alignment={marks['alignment']}, Tree={marks['tree']}, CSV={marks['csv']}"
    )

    if not all(files_exist.values()):
        logger.critical("Missing required reference files")
        sys.exit(1)
    app.run(host="0.0.0.0", port=7860, debug=False)