re-type committed on
Commit
c31e216
·
verified ·
1 Parent(s): 9a6bf89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +260 -226
app.py CHANGED
@@ -17,6 +17,7 @@ from tensorflow.keras.models import load_model
17
  from analyzer import PhylogeneticTreeAnalyzer
18
  import tempfile
19
  import shutil
 
20
  import uuid
21
  from pathlib import Path
22
  from huggingface_hub import hf_hub_download
@@ -27,7 +28,7 @@ import stat
27
  import time
28
  import asyncio
29
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
30
- from fastapi.responses import FileResponse
31
  from pydantic import BaseModel
32
  from typing import Optional
33
  import uvicorn
@@ -39,14 +40,15 @@ log_handler.setFormatter(log_formatter)
39
  try:
40
  file_handler = logging.FileHandler('/tmp/app.log')
41
  file_handler.setFormatter(log_formatter)
42
- logging.basicConfig(level=logging.DEBUG, handlers=[log_handler, file_handler]) # Changed to DEBUG
43
  except Exception as e:
44
- logging.basicConfig(level=logging.DEBUG, handlers=[log_handler])
45
  logging.warning(f"Failed to set up file logging: {e}")
 
46
  logger = logging.getLogger(__name__)
47
  logger.info(f"Gradio version: {gr.__version__}")
48
 
49
- # Set event loop policy
50
  try:
51
  asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
52
  except Exception as e:
@@ -61,49 +63,66 @@ TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
61
  QUERY_OUTPUT_DIR = os.path.join(BASE_DIR, "queries")
62
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
63
 
 
64
  MODEL_REPO = "GGproject10/best_boundary_aware_model"
65
  CSV_PATH = "f cleaned.csv"
66
 
67
- # Initialize models
68
  boundary_model = None
69
  keras_model = None
70
  kmer_to_index = None
71
  analyzer = None
72
 
73
- # --- Model Loading (from Script 2) ---
74
  def load_models_safely():
75
  global boundary_model, keras_model, kmer_to_index, analyzer
76
  logger.info("🔍 Loading models...")
77
  try:
78
- boundary_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_boundary_aware_model.pth", token=None)
 
 
 
 
79
  if os.path.exists(boundary_path):
80
  boundary_model = EnhancedGenePredictor(boundary_path)
81
- logger.info("✅ Boundary model loaded.")
82
  else:
83
- logger.error(f"❌ Boundary model file not found.")
84
  except Exception as e:
85
- logger.error(f"❌ Failed to load boundary model: {e}", exc_info=True)
86
  boundary_model = None
87
  try:
88
- keras_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.keras", token=None)
89
- kmer_path = hf_hub_download(repo_id=MODEL_REPO, filename="kmer_to_index.pkl", token=None)
 
 
 
 
 
 
 
 
90
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
91
  keras_model = load_model(keras_path)
92
  with open(kmer_path, "rb") as f:
93
  kmer_to_index = pickle.load(f)
94
- logger.info("✅ Keras model loaded.")
95
  else:
96
- logger.error(f"❌ Keras model files not found.")
97
  except Exception as e:
98
- logger.error(f"❌ Failed to load Keras model: {e}", exc_info=True)
99
  keras_model = None
100
  kmer_to_index = None
101
  try:
102
  logger.info("🌳 Initializing tree analyzer...")
103
  analyzer = PhylogeneticTreeAnalyzer()
104
  csv_candidates = [
105
- CSV_PATH, os.path.join(BASE_DIR, CSV_PATH), os.path.join(BASE_DIR, "app", CSV_PATH),
106
- os.path.join(os.path.dirname(__file__), CSV_PATH), "f_cleaned.csv", os.path.join(BASE_DIR, "f_cleaned.csv")
 
 
 
 
107
  ]
108
  csv_loaded = False
109
  for csv_candidate in csv_candidates:
@@ -115,25 +134,27 @@ def load_models_safely():
115
  csv_loaded = True
116
  break
117
  except Exception as e:
118
- logger.warning(f"CSV load failed for {csv_candidate}: {e}", exc_info=True)
 
119
  if not csv_loaded:
120
- logger.error("❌ Failed to load CSV data.")
121
  analyzer = None
122
  else:
123
  try:
124
  if analyzer.train_ai_model():
125
- logger.info("✅ AI model training completed.")
126
  else:
127
- logger.warning("⚠️ AI model training failed.")
128
  except Exception as e:
129
- logger.warning(f"⚠️ AI model training failed: {e}", exc_info=True)
130
  except Exception as e:
131
- logger.error(f"❌ Tree analyzer initialization failed: {e}", exc_info=True)
132
  analyzer = None
133
 
 
134
  load_models_safely()
135
 
136
- # --- Tool Detection (from Script 2) ---
137
  def setup_binary_permissions():
138
  for binary in [MAFFT_PATH, IQTREE_PATH]:
139
  if os.path.exists(binary):
@@ -141,7 +162,7 @@ def setup_binary_permissions():
141
  os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
142
  logger.info(f"Set executable permission on {binary}")
143
  except Exception as e:
144
- logger.warning(f"Failed to set permission on {binary}: {e}", exc_info=True)
145
 
146
  def check_tool_availability():
147
  setup_binary_permissions()
@@ -151,7 +172,12 @@ def check_tool_availability():
151
  for candidate in mafft_candidates:
152
  if shutil.which(candidate) or os.path.exists(candidate):
153
  try:
154
- result = subprocess.run([candidate, "--help"], capture_output=True, text=True, timeout=5)
 
 
 
 
 
155
  if result.returncode == 0 or "mafft" in result.stderr.lower():
156
  mafft_available = True
157
  mafft_cmd = candidate
@@ -165,7 +191,12 @@ def check_tool_availability():
165
  for candidate in iqtree_candidates:
166
  if shutil.which(candidate) or os.path.exists(candidate):
167
  try:
168
- result = subprocess.run([candidate, "--help"], capture_output=True, text=True, timeout=5)
 
 
 
 
 
169
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
170
  iqtree_available = True
171
  iqtree_cmd = candidate
@@ -177,73 +208,57 @@ def check_tool_availability():
177
 
178
  # --- Pipeline Functions ---
179
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
180
- query_fasta = None
181
  try:
182
  if len(sequence.strip()) < 100:
183
  return False, "Sequence too short (<100 bp).", None, None
184
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
185
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
186
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
187
- output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_")
188
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
189
- logger.error(f"Reference alignment or tree not found: {ALIGNMENT_PATH}, {TREE_PATH}")
190
  return False, "Reference alignment or tree not found.", None, None
191
- logger.debug(f"Writing query FASTA to: {query_fasta}")
192
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
193
- SeqIO.write(query_fasta, [query_fasta], "write([query_record])")
194
- logger.debug("Running MAFFT alignment...")
195
- with open(aligned_with_query, "subprocess") as subprocess:
196
  subprocess.run([
197
- open, mafft_cmd, "--add", "--reorder", "subprocess.PIPEALIGNMENT_PATH",
198
- query_f, aligned_with_query, query_fasta
199
- ], "subprocess.PIPE=stdout", text=True, timeout_ms=600000, check=True)
200
- if not os.path.exists("aligned_with_query") or not os.path.getsize(aligned_with_query):
201
- logger.error(f"MAFFT alignment failed: {aligned_with_query}")
202
  return False, "MAFFT alignment failed.", None, None
203
- logger.debug("Running IQ-TREE placement...")
204
  subprocess.run([
205
  iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
206
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
207
  ], capture_output=True, text=True, timeout=1200, check=True)
208
  treefile = f"{output_prefix}.treefile"
209
  if not os.path.exists(treefile):
210
- logger.error(f"IQ-TREE placement failed: {treefile} not found")
211
  return False, "IQ-TREE placement failed.", aligned_with_query, None
212
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
213
- logger.info(success_msg)
214
  return True, success_msg, aligned_with_query, treefile
215
  except Exception as e:
216
  logger.error(f"Phylogenetic placement failed: {e}", exc_info=True)
217
  return False, f"Error: {str(e)}", None, None
218
  finally:
219
- if query_fasta and os.path.exists(query_fasta):
220
  try:
221
  os.unlink(query_fasta)
222
- logger.debug(f"Cleaned up {query_fasta}")
223
- except Exception as e:
224
- logger.warning(f"Failed to clean up {query_fasta}: {e}", exc_info=True)
225
 
226
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
227
  try:
228
  logger.debug("Starting tree analysis...")
229
  if not analyzer:
230
- logger.error("Tree analyzer not initialized")
231
  return "❌ Tree analyzer not initialized.", None, None
232
- logger.debug("Validating sequence...")
233
  if not sequence or len(sequence.strip()) < 10:
234
- logger.error("Invalid sequence: too short or empty")
235
  return "❌ Invalid sequence.", None, None
236
  if not (1 <= matching_percentage <= 99):
237
- logger.error(f"Invalid matching percentage: {matching_percentage}")
238
  return "❌ Matching percentage must be 1-99.", None, None
239
  logger.debug("Finding query sequence...")
240
  if not analyzer.find_query_sequence(sequence):
241
- logger.error("Sequence not accepted by analyzer")
242
  return "❌ Sequence not accepted.", None, None
243
  logger.debug("Finding similar sequences...")
244
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
245
  if not matched_ids:
246
- logger.warning(f"No similar sequences found at {matching_percentage}% threshold")
247
  return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
248
  logger.debug("Building tree structure...")
249
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
@@ -257,7 +272,7 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
257
  logger.debug("Generating detailed report...")
258
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
259
  report_html_path = os.path.join("/tmp", f'detailed_report_{query_id}.html') if report_success else None
260
- logger.debug(f"Tree analysis completed: {len(matched_ids)} matches at {actual_percentage:.2f}%")
261
  return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
262
  except Exception as e:
263
  logger.error(f"Tree analysis failed: {e}", exc_info=True)
@@ -265,22 +280,16 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
265
 
266
  def predict_with_keras(sequence):
267
  try:
268
- logger.debug("Starting Keras prediction...")
269
  if not keras_model or not kmer_to_index:
270
- logger.error("Keras model or kmer index not available")
271
  return "❌ Keras model not available."
272
  if len(sequence) < 6:
273
- logger.error("Sequence too short for Keras prediction")
274
  return "❌ Sequence too short (<6 bp)."
275
- logger.debug("Generating kmers...")
276
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
277
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
278
  input_arr = np.array([indices])
279
- logger.debug("Running Keras prediction...")
280
  prediction = keras_model.predict(input_arr, verbose=0)[0]
281
  f_gene_prob = prediction[-1]
282
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
283
- logger.debug(f"Keras prediction completed: {percentage}% confidence")
284
  return f"✅ {percentage}% F gene confidence"
285
  except Exception as e:
286
  logger.error(f"Keras prediction failed: {e}", exc_info=True)
@@ -288,9 +297,7 @@ def predict_with_keras(sequence):
288
 
289
  def read_fasta_file(file_obj):
290
  try:
291
- logger.debug("Reading FASTA file...")
292
  if file_obj is None:
293
- logger.error("No file object provided")
294
  return ""
295
  if isinstance(file_obj, str):
296
  with open(file_obj, "r") as f:
@@ -299,26 +306,20 @@ def read_fasta_file(file_obj):
299
  content = file_obj.read().decode("utf-8")
300
  lines = content.strip().split("\n")
301
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
302
- sequence = ''.join(seq_lines)
303
- logger.debug(f"FASTA file read successfully: {len(sequence)} bp")
304
- return sequence
305
  except Exception as e:
306
  logger.error(f"Failed to read FASTA file: {e}", exc_info=True)
307
  return ""
308
 
309
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
310
  try:
311
- logger.debug("Starting pipeline...")
312
  dna_input = dna_input.upper().strip()
313
  if not dna_input:
314
- logger.error("Empty input sequence")
315
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input", None, None
316
  if not re.match('^[ACTGN]+$', dna_input):
317
- logger.debug("Cleaning invalid characters from input")
318
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
319
  processed_sequence = dna_input
320
  boundary_output = ""
321
- logger.debug("Running boundary detection...")
322
  if boundary_model:
323
  try:
324
  result = boundary_model.predict_sequence(dna_input)
@@ -326,25 +327,19 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
326
  if regions:
327
  processed_sequence = regions[0]["sequence"]
328
  boundary_output = f"✅ F gene region found: {len(processed_sequence)} bp"
329
- logger.debug(f"Boundary detection: F gene found, {len(processed_sequence)} bp")
330
  else:
331
  boundary_output = "⚠️ No F gene regions found."
332
  processed_sequence = dna_input
333
- logger.debug("Boundary detection: No F gene regions found")
334
  except Exception as e:
335
  boundary_output = f"❌ Boundary prediction error: {str(e)}"
336
  processed_sequence = dna_input
337
- logger.error(f"Boundary prediction error: {e}", exc_info=True)
338
  else:
339
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
340
- logger.warning("Boundary model not available")
341
- logger.debug("Running Keras validation...")
342
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
343
  aligned_file = None
344
  phy_file = None
345
  ml_tree_output = ""
346
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
347
- logger.debug("Running phylogenetic placement...")
348
  try:
349
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
350
  if mafft_available and iqtree_available:
@@ -352,53 +347,41 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
352
  ml_tree_output = ml_message
353
  aligned_file = ml_aligned
354
  phy_file = ml_tree
355
- logger.debug(f"Phylogenetic placement: {ml_message}")
356
  else:
357
  ml_tree_output = "❌ MAFFT or IQ-TREE not available"
358
- logger.error("MAFFT or IQ-TREE not available")
359
  except Exception as e:
360
  ml_tree_output = f"❌ ML tree error: {str(e)}"
361
- logger.error(f"ML tree error: {e}", exc_info=True)
362
  elif build_ml_tree:
363
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
364
- logger.error("Sequence too short for phylogenetic placement")
365
  else:
366
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
367
- logger.debug("Phylogenetic placement skipped")
368
  tree_html_content = "No tree generated."
369
  report_html_content = "No report generated."
370
  tree_html_path = None
371
  report_html_path = None
372
  simplified_ml_output = ""
373
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
374
- logger.debug("Running tree analysis...")
375
  try:
376
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
377
  simplified_ml_output = tree_result
378
  if tree_html_path and os.path.exists(tree_html_path):
379
  with open(tree_html_path, 'r', encoding='utf-8') as f:
380
  tree_html_content = f.read()
381
- logger.debug(f"Tree HTML generated: {tree_html_path}")
382
  else:
383
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
384
- logger.debug("No tree HTML generated")
385
  if report_html_path and os.path.exists(report_html_path):
386
  with open(report_html_path, 'r', encoding='utf-8') as f:
387
  report_html_content = f.read()
388
- logger.debug(f"Report HTML generated: {report_html_path}")
389
  else:
390
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
391
- logger.debug("No report HTML generated")
392
  except Exception as e:
393
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
394
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
395
  report_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
396
- logger.error(f"Tree analysis error: {e}", exc_info=True)
397
  else:
398
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
399
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
400
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
401
- logger.error(simplified_ml_output)
402
  summary_output = f"""
403
  📊 ANALYSIS SUMMARY:
404
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -409,7 +392,6 @@ Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skippe
409
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
410
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
411
  """
412
- logger.debug("Pipeline completed successfully")
413
  return (
414
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
415
  aligned_file, phy_file, None, None, tree_html_content, report_html_content,
@@ -423,9 +405,7 @@ Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
423
  async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
424
  temp_file_path = None
425
  try:
426
- logger.debug("Starting pipeline from file...")
427
  if fasta_file_obj is None:
428
- logger.error("No file provided")
429
  return "❌ No file provided", "", "", "", "", None, None, None, None, "No input", "No input", None, None
430
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
431
  if isinstance(fasta_file_obj, UploadFile):
@@ -436,12 +416,9 @@ async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree
436
  content = f.read()
437
  temp_file.write(content)
438
  temp_file_path = temp_file.name
439
- logger.debug(f"Reading FASTA file: {temp_file_path}")
440
  dna_input = read_fasta_file(temp_file_path)
441
  if not dna_input:
442
- logger.error("Failed to read FASTA file")
443
  return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input", None, None
444
- logger.debug("Running pipeline with FASTA input...")
445
  return run_pipeline(dna_input, similarity_score, build_ml_tree)
446
  except Exception as e:
447
  logger.error(f"Pipeline from file error: {e}", exc_info=True)
@@ -451,9 +428,8 @@ async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree
451
  if temp_file_path and os.path.exists(temp_file_path):
452
  try:
453
  os.unlink(temp_file_path)
454
- logger.debug(f"Cleaned up temp file: {temp_file_path}")
455
  except Exception as e:
456
- logger.warning(f"Failed to delete temp file {temp_file_path}: {e}", exc_info=True)
457
 
458
  # --- Pydantic Models ---
459
  class AnalysisRequest(BaseModel):
@@ -515,9 +491,7 @@ async def health_check():
515
  @app.post("/analyze", response_model=AnalysisResponse)
516
  async def analyze_sequence(request: AnalysisRequest):
517
  try:
518
- logger.debug("Starting sequence analysis via API...")
519
  result = run_pipeline(request.sequence, request.similarity_score, request.build_ml_tree)
520
- logger.debug("API analysis completed")
521
  return AnalysisResponse(
522
  boundary_output=result[0] or "",
523
  keras_output=result[1] or "",
@@ -545,13 +519,11 @@ async def analyze_file(
545
  ):
546
  temp_file_path = None
547
  try:
548
- logger.debug("Starting file analysis via API...")
549
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
550
  content = await file.read()
551
  temp_file.write(content)
552
  temp_file_path = temp_file.name
553
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
554
- logger.debug("API file analysis completed")
555
  return AnalysisResponse(
556
  boundary_output=result[0] or "",
557
  keras_output=result[1] or "",
@@ -574,23 +546,18 @@ async def analyze_file(
574
  if temp_file_path and os.path.exists(temp_file_path):
575
  try:
576
  os.unlink(temp_file_path)
577
- logger.debug(f"Cleaned up API temp file: {temp_file_path}")
578
  except Exception as e:
579
- logger.warning(f"Failed to clean up {temp_file_path}: {e}", exc_info=True)
580
 
581
  @app.get("/download/{file_type}/{query_id}")
582
  async def download_file(file_type: str, query_id: str):
583
  try:
584
- logger.debug(f"Downloading file: {file_type}/{query_id}")
585
  if file_type not in ["tree", "report"]:
586
- logger.error(f"Invalid file type: {file_type}")
587
  raise HTTPException(status_code=400, detail="Invalid file type. Use 'tree' or 'report'.")
588
  file_name = f"phylogenetic_tree_{query_id}.html" if file_type == "tree" else f"detailed_report_{query_id}.html"
589
  file_path = os.path.join("/tmp", file_name)
590
  if not os.path.exists(file_path):
591
- logger.error(f"File not found: {file_path}")
592
  raise HTTPException(status_code=404, detail="File not found.")
593
- logger.debug(f"Serving file: {file_path}")
594
  return FileResponse(file_path, filename=file_name, media_type="text/html")
595
  except Exception as e:
596
  logger.error(f"Download error: {e}", exc_info=True)
@@ -599,7 +566,6 @@ async def download_file(file_type: str, query_id: str):
599
  # --- Gradio Interface ---
600
  def create_gradio_interface():
601
  try:
602
- logger.debug("Creating Gradio interface...")
603
  with gr.Blocks(
604
  title="🧬 Gene Analysis Pipeline",
605
  theme=gr.themes.Soft(),
@@ -630,53 +596,63 @@ def create_gradio_interface():
630
  with gr.Column(scale=2):
631
  dna_input = gr.Textbox(
632
  label="🧬 DNA Sequence",
633
- placeholder="Enter DNA sequence (ATCG format)...",
634
  lines=5,
635
- description="Paste your DNA sequence here"
636
  )
637
  with gr.Column(scale=1):
638
- similarity_score = gr.Slider(
639
  minimum=1,
640
  maximum=99,
641
- value=95.0,
642
- step=1.0,
643
  label="🎯 Similarity Threshold (%)",
644
- description="Minimum similarity for tree analysis"
645
  )
646
  build_ml_tree = gr.Checkbox(
647
- label="🌲 Build ML Tree",
648
  value=False,
649
- description="Generate phylogenetic placement (slower)"
 
 
 
 
 
650
  )
651
- analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
652
  with gr.TabItem("📁 File Upload"):
653
  with gr.Row():
654
  with gr.Column(scale=2):
655
  file_input = gr.File(
656
  label="📄 Upload FASTA File",
657
  file_types=[".fasta", ".fa", ".fas", ".txt"],
658
- description="Upload a FASTA file containing your sequence"
659
  )
660
  with gr.Column(scale=1):
661
- file_similarity_score = gr.Slider(
662
  minimum=1,
663
  maximum=99,
664
- value=95.0,
665
- step=1.0,
666
- label="🎯 Similarity Threshold (%)",
667
- description="Minimum similarity for tree analysis"
668
  )
669
  file_build_ml_tree = gr.Checkbox(
670
- label="🌲 Build ML Tree",
671
- value=False,
672
- description="Generate phylogenetic placement (slower)"
 
 
 
 
673
  )
674
- analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
 
675
  gr.Markdown("## 📊 Analysis Results")
 
676
  with gr.Row():
677
  with gr.Column():
678
  boundary_output = gr.Textbox(
679
- label="🎯 Boundary Detection",
680
  interactive=False,
681
  lines=2
682
  )
@@ -687,145 +663,203 @@ def create_gradio_interface():
687
  )
688
  with gr.Column():
689
  ml_tree_output = gr.Textbox(
690
- label="🌲 Phylogenetic Placement",
691
  interactive=False,
692
  lines=2
693
  )
694
  tree_analysis_output = gr.Textbox(
695
- label="🌳 Tree Analysis",
696
  interactive=False,
697
  lines=2
698
  )
 
699
  summary_output = gr.Textbox(
700
  label="📋 Summary",
701
  interactive=False,
702
  lines=8
703
  )
704
- with gr.Row():
705
- aligned_file = gr.File(label="📄 Alignment File", visible=False)
706
- tree_file = gr.File(label="🌲 Tree File", visible=False)
707
- tree_html_file = gr.File(label="🌳 Simplified Tree HTML", visible=False)
708
- report_html_file = gr.File(label="📊 Detailed Report HTML", visible=False)
709
  with gr.Tabs():
710
- with gr.TabItem("🌳 Interactive Tree"):
711
  tree_html = gr.HTML(
712
- value="<div style='text-align: center; color: #666; padding: 20px;'>No tree generated yet. Run analysis to see results.</div>"
 
713
  )
 
714
  with gr.TabItem("📊 Detailed Report"):
715
  report_html = gr.HTML(
716
- value="<div style='text-align: center; color: #666; padding: 20px;'>No report generated yet. Run analysis to see results.</div>"
 
717
  )
718
 
719
- # Event handlers
720
- def handle_analysis_output(*outputs):
721
- boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output, aligned_file, phy_file, _, _, tree_html_content, report_html_content, tree_html_path, report_html_path = outputs
722
- logger.debug("Handling Gradio output...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
723
  return (
724
- boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
725
- gr.File.update(value=aligned_file, visible=aligned_file is not None),
726
- gr.File.update(value=phy_file, visible=phy_file is not None),
727
- gr.File.update(value=tree_html_path, visible=tree_html_path is not None),
728
- gr.File.update(value=report_html_path, visible=report_html_path is not None),
729
- tree_html_content,
730
- report_html_content
731
  )
732
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
733
  analyze_btn.click(
734
- fn=run_pipeline,
735
- inputs=[dna_input, similarity_score, build_ml_tree],
736
  outputs=[
737
- boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
738
- aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
739
- ],
740
- _js="""(outputs) => { return outputs; }"""
741
  )
742
 
743
- analyze_file_btn.click(
744
- fn=run_pipeline_from_file,
745
- inputs=[file_input, file_similarity_score, file_build_ml_tree],
746
  outputs=[
747
- boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
748
- aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
749
- ],
750
- _js="""(outputs) => { return outputs; }"""
751
  )
752
 
753
- gr.Examples(
754
- examples=[
755
- ["ATCG" * 250, 85.0, False],
756
- ["CGATCG" * 150, 90.0, True]
757
- ],
758
- inputs=[dna_input, similarity_score, build_ml_tree],
759
- label="Example Sequences"
760
- )
 
 
 
 
 
 
 
 
 
 
 
761
 
762
- gr.Markdown("""
763
- ## 📚 Instructions
764
- 1. **Input**: Enter a DNA sequence (ATCG format) or upload a FASTA file
765
- 2. **Parameters**:
766
- - Set similarity threshold for phylogenetic analysis (1-99%)
767
- - Choose whether to build ML tree (slower but more accurate)
768
- 3. **Analysis**: Click analyze to run the complete pipeline
769
- 4. **Results**: View results in different tabs - summary, tree visualization, and detailed report
770
- 5. **Downloads**: Download alignment, tree, simplified tree HTML, and detailed report HTML files
771
- ### 🔬 Pipeline Components:
772
- - **Boundary Detection**: Identifies F gene regions
773
- - **F Gene Validation**: Validates F gene using ML
774
- - **Phylogenetic Placement**: Places sequence in reference tree (optional)
775
- - **Tree Analysis**: Builds phylogenetic tree with similar sequences
776
- """)
777
-
778
- logger.debug("Gradio interface created successfully")
779
  return iface
 
780
  except Exception as e:
781
- logger.error(f"Gradio interface creation failed: {e}", exc_info=True)
 
782
  return gr.Interface(
783
- fn=lambda x: f"Error: {str(e)}",
784
- inputs=gr.Textbox(label="DNA Sequence"),
785
- outputs=gr.Textbox(label="Error"),
786
- title="🧬 Gene Analysis Pipeline (Error Mode)"
787
  )
788
 
789
- # --- Application Startup ---
790
- def run_application():
 
 
 
 
 
791
  try:
792
- logger.debug("Starting application...")
793
- gradio_app = create_gradio_interface()
794
- gradio_app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
795
- logger.info("🚀 Starting Gene Analysis Pipeline...")
796
- logger.info("📊 FastAPI docs available at: http://localhost:7860/docs")
797
- logger.info("🧬 Gradio interface available at: http://localhost:7860/gradio")
798
- uvicorn.run(
799
- app,
800
- host="0.0.0.0",
801
- port=7860,
802
- log_level="info"
803
- )
804
  except Exception as e:
805
- logger.error(f"Application startup failed: {e}", exc_info=True)
806
- try:
807
- logger.info("🔄 Falling back to Gradio-only mode...")
808
- gradio_app = create_gradio_interface()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  gradio_app.launch(
810
  server_name="0.0.0.0",
811
  server_port=7860,
812
- share=False,
813
- debug=False
 
814
  )
815
- except Exception as fallback_error:
816
- logger.error(f"Fallback failed: {fallback_error}", exc_info=True)
817
- print("❌ Application failed to start. Check logs for details.")
818
-
819
- # --- Main Entry Point ---
820
- if __name__ == "__main__":
821
- print("🧬 Gene Analysis Pipeline Starting...")
822
- print("=" * 50)
823
- print("🔍 Checking system components...")
824
- mafft_available, iqtree_available, _, _ = check_tool_availability()
825
- print(f"🤖 Boundary Model: {'✅' if boundary_model else '❌'}")
826
- print(f"🧠 Keras Model: {'✅' if keras_model else '❌'}")
827
- print(f"🌳 Tree Analyzer: {'✅' if analyzer else '❌'}")
828
- print(f"🧬 MAFFT: {'✅' if mafft_available else '❌'}")
829
- print(f"🌲 IQ-TREE: {'✅' if iqtree_available else '❌'}")
830
- print("=" * 50)
831
- run_application()
 
17
  from analyzer import PhylogeneticTreeAnalyzer
18
  import tempfile
19
  import shutil
20
+ import sys
21
  import uuid
22
  from pathlib import Path
23
  from huggingface_hub import hf_hub_download
 
28
  import time
29
  import asyncio
30
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
31
+ from fastapi.responses import HTMLResponse, FileResponse
32
  from pydantic import BaseModel
33
  from typing import Optional
34
  import uvicorn
 
40
  try:
41
  file_handler = logging.FileHandler('/tmp/app.log')
42
  file_handler.setFormatter(log_formatter)
43
+ logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
44
  except Exception as e:
45
+ logging.basicConfig(level=logging.INFO, handlers=[log_handler])
46
  logging.warning(f"Failed to set up file logging: {e}")
47
+
48
  logger = logging.getLogger(__name__)
49
  logger.info(f"Gradio version: {gr.__version__}")
50
 
51
+ # Set event loop policy for compatibility with Gradio Spaces
52
  try:
53
  asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
54
  except Exception as e:
 
63
  QUERY_OUTPUT_DIR = os.path.join(BASE_DIR, "queries")
64
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
65
 
66
+ # Model repository and file paths
67
  MODEL_REPO = "GGproject10/best_boundary_aware_model"
68
  CSV_PATH = "f cleaned.csv"
69
 
70
+ # Initialize models as None
71
  boundary_model = None
72
  keras_model = None
73
  kmer_to_index = None
74
  analyzer = None
75
 
76
+ # --- Model Loading ---
77
  def load_models_safely():
78
  global boundary_model, keras_model, kmer_to_index, analyzer
79
  logger.info("🔍 Loading models...")
80
  try:
81
+ boundary_path = hf_hub_download(
82
+ repo_id=MODEL_REPO,
83
+ filename="best_boundary_aware_model.pth",
84
+ token=None
85
+ )
86
  if os.path.exists(boundary_path):
87
  boundary_model = EnhancedGenePredictor(boundary_path)
88
+ logger.info("✅ Boundary model loaded successfully.")
89
  else:
90
+ logger.error(f"❌ Boundary model file not found after download.")
91
  except Exception as e:
92
+ logger.error(f"❌ Failed to load boundary model: {e}")
93
  boundary_model = None
94
  try:
95
+ keras_path = hf_hub_download(
96
+ repo_id=MODEL_REPO,
97
+ filename="best_model.keras",
98
+ token=None
99
+ )
100
+ kmer_path = hf_hub_download(
101
+ repo_id=MODEL_REPO,
102
+ filename="kmer_to_index.pkl",
103
+ token=None
104
+ )
105
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
106
  keras_model = load_model(keras_path)
107
  with open(kmer_path, "rb") as f:
108
  kmer_to_index = pickle.load(f)
109
+ logger.info("✅ Keras model and k-mer index loaded successfully.")
110
  else:
111
+ logger.error(f"❌ Keras model or k-mer files not found.")
112
  except Exception as e:
113
+ logger.error(f"❌ Failed to load Keras model: {e}")
114
  keras_model = None
115
  kmer_to_index = None
116
  try:
117
  logger.info("🌳 Initializing tree analyzer...")
118
  analyzer = PhylogeneticTreeAnalyzer()
119
  csv_candidates = [
120
+ CSV_PATH,
121
+ os.path.join(BASE_DIR, CSV_PATH),
122
+ os.path.join(BASE_DIR, "app", CSV_PATH),
123
+ os.path.join(os.path.dirname(__file__), CSV_PATH),
124
+ "f_cleaned.csv",
125
+ os.path.join(BASE_DIR, "f_cleaned.csv")
126
  ]
127
  csv_loaded = False
128
  for csv_candidate in csv_candidates:
 
134
  csv_loaded = True
135
  break
136
  except Exception as e:
137
+ logger.warning(f"CSV load failed for {csv_candidate}: {e}")
138
+ continue
139
  if not csv_loaded:
140
+ logger.error("❌ Failed to load CSV data from any candidate location.")
141
  analyzer = None
142
  else:
143
  try:
144
  if analyzer.train_ai_model():
145
+ logger.info("✅ AI model training completed successfully")
146
  else:
147
+ logger.warning("⚠️ AI model training failed; proceeding with basic analysis.")
148
  except Exception as e:
149
+ logger.warning(f"⚠️ AI model training failed: {e}")
150
  except Exception as e:
151
+ logger.error(f"❌ Tree analyzer initialization failed: {e}")
152
  analyzer = None
153
 
154
+ # Load models at startup
155
  load_models_safely()
156
 
157
+ # --- Tool Detection ---
158
  def setup_binary_permissions():
159
  for binary in [MAFFT_PATH, IQTREE_PATH]:
160
  if os.path.exists(binary):
 
162
  os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
163
  logger.info(f"Set executable permission on {binary}")
164
  except Exception as e:
165
+ logger.warning(f"Failed to set permission on {binary}: {e}")
166
 
167
  def check_tool_availability():
168
  setup_binary_permissions()
 
172
  for candidate in mafft_candidates:
173
  if shutil.which(candidate) or os.path.exists(candidate):
174
  try:
175
+ result = subprocess.run(
176
+ [candidate, "--help"],
177
+ capture_output=True,
178
+ text=True,
179
+ timeout=5
180
+ )
181
  if result.returncode == 0 or "mafft" in result.stderr.lower():
182
  mafft_available = True
183
  mafft_cmd = candidate
 
191
  for candidate in iqtree_candidates:
192
  if shutil.which(candidate) or os.path.exists(candidate):
193
  try:
194
+ result = subprocess.run(
195
+ [candidate, "--help"],
196
+ capture_output=True,
197
+ text=True,
198
+ timeout=5
199
+ )
200
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
201
  iqtree_available = True
202
  iqtree_cmd = candidate
 
208
 
209
  # --- Pipeline Functions ---
210
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
 
211
  try:
212
  if len(sequence.strip()) < 100:
213
  return False, "Sequence too short (<100 bp).", None, None
214
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
215
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
216
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
217
+ output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
218
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
 
219
  return False, "Reference alignment or tree not found.", None, None
 
220
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
221
+ SeqIO.write([query_record], query_fasta, "fasta")
222
+ with open(aligned_with_query, "w") as output_file:
 
223
  subprocess.run([
224
+ mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
225
+ ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
226
+ if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
 
 
227
  return False, "MAFFT alignment failed.", None, None
 
228
  subprocess.run([
229
  iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
230
  "-m", "GTR+G", "-pre", output_prefix, "-redo"
231
  ], capture_output=True, text=True, timeout=1200, check=True)
232
  treefile = f"{output_prefix}.treefile"
233
  if not os.path.exists(treefile):
 
234
  return False, "IQ-TREE placement failed.", aligned_with_query, None
235
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
 
236
  return True, success_msg, aligned_with_query, treefile
237
  except Exception as e:
238
  logger.error(f"Phylogenetic placement failed: {e}", exc_info=True)
239
  return False, f"Error: {str(e)}", None, None
240
  finally:
241
+ if 'query_fasta' in locals() and os.path.exists(query_fasta):
242
  try:
243
  os.unlink(query_fasta)
244
+ except Exception as cleanup_error:
245
+ logger.warning(f"Failed to clean up {query_fasta}: {cleanup_error}")
 
246
 
247
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
248
  try:
249
  logger.debug("Starting tree analysis...")
250
  if not analyzer:
 
251
  return "❌ Tree analyzer not initialized.", None, None
 
252
  if not sequence or len(sequence.strip()) < 10:
 
253
  return "❌ Invalid sequence.", None, None
254
  if not (1 <= matching_percentage <= 99):
 
255
  return "❌ Matching percentage must be 1-99.", None, None
256
  logger.debug("Finding query sequence...")
257
  if not analyzer.find_query_sequence(sequence):
 
258
  return "❌ Sequence not accepted.", None, None
259
  logger.debug("Finding similar sequences...")
260
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
261
  if not matched_ids:
 
262
  return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
263
  logger.debug("Building tree structure...")
264
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
 
272
  logger.debug("Generating detailed report...")
273
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
274
  report_html_path = os.path.join("/tmp", f'detailed_report_{query_id}.html') if report_success else None
275
+ logger.debug(f"Tree analysis completed: {len(matched_ids)} matches")
276
  return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
277
  except Exception as e:
278
  logger.error(f"Tree analysis failed: {e}", exc_info=True)
 
280
 
281
  def predict_with_keras(sequence):
282
  try:
 
283
  if not keras_model or not kmer_to_index:
 
284
  return "❌ Keras model not available."
285
  if len(sequence) < 6:
 
286
  return "❌ Sequence too short (<6 bp)."
 
287
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
288
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
289
  input_arr = np.array([indices])
 
290
  prediction = keras_model.predict(input_arr, verbose=0)[0]
291
  f_gene_prob = prediction[-1]
292
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
 
293
  return f"✅ {percentage}% F gene confidence"
294
  except Exception as e:
295
  logger.error(f"Keras prediction failed: {e}", exc_info=True)
 
297
 
298
  def read_fasta_file(file_obj):
299
  try:
 
300
  if file_obj is None:
 
301
  return ""
302
  if isinstance(file_obj, str):
303
  with open(file_obj, "r") as f:
 
306
  content = file_obj.read().decode("utf-8")
307
  lines = content.strip().split("\n")
308
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
309
+ return ''.join(seq_lines)
 
 
310
  except Exception as e:
311
  logger.error(f"Failed to read FASTA file: {e}", exc_info=True)
312
  return ""
313
 
314
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
315
  try:
 
316
  dna_input = dna_input.upper().strip()
317
  if not dna_input:
 
318
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input", None, None
319
  if not re.match('^[ACTGN]+$', dna_input):
 
320
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
321
  processed_sequence = dna_input
322
  boundary_output = ""
 
323
  if boundary_model:
324
  try:
325
  result = boundary_model.predict_sequence(dna_input)
 
327
  if regions:
328
  processed_sequence = regions[0]["sequence"]
329
  boundary_output = f"✅ F gene region found: {len(processed_sequence)} bp"
 
330
  else:
331
  boundary_output = "⚠️ No F gene regions found."
332
  processed_sequence = dna_input
 
333
  except Exception as e:
334
  boundary_output = f"❌ Boundary prediction error: {str(e)}"
335
  processed_sequence = dna_input
 
336
  else:
337
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
 
 
338
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
339
  aligned_file = None
340
  phy_file = None
341
  ml_tree_output = ""
342
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
 
343
  try:
344
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
345
  if mafft_available and iqtree_available:
 
347
  ml_tree_output = ml_message
348
  aligned_file = ml_aligned
349
  phy_file = ml_tree
 
350
  else:
351
  ml_tree_output = "❌ MAFFT or IQ-TREE not available"
 
352
  except Exception as e:
353
  ml_tree_output = f"❌ ML tree error: {str(e)}"
 
354
  elif build_ml_tree:
355
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
 
356
  else:
357
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
 
358
  tree_html_content = "No tree generated."
359
  report_html_content = "No report generated."
360
  tree_html_path = None
361
  report_html_path = None
362
  simplified_ml_output = ""
363
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
 
364
  try:
365
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
366
  simplified_ml_output = tree_result
367
  if tree_html_path and os.path.exists(tree_html_path):
368
  with open(tree_html_path, 'r', encoding='utf-8') as f:
369
  tree_html_content = f.read()
 
370
  else:
371
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
372
  if report_html_path and os.path.exists(report_html_path):
373
  with open(report_html_path, 'r', encoding='utf-8') as f:
374
  report_html_content = f.read()
 
375
  else:
376
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
377
  except Exception as e:
378
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
379
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
380
  report_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
 
381
  else:
382
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
383
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
384
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
 
385
  summary_output = f"""
386
  📊 ANALYSIS SUMMARY:
387
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
392
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
393
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
394
  """
 
395
  return (
396
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
397
  aligned_file, phy_file, None, None, tree_html_content, report_html_content,
 
405
  async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
406
  temp_file_path = None
407
  try:
 
408
  if fasta_file_obj is None:
 
409
  return "❌ No file provided", "", "", "", "", None, None, None, None, "No input", "No input", None, None
410
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
411
  if isinstance(fasta_file_obj, UploadFile):
 
416
  content = f.read()
417
  temp_file.write(content)
418
  temp_file_path = temp_file.name
 
419
  dna_input = read_fasta_file(temp_file_path)
420
  if not dna_input:
 
421
  return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input", None, None
 
422
  return run_pipeline(dna_input, similarity_score, build_ml_tree)
423
  except Exception as e:
424
  logger.error(f"Pipeline from file error: {e}", exc_info=True)
 
428
  if temp_file_path and os.path.exists(temp_file_path):
429
  try:
430
  os.unlink(temp_file_path)
 
431
  except Exception as e:
432
+ logger.warning(f"Failed to delete temp file {temp_file_path}: {e}")
433
 
434
  # --- Pydantic Models ---
435
  class AnalysisRequest(BaseModel):
 
491
  @app.post("/analyze", response_model=AnalysisResponse)
492
  async def analyze_sequence(request: AnalysisRequest):
493
  try:
 
494
  result = run_pipeline(request.sequence, request.similarity_score, request.build_ml_tree)
 
495
  return AnalysisResponse(
496
  boundary_output=result[0] or "",
497
  keras_output=result[1] or "",
 
519
  ):
520
  temp_file_path = None
521
  try:
 
522
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
523
  content = await file.read()
524
  temp_file.write(content)
525
  temp_file_path = temp_file.name
526
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
 
527
  return AnalysisResponse(
528
  boundary_output=result[0] or "",
529
  keras_output=result[1] or "",
 
546
  if temp_file_path and os.path.exists(temp_file_path):
547
  try:
548
  os.unlink(temp_file_path)
 
549
  except Exception as e:
550
+ logger.warning(f"Failed to clean up {temp_file_path}: {e}")
551
 
552
  @app.get("/download/{file_type}/{query_id}")
553
  async def download_file(file_type: str, query_id: str):
554
  try:
 
555
  if file_type not in ["tree", "report"]:
 
556
  raise HTTPException(status_code=400, detail="Invalid file type. Use 'tree' or 'report'.")
557
  file_name = f"phylogenetic_tree_{query_id}.html" if file_type == "tree" else f"detailed_report_{query_id}.html"
558
  file_path = os.path.join("/tmp", file_name)
559
  if not os.path.exists(file_path):
 
560
  raise HTTPException(status_code=404, detail="File not found.")
 
561
  return FileResponse(file_path, filename=file_name, media_type="text/html")
562
  except Exception as e:
563
  logger.error(f"Download error: {e}", exc_info=True)
 
566
  # --- Gradio Interface ---
567
  def create_gradio_interface():
568
  try:
 
569
  with gr.Blocks(
570
  title="🧬 Gene Analysis Pipeline",
571
  theme=gr.themes.Soft(),
 
596
  with gr.Column(scale=2):
597
  dna_input = gr.Textbox(
598
  label="🧬 DNA Sequence",
599
+ placeholder="Enter your DNA sequence (ATCG format)...",
600
  lines=5,
601
+ max_lines=10
602
  )
603
  with gr.Column(scale=1):
604
+ similarity_slider = gr.Slider(
605
  minimum=1,
606
  maximum=99,
607
+ value=95,
608
+ step=1,
609
  label="🎯 Similarity Threshold (%)",
610
+ info="Minimum similarity for phylogenetic analysis"
611
  )
612
  build_ml_tree = gr.Checkbox(
613
+ label="🌳 Build ML Tree",
614
  value=False,
615
+ info="Perform phylogenetic placement (slower)"
616
+ )
617
+ analyze_btn = gr.Button(
618
+ "🔬 Analyze Sequence",
619
+ variant="primary",
620
+ size="lg"
621
  )
622
+
623
  with gr.TabItem("📁 File Upload"):
624
  with gr.Row():
625
  with gr.Column(scale=2):
626
  file_input = gr.File(
627
  label="📄 Upload FASTA File",
628
  file_types=[".fasta", ".fa", ".fas", ".txt"],
629
+ type="filepath"
630
  )
631
  with gr.Column(scale=1):
632
+ file_similarity_slider = gr.Slider(
633
  minimum=1,
634
  maximum=99,
635
+ value=95,
636
+ step=1,
637
+ label="🎯 Similarity Threshold (%)"
 
638
  )
639
  file_build_ml_tree = gr.Checkbox(
640
+ label="🌳 Build ML Tree",
641
+ value=False
642
+ )
643
+ file_analyze_btn = gr.Button(
644
+ "🔬 Analyze File",
645
+ variant="primary",
646
+ size="lg"
647
  )
648
+
649
+ # Results Section
650
  gr.Markdown("## 📊 Analysis Results")
651
+
652
  with gr.Row():
653
  with gr.Column():
654
  boundary_output = gr.Textbox(
655
+ label="🎯 Boundary Prediction",
656
  interactive=False,
657
  lines=2
658
  )
 
663
  )
664
  with gr.Column():
665
  ml_tree_output = gr.Textbox(
666
+ label="🌳 Phylogenetic Placement",
667
  interactive=False,
668
  lines=2
669
  )
670
  tree_analysis_output = gr.Textbox(
671
+ label="📈 Tree Analysis",
672
  interactive=False,
673
  lines=2
674
  )
675
+
676
  summary_output = gr.Textbox(
677
  label="📋 Summary",
678
  interactive=False,
679
  lines=8
680
  )
681
+
682
+ # Interactive Visualizations
 
 
 
683
  with gr.Tabs():
684
+ with gr.TabItem("🌳 Phylogenetic Tree"):
685
  tree_html = gr.HTML(
686
+ label="Interactive Tree Visualization",
687
+ value="<div style='text-align: center; padding: 20px; color: #666;'>Run analysis to see phylogenetic tree</div>"
688
  )
689
+
690
  with gr.TabItem("📊 Detailed Report"):
691
  report_html = gr.HTML(
692
+ label="Comprehensive Analysis Report",
693
+ value="<div style='text-align: center; padding: 20px; color: #666;'>Run analysis to see detailed report</div>"
694
  )
695
 
696
+ # Download Section
697
+ with gr.Row():
698
+ with gr.Column():
699
+ aligned_file = gr.File(
700
+ label="📄 Download Alignment",
701
+ visible=False
702
+ )
703
+ tree_file = gr.File(
704
+ label="🌳 Download Tree File",
705
+ visible=False
706
+ )
707
+
708
+ # Event Handlers
709
def process_text_input(dna_seq, similarity, build_tree):
    """Gradio click handler for the text-input tab.

    Returns the nine values bound to the result widgets: five status
    strings, two download file paths, and the tree and report HTML.
    Blank input short-circuits with a red "no input" banner.
    """
    has_sequence = bool(dna_seq) and bool(dna_seq.strip())
    if has_sequence:
        outcome = run_pipeline(dna_seq, similarity, build_tree)
        # Positions 7 and 8 of the pipeline result are not shown in the UI.
        return tuple(outcome[slot] for slot in (0, 1, 2, 3, 4, 5, 6, 9, 10))

    banner = "<div style='color: red;'>No input provided</div>"
    return (
        "❌ Please enter a DNA sequence", "", "", "", "",
        None, None,
        banner,
        banner,
    )
723
 
724
def process_file_input(file_path, similarity, build_tree):
    """Gradio click handler for the file-upload tab.

    Reads the sequence from the uploaded FASTA path, runs the pipeline, and
    returns the nine values bound to the result widgets. A missing file, an
    unreadable sequence, or any exception yields a status message plus a red
    HTML banner in both visualisation slots.
    """

    def _failure(status, banner_text):
        # Same banner goes to both the tree and the report panes.
        banner = f"<div style='color: red;'>{banner_text}</div>"
        return (status, "", "", "", "", None, None, banner, banner)

    if not file_path:
        return _failure("❌ Please upload a file", "No file provided")

    try:
        sequence = read_fasta_file(file_path)
        if sequence:
            outcome = run_pipeline(sequence, similarity, build_tree)
            # Positions 7 and 8 of the pipeline result are not shown in the UI.
            return tuple(outcome[slot] for slot in (0, 1, 2, 3, 4, 5, 6, 9, 10))
        return _failure("❌ Failed to read sequence from file", "Invalid file format")
    except Exception as exc:
        problem = f"❌ Error processing file: {str(exc)}"
        return _failure(problem, problem)
757
+
758
+ # Wire up the event handlers
759
  analyze_btn.click(
760
+ fn=process_text_input,
761
+ inputs=[dna_input, similarity_slider, build_ml_tree],
762
  outputs=[
763
+ boundary_output, keras_output, ml_tree_output,
764
+ tree_analysis_output, summary_output,
765
+ aligned_file, tree_file, tree_html, report_html
766
+ ]
767
  )
768
 
769
+ file_analyze_btn.click(
770
+ fn=process_file_input,
771
+ inputs=[file_input, file_similarity_slider, file_build_ml_tree],
772
  outputs=[
773
+ boundary_output, keras_output, ml_tree_output,
774
+ tree_analysis_output, summary_output,
775
+ aligned_file, tree_file, tree_html, report_html
776
+ ]
777
  )
778
 
779
+ # Example sequences for quick testing
780
+ gr.Markdown("### 🧪 Example Sequences")
781
+ with gr.Row():
782
+ example1_btn = gr.Button("Example 1: Short F Gene", size="sm")
783
+ example2_btn = gr.Button("Example 2: Full Length", size="sm")
784
+ clear_btn = gr.Button("🗑️ Clear", size="sm")
785
+
786
+ def load_example1():
787
+ return "ATGGAGTTGCTAATCCTCAAACTTCTGCTTGAAGGGTCACAGTACACACCCTGTGCAAAGAGACAAGCAACAATAATGATCTGGATTCGTACGACGTGGCTGAGGGGAACCTGTATGTGAACAGTCTAGCCAGAGGTTACTATGCAACGGTCACTAGGGCCGGAATCCCTCCCAATGCACCAGGACGCTCTGATCACGTAAGACGAACTTACAGATCCAAAGTGGGAAACGGGGAACGGCTGGGTACCCTGAGACAGCCTGGACAAGACCTCAGGTGTCACATACGACGGGGACTATAATATGGACGCCTGCAGCGGTGGAACAAATAGCAACAGACCT"
788
+
789
+ def load_example2():
790
+ return "ATGGAGTTGCTAATCCTCAAACTTCTGCTTGAAGGGTCACAGTACACACCCTGTGCAAAGAGACAAGCAACAATAATGATCTGGATTCGTACGACGTGGCTGAGGGGAACCTGTATGTGAACAGTCTAGCCAGAGGTTACTATGCAACGGTCACTAGGGCCGGAATCCCTCCCAATGCACCAGGACGCTCTGATCACGTAAGACGAACTTACAGATCCAAAGTGGGAAACGGGGAACGGCTGGGTACCCTGAGACAGCCTGGACAAGACCTCAGGTGTCACATACGACGGGGACTATAATATGGACGCCTGCAGCGGTGGAACAAATAGCAACAGACCTCATGTGGGCAGTGGCCACAATCTACAATTTGGATACAGTGGAATTTGGAGAAGCGACCTTCAGAACCTGGGTCATGGTGCCGTCCTACGGTGGGGCCGCCGAAGCAACTCTCGACTACGTGGTGGAAAGCCTGGGCTTCGGAGGCGCAGTTATCGGAAAAAGCAAAGAACTCACAGGAAAGCTGTTCAAGAACGACACCTACTATGGAAAGATGGGTCACTATCTAAAAATTGATTCCTGTACCAGCCAACTTTAA"
791
+
792
+ def clear_inputs():
793
+ return ""
794
+
795
+ example1_btn.click(fn=load_example1, outputs=dna_input)
796
+ example2_btn.click(fn=load_example2, outputs=dna_input)
797
+ clear_btn.click(fn=clear_inputs, outputs=dna_input)
798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
799
  return iface
800
+
801
  except Exception as e:
802
+ logger.error(f"Failed to create Gradio interface: {e}", exc_info=True)
803
+ # Fallback simple interface
804
  return gr.Interface(
805
+ fn=lambda x: f"Interface creation failed: {str(e)}",
806
+ inputs=gr.Textbox(label="Input"),
807
+ outputs=gr.Textbox(label="Output"),
808
+ title="🧬 Gene Analysis Pipeline - Error"
809
  )
810
 
811
+ # Create the Gradio interface
812
+ gradio_app = create_gradio_interface()
813
+
814
# Mount the Gradio UI on the FastAPI app once, at import time.
# Blocks.launch() starts a server and returns an (app, local_url, share_url)
# tuple, so it can neither be called per request nor returned as HTML.
try:
    app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
    _gradio_mount_error = None
except Exception as e:
    logger.error(f"Failed to mount Gradio interface: {e}", exc_info=True)
    _gradio_mount_error = str(e)


@app.get("/gradio", response_class=HTMLResponse)
async def gradio_interface():
    """Serve the Gradio interface.

    Redirects to the mounted Gradio app at ``/gradio/``. If mounting failed
    at import time, returns a small HTML page with the error and links to
    the API endpoints instead.
    """
    if _gradio_mount_error is None:
        from fastapi.responses import RedirectResponse
        return RedirectResponse(url="/gradio/")

    import html
    # Escape the message: exception text must not be interpreted as markup.
    safe_error = html.escape(_gradio_mount_error)
    return HTMLResponse(f"""
    <html>
    <head><title>🧬 Gene Analysis Pipeline - Error</title></head>
    <body>
    <h1>🧬 Gene Analysis Pipeline</h1>
    <p style="color: red;">Failed to load Gradio interface: {safe_error}</p>
    <p>Please try using the API endpoints instead:</p>
    <ul>
    <li><a href="/docs">API Documentation</a></li>
    <li><a href="/health">Health Check</a></li>
    </ul>
    </body>
    </html>
    """)
837
+
838
+ # --- Main Application Runner ---
839
if __name__ == "__main__":
    try:
        logger.info("🚀 Starting Gene Analysis Pipeline...")

        # Check if running in Gradio Spaces
        if os.getenv("SPACE_ID"):
            logger.info("🌐 Running in Hugging Face Spaces - Gradio mode")
            # launch(enable_queue=...) is deprecated in Gradio 3.x and rejected
            # as an unknown keyword in 4.x; queue() is the supported way to
            # enable request queuing.
            gradio_app.queue()
            gradio_app.launch(
                server_name="0.0.0.0",
                server_port=7860,
                # Share links are not supported on Spaces (the Space URL is
                # already public), so do not request one.
                share=False,
                show_error=True,
            )
        else:
            logger.info("🔧 Running in local/server mode - FastAPI + Gradio")
            # Run the FastAPI app; PORT overrides the default port 7860.
            uvicorn.run(
                app,
                host="0.0.0.0",
                port=int(os.getenv("PORT", 7860)),
                log_level="info",
            )

    except Exception as e:
        logger.error(f" Failed to start application: {e}", exc_info=True)
        sys.exit(1)