re-type committed on
Commit
f7a0a05
·
verified ·
1 Parent(s): 8c277d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -39
app.py CHANGED
@@ -39,9 +39,9 @@ log_handler.setFormatter(log_formatter)
39
  try:
40
  file_handler = logging.FileHandler('/tmp/app.log')
41
  file_handler.setFormatter(log_formatter)
42
- logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
43
  except Exception as e:
44
- logging.basicConfig(level=logging.INFO, handlers=[log_handler])
45
  logging.warning(f"Failed to set up file logging: {e}")
46
  logger = logging.getLogger(__name__)
47
  logger.info(f"Gradio version: {gr.__version__}")
@@ -82,7 +82,7 @@ def load_models_safely():
82
  else:
83
  logger.error(f"❌ Boundary model file not found.")
84
  except Exception as e:
85
- logger.error(f"❌ Failed to load boundary model: {e}")
86
  boundary_model = None
87
  try:
88
  keras_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.keras", token=None)
@@ -95,7 +95,7 @@ def load_models_safely():
95
  else:
96
  logger.error(f"❌ Keras model files not found.")
97
  except Exception as e:
98
- logger.error(f"❌ Failed to load Keras model: {e}")
99
  keras_model = None
100
  kmer_to_index = None
101
  try:
@@ -115,7 +115,7 @@ def load_models_safely():
115
  csv_loaded = True
116
  break
117
  except Exception as e:
118
- logger.warning(f"CSV load failed for {csv_candidate}: {e}")
119
  if not csv_loaded:
120
  logger.error("❌ Failed to load CSV data.")
121
  analyzer = None
@@ -126,9 +126,9 @@ def load_models_safely():
126
  else:
127
  logger.warning("⚠️ AI model training failed.")
128
  except Exception as e:
129
- logger.warning(f"⚠️ AI model training failed: {e}")
130
  except Exception as e:
131
- logger.error(f"❌ Tree analyzer initialization failed: {e}")
132
  analyzer = None
133
 
134
  load_models_safely()
@@ -141,7 +141,7 @@ def setup_binary_permissions():
141
  os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
142
  logger.info(f"Set executable permission on {binary}")
143
  except Exception as e:
144
- logger.warning(f"Failed to set permission on {binary}: {e}")
145
 
146
  def check_tool_availability():
147
  setup_binary_permissions()
@@ -177,57 +177,73 @@ def check_tool_availability():
177
 
178
  # --- Pipeline Functions ---
def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
    """Place a query sequence onto the reference tree using MAFFT and IQ-TREE.

    The query is written to a temporary FASTA file, added to the reference
    alignment with ``mafft --add``, and then placed with IQ-TREE constrained
    by the reference tree (``-g TREE_PATH``).

    Args:
        sequence: Raw nucleotide sequence; must be at least 100 characters
            after stripping surrounding whitespace.
        mafft_cmd: Executable (name or path) used to invoke MAFFT.
        iqtree_cmd: Executable (name or path) used to invoke IQ-TREE.

    Returns:
        A 4-tuple ``(success, message, aligned_path, tree_path)``. On failure
        ``success`` is False and the paths are None, except when IQ-TREE
        produced no tree file, in which case the alignment path is still
        returned.
    """
    # Initialised up front so the ``finally`` clause can test it directly
    # instead of probing ``locals()``.
    query_fasta = None
    try:
        if len(sequence.strip()) < 100:
            return False, "Sequence too short (<100 bp).", None, None

        query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
        query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
        aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
        output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")

        if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
            return False, "Reference alignment or tree not found.", None, None

        query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
        SeqIO.write([query_record], query_fasta, "fasta")

        # MAFFT prints the combined alignment on stdout, so stdout is
        # redirected straight into the output file.
        with open(aligned_with_query, "w") as output_file:
            subprocess.run(
                [mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH],
                stdout=output_file,
                stderr=subprocess.PIPE,
                text=True,
                timeout=600,
                check=True,
            )
        if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
            return False, "MAFFT alignment failed.", None, None

        subprocess.run(
            [
                iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
                "-m", "GTR+G", "-pre", output_prefix, "-redo",
            ],
            capture_output=True,
            text=True,
            timeout=1200,
            check=True,
        )
        treefile = f"{output_prefix}.treefile"
        if not os.path.exists(treefile):
            return False, "IQ-TREE placement failed.", aligned_with_query, None

        success_msg = (
            f"Placement completed!\nQuery ID: {query_id}\n"
            f"Alignment: {os.path.basename(aligned_with_query)}\n"
            f"Tree: {os.path.basename(treefile)}"
        )
        return True, success_msg, aligned_with_query, treefile
    except subprocess.CalledProcessError as e:
        # The tool's stderr was captured; log it so the failure is diagnosable.
        logger.error(f"Phylogenetic placement failed: {e}; stderr: {e.stderr}", exc_info=True)
        return False, f"Error: {str(e)}", None, None
    except Exception as e:
        logger.error(f"Phylogenetic placement failed: {e}", exc_info=True)
        return False, f"Error: {str(e)}", None, None
    finally:
        # Best-effort removal of the temporary query FASTA.
        if query_fasta and os.path.exists(query_fasta):
            try:
                os.unlink(query_fasta)
            except Exception as e:
                logger.warning(f"Failed to clean up {query_fasta}: {e}")
 
215
 
216
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
217
  try:
218
  logger.debug("Starting tree analysis...")
219
  if not analyzer:
 
220
  return "❌ Tree analyzer not initialized.", None, None
 
221
  if not sequence or len(sequence.strip()) < 10:
 
222
  return "❌ Invalid sequence.", None, None
223
  if not (1 <= matching_percentage <= 99):
 
224
  return "❌ Matching percentage must be 1-99.", None, None
225
  logger.debug("Finding query sequence...")
226
  if not analyzer.find_query_sequence(sequence):
 
227
  return "❌ Sequence not accepted.", None, None
228
  logger.debug("Finding similar sequences...")
229
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
230
  if not matched_ids:
 
231
  return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
232
  logger.debug("Building tree structure...")
233
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
@@ -241,7 +257,7 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
241
  logger.debug("Generating detailed report...")
242
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
243
  report_html_path = os.path.join("/tmp", f'detailed_report_{query_id}.html') if report_success else None
244
- logger.debug(f"Tree analysis completed: {len(matched_ids)} matches")
245
  return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
246
  except Exception as e:
247
  logger.error(f"Tree analysis failed: {e}", exc_info=True)
@@ -249,16 +265,22 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
249
 
250
  def predict_with_keras(sequence):
251
  try:
 
252
  if not keras_model or not kmer_to_index:
 
253
  return "❌ Keras model not available."
254
  if len(sequence) < 6:
 
255
  return "❌ Sequence too short (<6 bp)."
 
256
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
257
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
258
  input_arr = np.array([indices])
 
259
  prediction = keras_model.predict(input_arr, verbose=0)[0]
260
  f_gene_prob = prediction[-1]
261
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
 
262
  return f"✅ {percentage}% F gene confidence"
263
  except Exception as e:
264
  logger.error(f"Keras prediction failed: {e}", exc_info=True)
@@ -266,7 +288,9 @@ def predict_with_keras(sequence):
266
 
267
  def read_fasta_file(file_obj):
268
  try:
 
269
  if file_obj is None:
 
270
  return ""
271
  if isinstance(file_obj, str):
272
  with open(file_obj, "r") as f:
@@ -275,20 +299,26 @@ def read_fasta_file(file_obj):
275
  content = file_obj.read().decode("utf-8")
276
  lines = content.strip().split("\n")
277
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
278
- return ''.join(seq_lines)
 
 
279
  except Exception as e:
280
  logger.error(f"Failed to read FASTA file: {e}", exc_info=True)
281
  return ""
282
 
283
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
284
  try:
 
285
  dna_input = dna_input.upper().strip()
286
  if not dna_input:
 
287
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input", None, None
288
  if not re.match('^[ACTGN]+$', dna_input):
 
289
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
290
  processed_sequence = dna_input
291
  boundary_output = ""
 
292
  if boundary_model:
293
  try:
294
  result = boundary_model.predict_sequence(dna_input)
@@ -296,19 +326,25 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
296
  if regions:
297
  processed_sequence = regions[0]["sequence"]
298
  boundary_output = f"✅ F gene region found: {len(processed_sequence)} bp"
 
299
  else:
300
  boundary_output = "⚠️ No F gene regions found."
301
  processed_sequence = dna_input
 
302
  except Exception as e:
303
  boundary_output = f"❌ Boundary prediction error: {str(e)}"
304
  processed_sequence = dna_input
 
305
  else:
306
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
 
 
307
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
308
  aligned_file = None
309
  phy_file = None
310
  ml_tree_output = ""
311
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
 
312
  try:
313
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
314
  if mafft_available and iqtree_available:
@@ -316,41 +352,53 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
316
  ml_tree_output = ml_message
317
  aligned_file = ml_aligned
318
  phy_file = ml_tree
 
319
  else:
320
  ml_tree_output = "❌ MAFFT or IQ-TREE not available"
 
321
  except Exception as e:
322
  ml_tree_output = f"❌ ML tree error: {str(e)}"
 
323
  elif build_ml_tree:
324
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
 
325
  else:
326
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
 
327
  tree_html_content = "No tree generated."
328
  report_html_content = "No report generated."
329
  tree_html_path = None
330
  report_html_path = None
331
  simplified_ml_output = ""
332
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
 
333
  try:
334
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
335
  simplified_ml_output = tree_result
336
  if tree_html_path and os.path.exists(tree_html_path):
337
  with open(tree_html_path, 'r', encoding='utf-8') as f:
338
  tree_html_content = f.read()
 
339
  else:
340
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
341
  if report_html_path and os.path.exists(report_html_path):
342
  with open(report_html_path, 'r', encoding='utf-8') as f:
343
  report_html_content = f.read()
 
344
  else:
345
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
 
346
  except Exception as e:
347
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
348
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
349
  report_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
 
350
  else:
351
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
352
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
353
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
 
354
  summary_output = f"""
355
  📊 ANALYSIS SUMMARY:
356
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -361,6 +409,7 @@ Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skippe
361
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
362
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
363
  """
 
364
  return (
365
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
366
  aligned_file, phy_file, None, None, tree_html_content, report_html_content,
@@ -371,10 +420,12 @@ Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
371
  error_msg = f"❌ Pipeline Error: {str(e)}"
372
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg, None, None
373
 
374
- async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_file):
375
  temp_file_path = None
376
  try:
 
377
  if fasta_file_obj is None:
 
378
  return "❌ No file provided", "", "", "", "", None, None, None, None, "No input", "No input", None, None
379
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
380
  if isinstance(fasta_file_obj, UploadFile):
@@ -385,9 +436,12 @@ async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_file
385
  content = f.read()
386
  temp_file.write(content)
387
  temp_file_path = temp_file.name
 
388
  dna_input = read_fasta_file(temp_file_path)
389
  if not dna_input:
 
390
  return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input", None, None
 
391
  return run_pipeline(dna_input, similarity_score, build_ml_tree)
392
  except Exception as e:
393
  logger.error(f"Pipeline from file error: {e}", exc_info=True)
@@ -397,8 +451,9 @@ async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_file
397
  if temp_file_path and os.path.exists(temp_file_path):
398
  try:
399
  os.unlink(temp_file_path)
 
400
  except Exception as e:
401
- logger.warning(f"Failed to delete temp file {temp_file_path}: {e}")
402
 
403
  # --- Pydantic Models ---
404
  class AnalysisRequest(BaseModel):
@@ -460,7 +515,9 @@ async def health_check():
460
  @app.post("/analyze", response_model=AnalysisResponse)
461
  async def analyze_sequence(request: AnalysisRequest):
462
  try:
 
463
  result = run_pipeline(request.sequence, request.similarity_score, request.build_ml_tree)
 
464
  return AnalysisResponse(
465
  boundary_output=result[0] or "",
466
  keras_output=result[1] or "",
@@ -488,11 +545,13 @@ async def analyze_file(
488
  ):
489
  temp_file_path = None
490
  try:
 
491
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
492
  content = await file.read()
493
  temp_file.write(content)
494
  temp_file_path = temp_file.name
495
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
 
496
  return AnalysisResponse(
497
  boundary_output=result[0] or "",
498
  keras_output=result[1] or "",
@@ -515,18 +574,23 @@ async def analyze_file(
515
  if temp_file_path and os.path.exists(temp_file_path):
516
  try:
517
  os.unlink(temp_file_path)
 
518
  except Exception as e:
519
- logger.warning(f"Failed to clean up {temp_file_path}: {e}")
520
 
521
  @app.get("/download/{file_type}/{query_id}")
522
  async def download_file(file_type: str, query_id: str):
523
  try:
 
524
  if file_type not in ["tree", "report"]:
 
525
  raise HTTPException(status_code=400, detail="Invalid file type. Use 'tree' or 'report'.")
526
  file_name = f"phylogenetic_tree_{query_id}.html" if file_type == "tree" else f"detailed_report_{query_id}.html"
527
  file_path = os.path.join("/tmp", file_name)
528
  if not os.path.exists(file_path):
 
529
  raise HTTPException(status_code=404, detail="File not found.")
 
530
  return FileResponse(file_path, filename=file_name, media_type="text/html")
531
  except Exception as e:
532
  logger.error(f"Download error: {e}", exc_info=True)
@@ -535,6 +599,7 @@ async def download_file(file_type: str, query_id: str):
535
  # --- Gradio Interface ---
536
  def create_gradio_interface():
537
  try:
 
538
  with gr.Blocks(
539
  title="🧬 Gene Analysis Pipeline",
540
  theme=gr.themes.Soft(),
@@ -566,7 +631,8 @@ def create_gradio_interface():
566
  dna_input = gr.Textbox(
567
  label="🧬 DNA Sequence",
568
  placeholder="Enter DNA sequence (ATCG format)...",
569
- lines=5
 
570
  )
571
  with gr.Column(scale=1):
572
  similarity_score = gr.Slider(
@@ -574,11 +640,13 @@ def create_gradio_interface():
574
  maximum=99,
575
  value=95.0,
576
  step=1.0,
577
- label="🎯 Similarity Threshold (%)"
 
578
  )
579
  build_ml_tree = gr.Checkbox(
580
  label="🌲 Build ML Tree",
581
- value=False
 
582
  )
583
  analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
584
  with gr.TabItem("📁 File Upload"):
@@ -586,7 +654,8 @@ def create_gradio_interface():
586
  with gr.Column(scale=2):
587
  file_input = gr.File(
588
  label="📄 Upload FASTA File",
589
- file_types=[".fasta", ".fa", ".fas", ".txt"]
 
590
  )
591
  with gr.Column(scale=1):
592
  file_similarity_score = gr.Slider(
@@ -594,22 +663,44 @@ def create_gradio_interface():
594
  maximum=99,
595
  value=95.0,
596
  step=1.0,
597
- label="🎯 Similarity Threshold (%)"
 
598
  )
599
  file_build_ml_tree = gr.Checkbox(
600
  label="🌲 Build ML Tree",
601
- value=False
 
602
  )
603
  analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
604
  gr.Markdown("## 📊 Analysis Results")
605
  with gr.Row():
606
  with gr.Column():
607
- boundary_output = gr.Textbox(label="🎯 Boundary Detection", interactive=False, lines=2)
608
- keras_output = gr.Textbox(label="🧠 F Gene Validation", interactive=False, lines=2)
 
 
 
 
 
 
 
 
609
  with gr.Column():
610
- ml_tree_output = gr.Textbox(label="🌲 Phylogenetic Placement", interactive=False, lines=2)
611
- tree_analysis_output = gr.Textbox(label="🌳 Tree Analysis", interactive=False, lines=2)
612
- summary_output = gr.Textbox(label="📋 Summary", interactive=False, lines=8)
 
 
 
 
 
 
 
 
 
 
 
 
613
  with gr.Row():
614
  aligned_file = gr.File(label="📄 Alignment File", visible=False)
615
  tree_file = gr.File(label="🌲 Tree File", visible=False)
@@ -617,9 +708,27 @@ def create_gradio_interface():
617
  report_html_file = gr.File(label="📊 Detailed Report HTML", visible=False)
618
  with gr.Tabs():
619
  with gr.TabItem("🌳 Interactive Tree"):
620
- tree_html = gr.HTML(value="<div style='text-align: center; color: #666; padding: 20px;'>No tree generated yet.</div>")
 
 
621
  with gr.TabItem("📊 Detailed Report"):
622
- report_html = gr.HTML(value="<div style='text-align: center; color: #666; padding: 20px;'>No report generated yet.</div>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
 
624
  analyze_btn.click(
625
  fn=run_pipeline,
@@ -627,7 +736,8 @@ def create_gradio_interface():
627
  outputs=[
628
  boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
629
  aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
630
- ]
 
631
  )
632
 
633
  analyze_file_btn.click(
@@ -636,18 +746,36 @@ def create_gradio_interface():
636
  outputs=[
637
  boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
638
  aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
639
- ]
 
640
  )
641
 
642
  gr.Examples(
643
  examples=[
644
- ["ATCG" * 100, 85.0, False],
645
- ["CGAT" * 100, 90.0, True]
646
  ],
647
  inputs=[dna_input, similarity_score, build_ml_tree],
648
  label="Example Sequences"
649
  )
650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  return iface
652
  except Exception as e:
653
  logger.error(f"Gradio interface creation failed: {e}", exc_info=True)
@@ -661,9 +789,12 @@ def create_gradio_interface():
661
  # --- Application Startup ---
662
  def run_application():
663
  try:
 
664
  gradio_app = create_gradio_interface()
665
  gradio_app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
666
  logger.info("🚀 Starting Gene Analysis Pipeline...")
 
 
667
  uvicorn.run(
668
  app,
669
  host="0.0.0.0",
 
39
  try:
40
  file_handler = logging.FileHandler('/tmp/app.log')
41
  file_handler.setFormatter(log_formatter)
42
+ logging.basicConfig(level=logging.DEBUG, handlers=[log_handler, file_handler]) # Changed to DEBUG
43
  except Exception as e:
44
+ logging.basicConfig(level=logging.DEBUG, handlers=[log_handler])
45
  logging.warning(f"Failed to set up file logging: {e}")
46
  logger = logging.getLogger(__name__)
47
  logger.info(f"Gradio version: {gr.__version__}")
 
82
  else:
83
  logger.error(f"❌ Boundary model file not found.")
84
  except Exception as e:
85
+ logger.error(f"❌ Failed to load boundary model: {e}", exc_info=True)
86
  boundary_model = None
87
  try:
88
  keras_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.keras", token=None)
 
95
  else:
96
  logger.error(f"❌ Keras model files not found.")
97
  except Exception as e:
98
+ logger.error(f"❌ Failed to load Keras model: {e}", exc_info=True)
99
  keras_model = None
100
  kmer_to_index = None
101
  try:
 
115
  csv_loaded = True
116
  break
117
  except Exception as e:
118
+ logger.warning(f"CSV load failed for {csv_candidate}: {e}", exc_info=True)
119
  if not csv_loaded:
120
  logger.error("❌ Failed to load CSV data.")
121
  analyzer = None
 
126
  else:
127
  logger.warning("⚠️ AI model training failed.")
128
  except Exception as e:
129
+ logger.warning(f"⚠️ AI model training failed: {e}", exc_info=True)
130
  except Exception as e:
131
+ logger.error(f"❌ Tree analyzer initialization failed: {e}", exc_info=True)
132
  analyzer = None
133
 
134
  load_models_safely()
 
141
  os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
142
  logger.info(f"Set executable permission on {binary}")
143
  except Exception as e:
144
+ logger.warning(f"Failed to set permission on {binary}: {e}", exc_info=True)
145
 
146
  def check_tool_availability():
147
  setup_binary_permissions()
 
177
 
178
  # --- Pipeline Functions ---
def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
    """Place a query sequence onto the reference tree using MAFFT and IQ-TREE.

    The query is written to a temporary FASTA file, added to the reference
    alignment with ``mafft --add``, and then placed with IQ-TREE constrained
    by the reference tree (``-g TREE_PATH``). Progress is logged at DEBUG
    level and failures at ERROR level.

    Args:
        sequence: Raw nucleotide sequence; must be at least 100 characters
            after stripping surrounding whitespace.
        mafft_cmd: Executable (name or path) used to invoke MAFFT.
        iqtree_cmd: Executable (name or path) used to invoke IQ-TREE.

    Returns:
        A 4-tuple ``(success, message, aligned_path, tree_path)``. On failure
        ``success`` is False and the paths are None, except when IQ-TREE
        produced no tree file, in which case the alignment path is still
        returned.
    """
    query_fasta = None
    try:
        if len(sequence.strip()) < 100:
            return False, "Sequence too short (<100 bp).", None, None

        query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
        query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
        aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
        # NOTE(review): this prefix yields files named "<query_id>_.treefile";
        # confirm the trailing underscore is intended.
        output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_")

        if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
            logger.error(f"Reference alignment or tree not found: {ALIGNMENT_PATH}, {TREE_PATH}")
            return False, "Reference alignment or tree not found.", None, None

        logger.debug(f"Writing query FASTA to: {query_fasta}")
        query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
        # SeqIO.write takes (records, handle_or_path, format_name).
        SeqIO.write([query_record], query_fasta, "fasta")

        logger.debug("Running MAFFT alignment...")
        # MAFFT prints the combined alignment on stdout, so stdout is
        # redirected straight into the output file. The handle must not be
        # named ``subprocess``: that would shadow the module used below.
        with open(aligned_with_query, "w") as output_file:
            subprocess.run(
                [mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH],
                stdout=output_file,
                stderr=subprocess.PIPE,
                text=True,
                timeout=600,  # seconds; subprocess.run has no ``timeout_ms``
                check=True,
            )
        if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
            logger.error(f"MAFFT alignment failed: {aligned_with_query}")
            return False, "MAFFT alignment failed.", None, None

        logger.debug("Running IQ-TREE placement...")
        subprocess.run(
            [
                iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
                "-m", "GTR+G", "-pre", output_prefix, "-redo",
            ],
            capture_output=True,
            text=True,
            timeout=1200,
            check=True,
        )
        treefile = f"{output_prefix}.treefile"
        if not os.path.exists(treefile):
            logger.error(f"IQ-TREE placement failed: {treefile} not found")
            return False, "IQ-TREE placement failed.", aligned_with_query, None

        success_msg = (
            f"Placement completed!\nQuery ID: {query_id}\n"
            f"Alignment: {os.path.basename(aligned_with_query)}\n"
            f"Tree: {os.path.basename(treefile)}"
        )
        logger.info(success_msg)
        return True, success_msg, aligned_with_query, treefile
    except subprocess.CalledProcessError as e:
        # The tool's stderr was captured; log it so the failure is diagnosable.
        logger.error(f"Phylogenetic placement failed: {e}; stderr: {e.stderr}", exc_info=True)
        return False, f"Error: {str(e)}", None, None
    except Exception as e:
        logger.error(f"Phylogenetic placement failed: {e}", exc_info=True)
        return False, f"Error: {str(e)}", None, None
    finally:
        # Best-effort removal of the temporary query FASTA.
        if query_fasta and os.path.exists(query_fasta):
            try:
                os.unlink(query_fasta)
                logger.debug(f"Cleaned up {query_fasta}")
            except Exception as e:
                logger.warning(f"Failed to clean up {query_fasta}: {e}", exc_info=True)
225
 
226
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
227
  try:
228
  logger.debug("Starting tree analysis...")
229
  if not analyzer:
230
+ logger.error("Tree analyzer not initialized")
231
  return "❌ Tree analyzer not initialized.", None, None
232
+ logger.debug("Validating sequence...")
233
  if not sequence or len(sequence.strip()) < 10:
234
+ logger.error("Invalid sequence: too short or empty")
235
  return "❌ Invalid sequence.", None, None
236
  if not (1 <= matching_percentage <= 99):
237
+ logger.error(f"Invalid matching percentage: {matching_percentage}")
238
  return "❌ Matching percentage must be 1-99.", None, None
239
  logger.debug("Finding query sequence...")
240
  if not analyzer.find_query_sequence(sequence):
241
+ logger.error("Sequence not accepted by analyzer")
242
  return "❌ Sequence not accepted.", None, None
243
  logger.debug("Finding similar sequences...")
244
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
245
  if not matched_ids:
246
+ logger.warning(f"No similar sequences found at {matching_percentage}% threshold")
247
  return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
248
  logger.debug("Building tree structure...")
249
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
 
257
  logger.debug("Generating detailed report...")
258
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
259
  report_html_path = os.path.join("/tmp", f'detailed_report_{query_id}.html') if report_success else None
260
+ logger.debug(f"Tree analysis completed: {len(matched_ids)} matches at {actual_percentage:.2f}%")
261
  return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
262
  except Exception as e:
263
  logger.error(f"Tree analysis failed: {e}", exc_info=True)
 
265
 
266
  def predict_with_keras(sequence):
267
  try:
268
+ logger.debug("Starting Keras prediction...")
269
  if not keras_model or not kmer_to_index:
270
+ logger.error("Keras model or kmer index not available")
271
  return "❌ Keras model not available."
272
  if len(sequence) < 6:
273
+ logger.error("Sequence too short for Keras prediction")
274
  return "❌ Sequence too short (<6 bp)."
275
+ logger.debug("Generating kmers...")
276
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
277
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
278
  input_arr = np.array([indices])
279
+ logger.debug("Running Keras prediction...")
280
  prediction = keras_model.predict(input_arr, verbose=0)[0]
281
  f_gene_prob = prediction[-1]
282
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
283
+ logger.debug(f"Keras prediction completed: {percentage}% confidence")
284
  return f"✅ {percentage}% F gene confidence"
285
  except Exception as e:
286
  logger.error(f"Keras prediction failed: {e}", exc_info=True)
 
288
 
289
  def read_fasta_file(file_obj):
290
  try:
291
+ logger.debug("Reading FASTA file...")
292
  if file_obj is None:
293
+ logger.error("No file object provided")
294
  return ""
295
  if isinstance(file_obj, str):
296
  with open(file_obj, "r") as f:
 
299
  content = file_obj.read().decode("utf-8")
300
  lines = content.strip().split("\n")
301
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
302
+ sequence = ''.join(seq_lines)
303
+ logger.debug(f"FASTA file read successfully: {len(sequence)} bp")
304
+ return sequence
305
  except Exception as e:
306
  logger.error(f"Failed to read FASTA file: {e}", exc_info=True)
307
  return ""
308
 
309
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
310
  try:
311
+ logger.debug("Starting pipeline...")
312
  dna_input = dna_input.upper().strip()
313
  if not dna_input:
314
+ logger.error("Empty input sequence")
315
  return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input", None, None
316
  if not re.match('^[ACTGN]+$', dna_input):
317
+ logger.debug("Cleaning invalid characters from input")
318
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
319
  processed_sequence = dna_input
320
  boundary_output = ""
321
+ logger.debug("Running boundary detection...")
322
  if boundary_model:
323
  try:
324
  result = boundary_model.predict_sequence(dna_input)
 
326
  if regions:
327
  processed_sequence = regions[0]["sequence"]
328
  boundary_output = f"✅ F gene region found: {len(processed_sequence)} bp"
329
+ logger.debug(f"Boundary detection: F gene found, {len(processed_sequence)} bp")
330
  else:
331
  boundary_output = "⚠️ No F gene regions found."
332
  processed_sequence = dna_input
333
+ logger.debug("Boundary detection: No F gene regions found")
334
  except Exception as e:
335
  boundary_output = f"❌ Boundary prediction error: {str(e)}"
336
  processed_sequence = dna_input
337
+ logger.error(f"Boundary prediction error: {e}", exc_info=True)
338
  else:
339
  boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
340
+ logger.warning("Boundary model not available")
341
+ logger.debug("Running Keras validation...")
342
  keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
343
  aligned_file = None
344
  phy_file = None
345
  ml_tree_output = ""
346
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
347
+ logger.debug("Running phylogenetic placement...")
348
  try:
349
  mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
350
  if mafft_available and iqtree_available:
 
352
  ml_tree_output = ml_message
353
  aligned_file = ml_aligned
354
  phy_file = ml_tree
355
+ logger.debug(f"Phylogenetic placement: {ml_message}")
356
  else:
357
  ml_tree_output = "❌ MAFFT or IQ-TREE not available"
358
+ logger.error("MAFFT or IQ-TREE not available")
359
  except Exception as e:
360
  ml_tree_output = f"❌ ML tree error: {str(e)}"
361
+ logger.error(f"ML tree error: {e}", exc_info=True)
362
  elif build_ml_tree:
363
  ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
364
+ logger.error("Sequence too short for phylogenetic placement")
365
  else:
366
  ml_tree_output = "⚠️ Phylogenetic placement skipped."
367
+ logger.debug("Phylogenetic placement skipped")
368
  tree_html_content = "No tree generated."
369
  report_html_content = "No report generated."
370
  tree_html_path = None
371
  report_html_path = None
372
  simplified_ml_output = ""
373
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
374
+ logger.debug("Running tree analysis...")
375
  try:
376
  tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
377
  simplified_ml_output = tree_result
378
  if tree_html_path and os.path.exists(tree_html_path):
379
  with open(tree_html_path, 'r', encoding='utf-8') as f:
380
  tree_html_content = f.read()
381
+ logger.debug(f"Tree HTML generated: {tree_html_path}")
382
  else:
383
  tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
384
+ logger.debug("No tree HTML generated")
385
  if report_html_path and os.path.exists(report_html_path):
386
  with open(report_html_path, 'r', encoding='utf-8') as f:
387
  report_html_content = f.read()
388
+ logger.debug(f"Report HTML generated: {report_html_path}")
389
  else:
390
  report_html_content = f"<div style='color: red;'>{tree_result}</div>"
391
+ logger.debug("No report HTML generated")
392
  except Exception as e:
393
  simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
394
  tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
395
  report_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
396
+ logger.error(f"Tree analysis error: {e}", exc_info=True)
397
  else:
398
  simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
399
  tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
400
  report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
401
+ logger.error(simplified_ml_output)
402
  summary_output = f"""
403
  📊 ANALYSIS SUMMARY:
404
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
409
  Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
410
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
411
  """
412
+ logger.debug("Pipeline completed successfully")
413
  return (
414
  boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
415
  aligned_file, phy_file, None, None, tree_html_content, report_html_content,
 
420
  error_msg = f"❌ Pipeline Error: {str(e)}"
421
  return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg, None, None
422
 
423
+ async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
424
  temp_file_path = None
425
  try:
426
+ logger.debug("Starting pipeline from file...")
427
  if fasta_file_obj is None:
428
+ logger.error("No file provided")
429
  return "❌ No file provided", "", "", "", "", None, None, None, None, "No input", "No input", None, None
430
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
431
  if isinstance(fasta_file_obj, UploadFile):
 
436
  content = f.read()
437
  temp_file.write(content)
438
  temp_file_path = temp_file.name
439
+ logger.debug(f"Reading FASTA file: {temp_file_path}")
440
  dna_input = read_fasta_file(temp_file_path)
441
  if not dna_input:
442
+ logger.error("Failed to read FASTA file")
443
  return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input", None, None
444
+ logger.debug("Running pipeline with FASTA input...")
445
  return run_pipeline(dna_input, similarity_score, build_ml_tree)
446
  except Exception as e:
447
  logger.error(f"Pipeline from file error: {e}", exc_info=True)
 
451
  if temp_file_path and os.path.exists(temp_file_path):
452
  try:
453
  os.unlink(temp_file_path)
454
+ logger.debug(f"Cleaned up temp file: {temp_file_path}")
455
  except Exception as e:
456
+ logger.warning(f"Failed to delete temp file {temp_file_path}: {e}", exc_info=True)
457
 
458
  # --- Pydantic Models ---
459
  class AnalysisRequest(BaseModel):
 
515
  @app.post("/analyze", response_model=AnalysisResponse)
516
  async def analyze_sequence(request: AnalysisRequest):
517
  try:
518
+ logger.debug("Starting sequence analysis via API...")
519
  result = run_pipeline(request.sequence, request.similarity_score, request.build_ml_tree)
520
+ logger.debug("API analysis completed")
521
  return AnalysisResponse(
522
  boundary_output=result[0] or "",
523
  keras_output=result[1] or "",
 
545
  ):
546
  temp_file_path = None
547
  try:
548
+ logger.debug("Starting file analysis via API...")
549
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
550
  content = await file.read()
551
  temp_file.write(content)
552
  temp_file_path = temp_file.name
553
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
554
+ logger.debug("API file analysis completed")
555
  return AnalysisResponse(
556
  boundary_output=result[0] or "",
557
  keras_output=result[1] or "",
 
574
  if temp_file_path and os.path.exists(temp_file_path):
575
  try:
576
  os.unlink(temp_file_path)
577
+ logger.debug(f"Cleaned up API temp file: {temp_file_path}")
578
  except Exception as e:
579
+ logger.warning(f"Failed to clean up {temp_file_path}: {e}", exc_info=True)
580
 
581
  @app.get("/download/{file_type}/{query_id}")
582
  async def download_file(file_type: str, query_id: str):
583
  try:
584
+ logger.debug(f"Downloading file: {file_type}/{query_id}")
585
  if file_type not in ["tree", "report"]:
586
+ logger.error(f"Invalid file type: {file_type}")
587
  raise HTTPException(status_code=400, detail="Invalid file type. Use 'tree' or 'report'.")
588
  file_name = f"phylogenetic_tree_{query_id}.html" if file_type == "tree" else f"detailed_report_{query_id}.html"
589
  file_path = os.path.join("/tmp", file_name)
590
  if not os.path.exists(file_path):
591
+ logger.error(f"File not found: {file_path}")
592
  raise HTTPException(status_code=404, detail="File not found.")
593
+ logger.debug(f"Serving file: {file_path}")
594
  return FileResponse(file_path, filename=file_name, media_type="text/html")
595
  except Exception as e:
596
  logger.error(f"Download error: {e}", exc_info=True)
 
599
  # --- Gradio Interface ---
600
  def create_gradio_interface():
601
  try:
602
+ logger.debug("Creating Gradio interface...")
603
  with gr.Blocks(
604
  title="🧬 Gene Analysis Pipeline",
605
  theme=gr.themes.Soft(),
 
631
  dna_input = gr.Textbox(
632
  label="🧬 DNA Sequence",
633
  placeholder="Enter DNA sequence (ATCG format)...",
634
+ lines=5,
635
+ description="Paste your DNA sequence here"
636
  )
637
  with gr.Column(scale=1):
638
  similarity_score = gr.Slider(
 
640
  maximum=99,
641
  value=95.0,
642
  step=1.0,
643
+ label="🎯 Similarity Threshold (%)",
644
+ description="Minimum similarity for tree analysis"
645
  )
646
  build_ml_tree = gr.Checkbox(
647
  label="🌲 Build ML Tree",
648
+ value=False,
649
+ description="Generate phylogenetic placement (slower)"
650
  )
651
  analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
652
  with gr.TabItem("📁 File Upload"):
 
654
  with gr.Column(scale=2):
655
  file_input = gr.File(
656
  label="📄 Upload FASTA File",
657
+ file_types=[".fasta", ".fa", ".fas", ".txt"],
658
+ description="Upload a FASTA file containing your sequence"
659
  )
660
  with gr.Column(scale=1):
661
  file_similarity_score = gr.Slider(
 
663
  maximum=99,
664
  value=95.0,
665
  step=1.0,
666
+ label="🎯 Similarity Threshold (%)",
667
+ description="Minimum similarity for tree analysis"
668
  )
669
  file_build_ml_tree = gr.Checkbox(
670
  label="🌲 Build ML Tree",
671
+ value=False,
672
+ description="Generate phylogenetic placement (slower)"
673
  )
674
  analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
675
  gr.Markdown("## 📊 Analysis Results")
676
  with gr.Row():
677
  with gr.Column():
678
+ boundary_output = gr.Textbox(
679
+ label="🎯 Boundary Detection",
680
+ interactive=False,
681
+ lines=2
682
+ )
683
+ keras_output = gr.Textbox(
684
+ label="🧠 F Gene Validation",
685
+ interactive=False,
686
+ lines=2
687
+ )
688
  with gr.Column():
689
+ ml_tree_output = gr.Textbox(
690
+ label="🌲 Phylogenetic Placement",
691
+ interactive=False,
692
+ lines=2
693
+ )
694
+ tree_analysis_output = gr.Textbox(
695
+ label="🌳 Tree Analysis",
696
+ interactive=False,
697
+ lines=2
698
+ )
699
+ summary_output = gr.Textbox(
700
+ label="📋 Summary",
701
+ interactive=False,
702
+ lines=8
703
+ )
704
  with gr.Row():
705
  aligned_file = gr.File(label="📄 Alignment File", visible=False)
706
  tree_file = gr.File(label="🌲 Tree File", visible=False)
 
708
  report_html_file = gr.File(label="📊 Detailed Report HTML", visible=False)
709
  with gr.Tabs():
710
  with gr.TabItem("🌳 Interactive Tree"):
711
+ tree_html = gr.HTML(
712
+ value="<div style='text-align: center; color: #666; padding: 20px;'>No tree generated yet. Run analysis to see results.</div>"
713
+ )
714
  with gr.TabItem("📊 Detailed Report"):
715
+ report_html = gr.HTML(
716
+ value="<div style='text-align: center; color: #666; padding: 20px;'>No report generated yet. Run analysis to see results.</div>"
717
+ )
718
+
719
+ # Event handlers
720
def handle_analysis_output(*outputs):
    """Reshape a 13-value pipeline result into 11 Gradio output values.

    Args:
        *outputs: Exactly 13 positional values, in order: boundary text,
            Keras text, ML-tree text, tree-analysis text, summary text,
            aligned file path, phy file path, two ignored placeholders,
            tree HTML content, report HTML content, tree HTML path and
            report HTML path.

    Returns:
        An 11-tuple: the five text values, four file-component updates
        (aligned, phy, tree HTML, report HTML) and the two HTML strings.
        Each file update is visible only when its path is not None.

    Raises:
        ValueError: If the number of values passed is not 13 (tuple unpacking).

    NOTE(review): the click wiring visible in this part of the file passes
    the pipeline function directly; confirm whether this adapter is meant
    to wrap it.
    """
    (
        boundary_text, keras_text, ml_tree_text, tree_analysis_text, summary_text,
        aligned_path, phy_path, _, _,
        tree_html_content, report_html_content,
        tree_html_path, report_html_path,
    ) = outputs
    logger.debug("Handling Gradio output...")

    def _file_update(path):
        # gr.update is the component-agnostic update helper; the per-component
        # gr.File.update classmethod is not available on Gradio 4+.
        return gr.update(value=path, visible=path is not None)

    return (
        boundary_text, keras_text, ml_tree_text, tree_analysis_text, summary_text,
        _file_update(aligned_path),
        _file_update(phy_path),
        _file_update(tree_html_path),
        _file_update(report_html_path),
        tree_html_content,
        report_html_content,
    )
732
 
733
  analyze_btn.click(
734
  fn=run_pipeline,
 
736
  outputs=[
737
  boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
738
  aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
739
+ ],
740
+ _js="""(outputs) => { return outputs; }"""
741
  )
742
 
743
  analyze_file_btn.click(
 
746
  outputs=[
747
  boundary_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
748
  aligned_file, tree_file, tree_html_file, report_html_file, tree_html, report_html
749
+ ],
750
+ _js="""(outputs) => { return outputs; }"""
751
  )
752
 
753
  gr.Examples(
754
  examples=[
755
+ ["ATCG" * 250, 85.0, False],
756
+ ["CGATCG" * 150, 90.0, True]
757
  ],
758
  inputs=[dna_input, similarity_score, build_ml_tree],
759
  label="Example Sequences"
760
  )
761
 
762
+ gr.Markdown("""
763
+ ## 📚 Instructions
764
+ 1. **Input**: Enter a DNA sequence (ATCG format) or upload a FASTA file
765
+ 2. **Parameters**:
766
+ - Set similarity threshold for phylogenetic analysis (1-99%)
767
+ - Choose whether to build ML tree (slower but more accurate)
768
+ 3. **Analysis**: Click analyze to run the complete pipeline
769
+ 4. **Results**: View results in different tabs - summary, tree visualization, and detailed report
770
+ 5. **Downloads**: Download alignment, tree, simplified tree HTML, and detailed report HTML files
771
+ ### 🔬 Pipeline Components:
772
+ - **Boundary Detection**: Identifies F gene regions
773
+ - **F Gene Validation**: Validates F gene using ML
774
+ - **Phylogenetic Placement**: Places sequence in reference tree (optional)
775
+ - **Tree Analysis**: Builds phylogenetic tree with similar sequences
776
+ """)
777
+
778
+ logger.debug("Gradio interface created successfully")
779
  return iface
780
  except Exception as e:
781
  logger.error(f"Gradio interface creation failed: {e}", exc_info=True)
 
789
  # --- Application Startup ---
790
  def run_application():
791
  try:
792
+ logger.debug("Starting application...")
793
  gradio_app = create_gradio_interface()
794
  gradio_app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
795
  logger.info("🚀 Starting Gene Analysis Pipeline...")
796
+ logger.info("📊 FastAPI docs available at: http://localhost:7860/docs")
797
+ logger.info("🧬 Gradio interface available at: http://localhost:7860/gradio")
798
  uvicorn.run(
799
  app,
800
  host="0.0.0.0",