re-type committed on
Commit
7537055
·
verified ·
1 Parent(s): d5239ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +515 -373
app.py CHANGED
@@ -22,8 +22,6 @@ from Bio.SeqRecord import SeqRecord
22
  import stat
23
  import time
24
  import asyncio
25
-
26
- # FastAPI imports
27
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
28
  from fastapi.responses import HTMLResponse
29
  from pydantic import BaseModel
@@ -36,19 +34,19 @@ try:
36
  except Exception:
37
  pass
38
 
 
 
 
39
  # --- Enhanced Logging ---
40
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
41
  log_handler = logging.StreamHandler()
42
  log_handler.setFormatter(log_formatter)
43
-
44
- # File handler with error handling
45
  try:
46
  file_handler = logging.FileHandler('/tmp/app.log')
47
  file_handler.setFormatter(log_formatter)
48
  logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
49
  except Exception:
50
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
51
-
52
  logger = logging.getLogger(__name__)
53
 
54
  # --- Global Variables ---
@@ -60,21 +58,13 @@ ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
60
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
61
  QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
62
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
 
 
63
 
64
  # --- Model Configuration ---
65
- boundary_model_repo = "GGproject10/best_boundary_aware_model"
66
- other_models_repo = "GGproject10/simplified_tree_AI"
67
-
68
- # Try multiple CSV locations
69
- csv_candidates = [
70
- os.path.join(BASE_DIR, "f_cleaned.csv"),
71
- os.path.join(BASE_DIR, "f cleaned.csv"),
72
- "f_cleaned.csv",
73
- os.path.join(BASE_DIR, "data", "f_cleaned.csv"),
74
- os.path.join(MODELS_DIR, "f_cleaned.csv")
75
- ]
76
-
77
- hf_token = os.getenv("HF_TOKEN")
78
 
79
  # Initialize models as None
80
  boundary_model = None
@@ -82,44 +72,35 @@ keras_model = None
82
  kmer_to_index = None
83
  analyzer = None
84
 
85
- # --- Create directories ---
86
- os.makedirs(MODELS_DIR, exist_ok=True)
87
- os.makedirs("/tmp/hf_cache", exist_ok=True)
88
-
89
- # --- Enhanced Model Loading ---
90
  def load_models_safely():
91
  global boundary_model, keras_model, kmer_to_index, analyzer
92
-
93
  logger.info(f"🔍 Looking for models in: {MODELS_DIR}")
94
  logger.info(f"📁 Models directory exists: {os.path.exists(MODELS_DIR)}")
95
- logger.info(f"🔑 HF_TOKEN available: {hf_token is not None}")
96
-
97
  # Load Boundary Model
98
  try:
99
  local_boundary_path = os.path.join(MODELS_DIR, "best_boundary_aware_model.pth")
100
-
101
  if os.path.exists(local_boundary_path):
102
  logger.info(f"✅ Loading boundary model from local: {local_boundary_path}")
103
  boundary_model = EnhancedGenePredictor(local_boundary_path)
104
  logger.info("✅ Boundary model loaded successfully")
105
- elif hf_token:
106
- logger.info("🌐 Downloading boundary model from HF...")
107
- try:
108
- boundary_path = hf_hub_download(
109
- repo_id=boundary_model_repo,
110
- filename="best_boundary_aware_model.pth",
111
- token=hf_token,
112
- cache_dir="/tmp/hf_cache",
113
- local_dir=MODELS_DIR,
114
- local_dir_use_symlinks=False
115
- )
116
- if os.path.exists(boundary_path):
117
- boundary_model = EnhancedGenePredictor(boundary_path)
118
- logger.info("✅ Boundary model downloaded and loaded")
119
- else:
120
- logger.warning("❌ Boundary model download failed")
121
- except Exception as e:
122
- logger.error(f"❌ HF download failed: {e}")
123
  else:
124
  logger.warning("❌ No boundary model found and no HF_TOKEN")
125
  except Exception as e:
@@ -130,43 +111,37 @@ def load_models_safely():
130
  try:
131
  local_keras_path = os.path.join(MODELS_DIR, "best_model.keras")
132
  local_kmer_path = os.path.join(MODELS_DIR, "kmer_to_index.pkl")
133
-
134
  if os.path.exists(local_keras_path) and os.path.exists(local_kmer_path):
135
- logger.info(f"��� Loading Keras model from local files")
136
  keras_model = load_model(local_keras_path)
137
  with open(local_kmer_path, "rb") as f:
138
  kmer_to_index = pickle.load(f)
139
  logger.info("✅ Keras model loaded successfully")
140
-
141
- elif hf_token:
142
- logger.info("🌐 Downloading Keras model from HF...")
143
- try:
144
- keras_path = hf_hub_download(
145
- repo_id=other_models_repo,
146
- filename="best_model.keras",
147
- token=hf_token,
148
- cache_dir="/tmp/hf_cache",
149
- local_dir=MODELS_DIR,
150
- local_dir_use_symlinks=False
151
- )
152
- kmer_path = hf_hub_download(
153
- repo_id=other_models_repo,
154
- filename="kmer_to_index.pkl",
155
- token=hf_token,
156
- cache_dir="/tmp/hf_cache",
157
- local_dir=MODELS_DIR,
158
- local_dir_use_symlinks=False
159
- )
160
-
161
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
162
- keras_model = load_model(keras_path)
163
- with open(kmer_path, "rb") as f:
164
- kmer_to_index = pickle.load(f)
165
- logger.info("✅ Keras model downloaded and loaded")
166
- else:
167
- logger.warning("❌ Keras model download failed")
168
- except Exception as e:
169
- logger.error(f"❌ Keras HF download failed: {e}")
170
  else:
171
  logger.warning("❌ No Keras model found and no HF_TOKEN")
172
  except Exception as e:
@@ -178,12 +153,18 @@ def load_models_safely():
178
  try:
179
  logger.info("🌳 Initializing tree analyzer...")
180
  analyzer = PhylogeneticTreeAnalyzer()
181
-
 
 
 
 
 
 
182
  csv_loaded = False
183
  for csv_candidate in csv_candidates:
184
  if os.path.exists(csv_candidate):
 
185
  try:
186
- logger.info(f"📊 Trying CSV: {csv_candidate}")
187
  if analyzer.load_data(csv_candidate):
188
  logger.info(f"✅ CSV loaded from: {csv_candidate}")
189
  csv_loaded = True
@@ -191,29 +172,27 @@ def load_models_safely():
191
  except Exception as e:
192
  logger.warning(f"CSV load failed for {csv_candidate}: {e}")
193
  continue
194
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  if not csv_loaded:
196
  logger.error("❌ No CSV data loaded")
197
- if hf_token:
198
- try:
199
- logger.info("🌐 Downloading CSV from HF...")
200
- csv_path = hf_hub_download(
201
- repo_id=other_models_repo,
202
- filename="f_cleaned.csv",
203
- token=hf_token,
204
- cache_dir="/tmp/hf_cache",
205
- local_dir=BASE_DIR,
206
- local_dir_use_symlinks=False
207
- )
208
- if analyzer.load_data(csv_path):
209
- logger.info("✅ CSV downloaded and loaded")
210
- csv_loaded = True
211
- except Exception as e:
212
- logger.error(f"❌ CSV HF download failed: {e}")
213
-
214
- if not csv_loaded:
215
- analyzer = None
216
-
217
  except Exception as e:
218
  logger.error(f"❌ Tree analyzer initialization failed: {e}")
219
  analyzer = None
@@ -233,187 +212,254 @@ def setup_binary_permissions():
233
 
234
  def check_tool_availability():
235
  setup_binary_permissions()
236
-
237
- # Check MAFFT
238
  mafft_available = False
239
  mafft_cmd = None
240
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
241
-
242
  for candidate in mafft_candidates:
243
  if shutil.which(candidate) or os.path.exists(candidate):
244
  try:
245
  result = subprocess.run(
246
- [candidate, "--help"],
247
- capture_output=True,
248
- text=True,
249
  timeout=5
250
  )
251
  if result.returncode == 0 or "mafft" in result.stderr.lower():
252
  mafft_available = True
253
  mafft_cmd = candidate
254
- logger.info(f"✅ MAFFT found: {candidate}")
255
  break
256
  except Exception as e:
257
  logger.debug(f"MAFFT test failed for {candidate}: {e}")
258
-
259
- # Check IQ-TREE
260
  iqtree_available = False
261
  iqtree_cmd = None
262
- iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', IQTREE_PATH]
263
-
264
  for candidate in iqtree_candidates:
265
  if shutil.which(candidate) or os.path.exists(candidate):
266
  try:
267
  result = subprocess.run(
268
- [candidate, "--help"],
269
- capture_output=True,
270
- text=True,
271
  timeout=5
272
  )
273
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
274
  iqtree_available = True
275
  iqtree_cmd = candidate
276
- logger.info(f"✅ IQ-TREE found: {candidate}")
277
  break
278
  except Exception as e:
279
  logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
280
-
281
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
282
 
283
- # --- Core Pipeline Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  def predict_with_keras(sequence):
285
  try:
286
  if not keras_model or not kmer_to_index:
287
- return "❌ Keras model not available"
288
-
289
  if len(sequence) < 6:
290
- return "❌ Sequence too short (<6 bp)"
291
-
292
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
293
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
294
  input_arr = np.array([indices])
295
-
296
  prediction = keras_model.predict(input_arr, verbose=0)[0]
297
  f_gene_prob = prediction[-1]
298
  percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
299
-
300
  return f"✅ {percentage}% F gene confidence"
301
  except Exception as e:
302
  logger.error(f"Keras prediction failed: {e}")
303
  return f"❌ Error: {str(e)}"
304
 
305
- def run_simple_pipeline(dna_input, similarity_score=95.0):
306
- """Simplified pipeline that avoids complex Gradio components"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  try:
308
- if not dna_input or not dna_input.strip():
309
- return "❌ Empty input"
310
-
311
  dna_input = dna_input.upper().strip()
312
-
313
- # Clean sequence
314
  if not re.match('^[ACTGN]+$', dna_input):
315
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
316
-
317
  processed_sequence = dna_input
318
- results = []
319
-
320
- # Boundary prediction
321
  if boundary_model:
322
  try:
323
  result = boundary_model.predict_sequence(dna_input)
324
  regions = result['gene_regions']
325
  if regions:
326
  processed_sequence = regions[0]["sequence"]
327
- results.append(f"✅ F gene region: {len(processed_sequence)} bp")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  else:
329
- results.append("⚠️ No F gene regions found")
330
  except Exception as e:
331
- results.append(f"❌ Boundary error: {str(e)}")
 
 
332
  else:
333
- results.append("⚠️ Boundary model not available")
334
-
335
- # Keras prediction
336
- keras_result = predict_with_keras(processed_sequence)
337
- results.append(keras_result)
338
-
339
- # Tree analysis
340
- if analyzer and len(processed_sequence) >= 10:
341
  try:
342
- tree_result = analyze_sequence_simple(processed_sequence, similarity_score)
343
- results.append(tree_result)
 
 
 
 
 
 
 
 
 
 
344
  except Exception as e:
345
- results.append(f"❌ Tree analysis error: {str(e)}")
 
 
346
  else:
347
- results.append("❌ Tree analyzer not available" if not analyzer else "❌ Sequence too short")
348
-
349
- # Summary
350
- summary = f"""
351
  📊 ANALYSIS SUMMARY:
352
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
353
  Input: {len(dna_input)} bp
354
  F Gene: {len(processed_sequence)} bp
355
- Results:
356
- {chr(10).join(f" - {r}" for r in results)}
 
357
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
358
  """
359
-
360
- return summary
361
-
 
362
  except Exception as e:
363
- logger.error(f"Simple pipeline error: {e}")
364
- return f"❌ Pipeline Error: {str(e)}"
 
365
 
366
- def analyze_sequence_simple(sequence: str, matching_percentage: float):
367
- """Simplified tree analysis"""
368
  try:
369
- if not analyzer:
370
- return "❌ Tree analyzer not available"
371
-
372
- if not analyzer.find_query_sequence(sequence):
373
- return "❌ Sequence not accepted"
374
-
375
- matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
376
- if not matched_ids:
377
- return f"❌ No similar sequences at {matching_percentage}% threshold"
378
-
379
- return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity"
380
-
381
  except Exception as e:
382
- logger.error(f"Simple tree analysis failed: {e}")
383
- return f"❌ Tree analysis error: {str(e)}"
384
-
385
- def read_fasta_simple(file_obj):
386
- """Simplified FASTA reader"""
387
- try:
388
- if file_obj is None:
389
- return ""
390
-
391
- if hasattr(file_obj, 'name'):
392
- with open(file_obj.name, "r") as f:
393
- content = f.read()
394
- else:
395
- content = file_obj.read()
396
- if isinstance(content, bytes):
397
- content = content.decode("utf-8")
398
-
399
- lines = content.strip().split("\n")
400
- seq_lines = [line.strip() for line in lines if not line.startswith(">")]
401
- return ''.join(seq_lines)
402
-
403
- except Exception as e:
404
- logger.error(f"FASTA read failed: {e}")
405
- return ""
406
-
407
- # --- FastAPI App Setup ---
408
- app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
409
 
410
  # --- Pydantic Models ---
411
  class AnalysisRequest(BaseModel):
412
  sequence: str
413
  similarity_score: float = 95.0
 
414
 
415
  class AnalysisResponse(BaseModel):
416
- result: str
 
 
 
 
417
  success: bool
418
  error_message: Optional[str] = None
419
 
@@ -427,7 +473,8 @@ async def root():
427
  "docs": "/docs",
428
  "health": "/health",
429
  "gradio": "/gradio",
430
- "analyze": "/analyze"
 
431
  }
432
  }
433
 
@@ -443,202 +490,297 @@ async def health_check():
443
  "tree_analyzer": analyzer is not None,
444
  "mafft_available": mafft_available,
445
  "iqtree_available": iqtree_available
 
 
 
 
 
 
 
446
  }
447
  }
448
  except Exception as e:
 
449
  return {"status": "unhealthy", "error": str(e)}
450
 
451
  @app.post("/analyze", response_model=AnalysisResponse)
452
  async def analyze_sequence(request: AnalysisRequest):
453
  try:
454
- result = run_simple_pipeline(request.sequence, request.similarity_score)
455
- return AnalysisResponse(result=result, success=True)
 
 
 
 
 
 
 
456
  except Exception as e:
457
- logger.error(f"API analyze error: {e}")
458
- return AnalysisResponse(result="", success=False, error_message=str(e))
 
 
 
 
459
 
460
- # --- Simplified Gradio Interface ---
461
- def create_simple_gradio_interface():
462
- """Create a simple, robust Gradio interface"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  try:
464
- # Get system status
465
- status_info = []
466
- status_info.append(f"🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}")
467
- status_info.append(f"🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}")
468
- status_info.append(f"🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}")
469
-
470
- mafft_available, iqtree_available, _, _ = check_tool_availability()
471
- status_info.append(f"🔬 MAFFT: {'✅ Available' if mafft_available else '❌ Missing'}")
472
- status_info.append(f"🔬 IQ-TREE: {'✅ Available' if iqtree_available else '❌ Missing'}")
473
-
474
- status_text = "\n".join(status_info)
475
-
476
  with gr.Blocks(
477
  title="🧬 Gene Analysis Pipeline",
478
- theme=gr.themes.Default()
479
- ) as interface:
480
-
 
 
 
 
 
 
481
  gr.Markdown("# 🧬 Gene Analysis Pipeline")
482
-
483
- # System status
484
- gr.Markdown("## 🔧 System Status")
485
- gr.Textbox(
486
- value=status_text,
487
- label="Component Status",
488
- lines=6,
489
- interactive=False
490
- )
491
-
492
- # Input section
493
- gr.Markdown("## 📝 Input")
494
-
495
- with gr.Tab("Text Input"):
496
- dna_input = gr.Textbox(
497
- label="🧬 DNA Sequence",
498
- placeholder="Enter DNA sequence (ATCG format)...",
499
- lines=5
500
- )
501
-
502
- with gr.Tab("File Upload"):
503
- fasta_file = gr.File(
504
- label="📄 Upload FASTA File",
505
- file_types=[".fasta", ".fa", ".txt"]
506
- )
507
-
508
- # Parameters
509
- similarity_slider = gr.Slider(
510
- minimum=1,
511
- maximum=99,
512
- value=95,
513
- step=1,
514
- label="🎯 Similarity Threshold (%)"
515
- )
516
-
517
- # Buttons
518
  with gr.Row():
519
- analyze_text_btn = gr.Button("🔍 Analyze Text", variant="primary")
520
- analyze_file_btn = gr.Button("📁 Analyze File", variant="secondary")
521
- clear_btn = gr.Button("🗑️ Clear")
522
-
523
- # Output
524
- gr.Markdown("## 📊 Results")
525
- output_text = gr.Textbox(
526
- label="Analysis Results",
527
- lines=15,
528
- interactive=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  )
530
-
531
- # Event handlers
532
- def analyze_text(sequence, similarity):
533
- if not sequence or not sequence.strip():
534
- return " Please enter a DNA sequence"
535
- return run_simple_pipeline(sequence, similarity)
536
-
537
- def analyze_file(file_obj, similarity):
538
- if file_obj is None:
539
- return " Please upload a file"
540
- sequence = read_fasta_simple(file_obj)
541
- if not sequence:
542
- return " Failed to read sequence from file"
543
- return run_simple_pipeline(sequence, similarity)
544
-
545
- def clear_all():
546
- return "", None, ""
547
-
548
- # Connect events
549
- analyze_text_btn.click(
550
- fn=analyze_text,
551
- inputs=[dna_input, similarity_slider],
552
- outputs=[output_text]
553
  )
554
-
555
  analyze_file_btn.click(
556
- fn=analyze_file,
557
- inputs=[fasta_file, similarity_slider],
558
- outputs=[output_text]
559
- )
560
-
561
- clear_btn.click(
562
- fn=clear_all,
563
- outputs=[dna_input, fasta_file, output_text]
564
  )
565
-
566
- # Examples
567
- gr.Markdown("## 🧪 Examples")
 
 
568
  gr.Examples(
569
- examples=[
570
- ["ATGAAACTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAGTGC", 95],
571
- ["ATGGAGCTGCAGCTGAGGTCCCTGGTGGTGAACAAGCTCAGCAGCAAGTGCTGAACTGGATGGGCGAGAAGAGCAACTGCATCCAGTGCAAGCGCCTGAAGAGGAACTGCAAGAAGGTGGTGGACCTGCAG", 85]
572
- ],
573
- inputs=[dna_input, similarity_slider]
574
  )
575
-
576
- # Info
577
- gr.Markdown("""
578
- ## ℹ️ About
579
-
580
- This tool analyzes DNA sequences for F gene characteristics using:
581
- - **Boundary Detection**: ML-based F gene region identification
582
- - **Keras Validation**: Neural network sequence validation
583
- - **Tree Analysis**: Phylogenetic similarity analysis
584
-
585
- **Requirements**: Sequences should be in ATCG format, minimum 10 bp recommended.
586
- """)
587
-
588
- return interface
589
-
 
 
 
 
 
 
 
590
  except Exception as e:
591
  logger.error(f"Failed to create Gradio interface: {e}")
592
-
593
- # Ultra-simple fallback interface
594
- with gr.Blocks() as fallback:
595
- gr.Markdown("# 🧬 Gene Analysis Pipeline (Safe Mode)")
596
- gr.Markdown(f"⚠️ Interface error: {str(e)}")
597
-
598
- sequence_input = gr.Textbox(label="DNA Sequence", lines=3)
599
- analyze_btn = gr.Button("Analyze")
600
- result_output = gr.Textbox(label="Result", lines=10)
601
-
602
- analyze_btn.click(
603
- fn=lambda seq: run_simple_pipeline(seq, 95.0),
604
- inputs=[sequence_input],
605
- outputs=[result_output]
606
- )
607
-
608
- return fallback
609
 
610
  # --- Application Startup ---
611
- if __name__ == "__main__":
612
  try:
613
- # Create simplified Gradio interface
614
- gr_interface = create_simple_gradio_interface()
615
-
616
- # Mount to FastAPI with error handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617
  try:
618
- gr_app = gr.mount_gradio_app(app, gr_interface, path="/gradio")
 
 
 
 
 
 
 
 
619
  except Exception as e:
620
- logger.error(f"Failed to mount Gradio: {e}")
621
- # Continue with just FastAPI
622
-
623
- # Log startup info
624
- logger.info("🚀 Starting Gene Analysis Pipeline...")
625
- logger.info(f"📁 Base directory: {BASE_DIR}")
626
- logger.info(f"🤖 Models: Boundary={boundary_model is not None}, Keras={keras_model is not None}")
627
- logger.info(f"🌳 Tree analyzer: {analyzer is not None}")
628
-
629
- # Start server
630
- logger.info("🌐 Server starting on http://0.0.0.0:7860")
631
- logger.info("📊 FastAPI docs: http://0.0.0.0:7860/docs")
632
- logger.info("🎮 Gradio: http://0.0.0.0:7860/gradio")
633
-
634
- uvicorn.run(
635
- app,
636
- host="0.0.0.0",
637
- port=7860,
638
- log_level="info"
639
- )
640
-
641
- except Exception as e:
642
- logger.error(f"❌ Startup failed: {e}")
643
- print(f"❌ Application failed to start: {e}")
644
- sys.exit(1)
 
22
  import stat
23
  import time
24
  import asyncio
 
 
25
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
26
  from fastapi.responses import HTMLResponse
27
  from pydantic import BaseModel
 
34
  except Exception:
35
  pass
36
 
37
+ # --- FastAPI App Setup ---
38
+ app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
39
+
40
  # --- Enhanced Logging ---
41
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
42
  log_handler = logging.StreamHandler()
43
  log_handler.setFormatter(log_formatter)
 
 
44
  try:
45
  file_handler = logging.FileHandler('/tmp/app.log')
46
  file_handler.setFormatter(log_formatter)
47
  logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
48
  except Exception:
49
  logging.basicConfig(level=logging.INFO, handlers=[log_handler])
 
50
  logger = logging.getLogger(__name__)
51
 
52
  # --- Global Variables ---
 
58
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
59
  QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
60
  os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
61
+ os.makedirs(MODELS_DIR, exist_ok=True)
62
+ os.makedirs("/tmp/hf_cache", exist_ok=True)
63
 
64
  # --- Model Configuration ---
65
+ BOUNDARY_MODEL_REPO = "GGproject10/best_boundary_aware_model"
66
+ OTHER_MODELS_REPO = "GGproject10/simplified_tree_AI"
67
+ HF_TOKEN = os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
 
 
 
68
 
69
  # Initialize models as None
70
  boundary_model = None
 
72
  kmer_to_index = None
73
  analyzer = None
74
 
75
+ # --- Model Loading ---
 
 
 
 
76
  def load_models_safely():
77
  global boundary_model, keras_model, kmer_to_index, analyzer
 
78
  logger.info(f"🔍 Looking for models in: {MODELS_DIR}")
79
  logger.info(f"📁 Models directory exists: {os.path.exists(MODELS_DIR)}")
80
+ logger.info(f"🔑 HF_TOKEN available: {HF_TOKEN is not None}")
81
+
82
  # Load Boundary Model
83
  try:
84
  local_boundary_path = os.path.join(MODELS_DIR, "best_boundary_aware_model.pth")
 
85
  if os.path.exists(local_boundary_path):
86
  logger.info(f"✅ Loading boundary model from local: {local_boundary_path}")
87
  boundary_model = EnhancedGenePredictor(local_boundary_path)
88
  logger.info("✅ Boundary model loaded successfully")
89
+ elif HF_TOKEN:
90
+ logger.info(f"🌐 Downloading boundary model from {BOUNDARY_MODEL_REPO}")
91
+ boundary_path = hf_hub_download(
92
+ repo_id=BOUNDARY_MODEL_REPO,
93
+ filename="best_boundary_aware_model.pth",
94
+ token=HF_TOKEN,
95
+ cache_dir="/tmp/hf_cache",
96
+ local_dir=MODELS_DIR,
97
+ local_dir_use_symlinks=False
98
+ )
99
+ if os.path.exists(boundary_path):
100
+ boundary_model = EnhancedGenePredictor(boundary_path)
101
+ logger.info("✅ Boundary model downloaded and loaded")
102
+ else:
103
+ logger.warning(f"❌ Boundary model download failed from {BOUNDARY_MODEL_REPO}")
 
 
 
104
  else:
105
  logger.warning("❌ No boundary model found and no HF_TOKEN")
106
  except Exception as e:
 
111
  try:
112
  local_keras_path = os.path.join(MODELS_DIR, "best_model.keras")
113
  local_kmer_path = os.path.join(MODELS_DIR, "kmer_to_index.pkl")
 
114
  if os.path.exists(local_keras_path) and os.path.exists(local_kmer_path):
115
+ logger.info(f" Loading Keras model from local: {local_keras_path}")
116
  keras_model = load_model(local_keras_path)
117
  with open(local_kmer_path, "rb") as f:
118
  kmer_to_index = pickle.load(f)
119
  logger.info("✅ Keras model loaded successfully")
120
+ elif HF_TOKEN:
121
+ logger.info(f"🌐 Downloading Keras model from {OTHER_MODELS_REPO}")
122
+ keras_path = hf_hub_download(
123
+ repo_id=OTHER_MODELS_REPO,
124
+ filename="best_model.keras",
125
+ token=HF_TOKEN,
126
+ cache_dir="/tmp/hf_cache",
127
+ local_dir=MODELS_DIR,
128
+ local_dir_use_symlinks=False
129
+ )
130
+ kmer_path = hf_hub_download(
131
+ repo_id=OTHER_MODELS_REPO,
132
+ filename="kmer_to_index.pkl",
133
+ token=HF_TOKEN,
134
+ cache_dir="/tmp/hf_cache",
135
+ local_dir=MODELS_DIR,
136
+ local_dir_use_symlinks=False
137
+ )
138
+ if os.path.exists(keras_path) and os.path.exists(kmer_path):
139
+ keras_model = load_model(keras_path)
140
+ with open(kmer_path, "rb") as f:
141
+ kmer_to_index = pickle.load(f)
142
+ logger.info("✅ Keras model downloaded and loaded")
143
+ else:
144
+ logger.warning(f"❌ Keras model download failed from {OTHER_MODELS_REPO}")
 
 
 
 
 
145
  else:
146
  logger.warning("❌ No Keras model found and no HF_TOKEN")
147
  except Exception as e:
 
153
  try:
154
  logger.info("🌳 Initializing tree analyzer...")
155
  analyzer = PhylogeneticTreeAnalyzer()
156
+ csv_candidates = [
157
+ os.path.join(BASE_DIR, "f_cleaned.csv"),
158
+ os.path.join(BASE_DIR, "f cleaned.csv"),
159
+ os.path.join(MODELS_DIR, "f_cleaned.csv"),
160
+ os.path.join(BASE_DIR, "data", "f_cleaned.csv"),
161
+ "f_cleaned.csv"
162
+ ]
163
  csv_loaded = False
164
  for csv_candidate in csv_candidates:
165
  if os.path.exists(csv_candidate):
166
+ logger.info(f"📊 Trying CSV: {csv_candidate}")
167
  try:
 
168
  if analyzer.load_data(csv_candidate):
169
  logger.info(f"✅ CSV loaded from: {csv_candidate}")
170
  csv_loaded = True
 
172
  except Exception as e:
173
  logger.warning(f"CSV load failed for {csv_candidate}: {e}")
174
  continue
175
+ if not csv_loaded and HF_TOKEN:
176
+ logger.info(f"🌐 Downloading CSV from {OTHER_MODELS_REPO}")
177
+ try:
178
+ csv_path = hf_hub_download(
179
+ repo_id=OTHER_MODELS_REPO,
180
+ filename="f_cleaned.csv",
181
+ token=HF_TOKEN,
182
+ cache_dir="/tmp/hf_cache",
183
+ local_dir=BASE_DIR,
184
+ local_dir_use_symlinks=False
185
+ )
186
+ if os.path.exists(csv_path) and analyzer.load_data(csv_path):
187
+ logger.info("✅ CSV downloaded and loaded")
188
+ csv_loaded = True
189
+ else:
190
+ logger.warning(f"❌ CSV download failed from {OTHER_MODELS_REPO}")
191
+ except Exception as e:
192
+ logger.error(f"❌ CSV HF download failed: {e}")
193
  if not csv_loaded:
194
  logger.error("❌ No CSV data loaded")
195
+ analyzer = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  except Exception as e:
197
  logger.error(f"❌ Tree analyzer initialization failed: {e}")
198
  analyzer = None
 
212
 
213
def _probe_tool(candidates, marker, label):
    """Return (found, command) for the first candidate binary that answers --help.

    A candidate counts as working when it exits 0 or prints its own name
    (`marker`) on stderr -- MAFFT and IQ-TREE print usage text to stderr
    with a non-zero exit code, so the return code alone is not enough.
    """
    for candidate in candidates:
        # Skip names that are neither on PATH nor present on disk.
        if not (shutil.which(candidate) or os.path.exists(candidate)):
            continue
        try:
            result = subprocess.run(
                [candidate, "--help"],
                capture_output=True,
                text=True,
                timeout=5
            )
        except Exception as e:
            logger.debug(f"{label} test failed for {candidate}: {e}")
            continue
        if result.returncode == 0 or marker in result.stderr.lower():
            logger.info(f"✅ {label} found at: {candidate}")
            return True, candidate
    return False, None

def check_tool_availability():
    """Locate working MAFFT and IQ-TREE executables.

    Ensures bundled binaries are executable first, then probes a list of
    well-known locations for each tool.

    Returns:
        Tuple (mafft_available, iqtree_available, mafft_cmd, iqtree_cmd);
        the *_cmd entries are the first working candidate, or None.
    """
    setup_binary_permissions()
    mafft_available, mafft_cmd = _probe_tool(
        ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH],
        "mafft",
        "MAFFT",
    )
    iqtree_available, iqtree_cmd = _probe_tool(
        ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH],
        "iqtree",
        "IQ-TREE",
    )
    return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
254
 
255
+ # --- Pipeline Functions ---
256
def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
    """Place a query sequence into the reference phylogenetic tree.

    Aligns the query against the reference alignment with MAFFT (--add),
    then runs a topology-constrained IQ-TREE search (-g) to place it.

    Args:
        sequence: Raw nucleotide sequence; must be >= 100 bp after stripping.
        mafft_cmd: MAFFT executable to invoke.
        iqtree_cmd: IQ-TREE executable to invoke.

    Returns:
        Tuple (success, message, aligned_fasta_path_or_None, treefile_path_or_None).
    """
    try:
        if len(sequence.strip()) < 100:
            return False, "Sequence too short (<100 bp).", None, None
        query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
        query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
        aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
        output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
        if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
            return False, "Reference alignment or tree not found.", None, None
        query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
        SeqIO.write([query_record], query_fasta, "fasta")
        # MAFFT --add keeps the reference alignment fixed and aligns only the query.
        with open(aligned_with_query, "w") as output_file:
            subprocess.run([
                mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
            ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
        if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
            return False, "MAFFT alignment failed.", None, None
        # -g constrains the search to the reference topology so the query is "placed".
        subprocess.run([
            iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
            "-m", "GTR+G", "-pre", output_prefix, "-redo"
        ], capture_output=True, text=True, timeout=1200, check=True)
        treefile = f"{output_prefix}.treefile"
        if not os.path.exists(treefile):
            return False, "IQ-TREE placement failed.", aligned_with_query, None
        success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
        return True, success_msg, aligned_with_query, treefile
    except Exception as e:
        logger.error(f"Phylogenetic placement failed: {e}")
        return False, f"Error: {str(e)}", None, None
    finally:
        # Best-effort cleanup of the temporary query FASTA.
        if 'query_fasta' in locals() and os.path.exists(query_fasta):
            try:
                os.unlink(query_fasta)
            except OSError:  # bug fix: was a bare `except:`; narrow to file errors
                pass
292
+
293
def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
    """Run the phylogenetic similarity analysis for one query sequence.

    Returns:
        Tuple (status_message, tree_html_path, report_html_path); the two
        paths are None whenever the corresponding artifact was not produced.
    """
    try:
        # Guard clauses: bail out early with a user-facing message.
        if not analyzer:
            return "❌ Tree analyzer not initialized.", None, None
        if not sequence or len(sequence.strip()) < 10:
            return "❌ Invalid sequence.", None, None
        if not (1 <= matching_percentage <= 99):
            return "❌ Matching percentage must be 1-99.", None, None
        if not analyzer.find_query_sequence(sequence):
            return "❌ Sequence not accepted.", None, None

        hit_ids, achieved_pct = analyzer.find_similar_sequences(matching_percentage)
        if not hit_ids:
            return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None

        # Build and render the interactive tree for the matched set.
        analyzer.build_tree_structure_with_ml_safe(hit_ids)
        tree_figure = analyzer.create_interactive_tree(hit_ids, achieved_pct)
        query_id = analyzer.query_id or f"query_{int(time.time())}"
        tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
        tree_figure.write_html(tree_html_path)

        # The detailed report is optional: only expose its path on success.
        analyzer.matching_percentage = matching_percentage
        report_success = analyzer.generate_detailed_report(hit_ids, achieved_pct)
        if report_success:
            report_html_path = os.path.join("/tmp", f'detailed_report_{query_id}.html')
        else:
            report_html_path = None

        return (f"✅ Found {len(hit_ids)} sequences at {achieved_pct:.2f}% similarity.",
                tree_html_path, report_html_path)
    except Exception as e:
        logger.error(f"Tree analysis failed: {e}")
        return f"❌ Error: {str(e)}", None, None
318
+
319
def predict_with_keras(sequence):
    """Score `sequence` with the 6-mer Keras classifier.

    Returns a human-readable status string: a confidence percentage on
    success, or an error/unavailable message.
    """
    try:
        if not keras_model or not kmer_to_index:
            return "❌ Keras model not available."
        if len(sequence) < 6:
            return "❌ Sequence too short (<6 bp)."
        # Encode every overlapping 6-mer through the lookup table (unknown -> 0).
        encoded = [kmer_to_index.get(sequence[i:i + 6], 0) for i in range(len(sequence) - 5)]
        batch = np.array([encoded])
        probs = keras_model.predict(batch, verbose=0)[0]
        # Last output unit is the F-gene class; add 5 then clamp into 0-100.
        confidence = min(100, max(0, int(probs[-1] * 100 + 5)))
        return f"✅ {confidence}% F gene confidence"
    except Exception as e:
        logger.error(f"Keras prediction failed: {e}")
        return f"❌ Error: {str(e)}"
335
 
336
def read_fasta_file(file_obj):
    """Extract the raw sequence from a FASTA file path or file-like object.

    Accepts a filesystem path (str) or an open file-like object. Header
    lines (starting with '>') are dropped and all remaining lines are
    concatenated into one sequence string.

    Returns the sequence, or "" when the input is missing or unreadable.

    Fix: the original unconditionally called ``.decode("utf-8")`` on
    ``file_obj.read()``, which raises AttributeError for text-mode file
    objects (read() returns str); the error was swallowed and the data
    silently lost. Bytes are now decoded only when actually present.
    """
    try:
        if file_obj is None:
            return ""
        if isinstance(file_obj, str):
            # Treat a string as a path on disk.
            with open(file_obj, "r") as f:
                content = f.read()
        else:
            content = file_obj.read()
            # Binary-mode uploads hand back bytes; decode only then.
            if isinstance(content, bytes):
                content = content.decode("utf-8")
        lines = content.strip().split("\n")
        # Keep only sequence lines; FASTA headers begin with '>'.
        seq_lines = [line.strip() for line in lines if not line.startswith(">")]
        return ''.join(seq_lines)
    except Exception as e:
        logger.error(f"Failed to read FASTA file: {e}")
        return ""
351
+
352
def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
    """Run the full analysis pipeline on a raw DNA string.

    Stages: input sanitization -> boundary detection (optional model) ->
    Keras F-gene validation -> optional MAFFT/IQ-TREE placement ->
    similarity-tree analysis with HTML report.

    Returns an 11-tuple consumed by both the Gradio UI and the REST API:
    (boundary_output, keras_output, ml_tree_output, simplified_ml_output,
     summary_output, aligned_file, phy_file, None, None,
     tree_html_content, report_html_content).
    Never raises: any failure is reported inside the tuple.
    """
    try:
        dna_input = dna_input.upper().strip()
        if not dna_input:
            return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
        # Replace any non-ACTGN character with the ambiguity code 'N'.
        if not re.match('^[ACTGN]+$', dna_input):
            dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
        processed_sequence = dna_input
        boundary_output = ""
        # Stage 1: boundary model extracts the first detected F-gene region;
        # any failure falls back to the full input sequence.
        if boundary_model:
            try:
                result = boundary_model.predict_sequence(dna_input)
                regions = result['gene_regions']
                if regions:
                    processed_sequence = regions[0]["sequence"]
                    boundary_output = f"✅ F gene region found: {len(processed_sequence)} bp"
                else:
                    boundary_output = "⚠️ No F gene regions found."
                    processed_sequence = dna_input
            except Exception as e:
                boundary_output = f"❌ Boundary prediction error: {str(e)}"
                processed_sequence = dna_input
        else:
            boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
        # Stage 2: Keras validation needs at least one 6-mer window.
        keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
        aligned_file = None
        phy_file = None
        ml_tree_output = ""
        # Stage 3: ML phylogenetic placement (opt-in, requires >=100 bp and
        # both external tools).
        if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
            try:
                mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
                if mafft_available and iqtree_available:
                    ml_success, ml_message, ml_aligned, ml_tree = phylogenetic_placement(processed_sequence, mafft_cmd, iqtree_cmd)
                    ml_tree_output = ml_message
                    aligned_file = ml_aligned
                    phy_file = ml_tree
                else:
                    ml_tree_output = " MAFFT or IQ-TREE not available"
            except Exception as e:
                ml_tree_output = f"❌ ML tree error: {str(e)}"
        elif build_ml_tree:
            ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
        else:
            ml_tree_output = "⚠️ Phylogenetic placement skipped."
        tree_html_content = "No tree generated."
        report_html_content = "No report generated."
        simplified_ml_output = ""
        # Stage 4: similarity-tree analysis; the generated HTML files are
        # read back into strings for embedding in the UI/response.
        if analyzer and processed_sequence and len(processed_sequence) >= 10:
            try:
                tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
                simplified_ml_output = tree_result
                if tree_html_path and os.path.exists(tree_html_path):
                    with open(tree_html_path, 'r', encoding='utf-8') as f:
                        tree_html_content = f.read()
                else:
                    tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
                if report_html_path and os.path.exists(report_html_path):
                    with open(report_html_path, 'r', encoding='utf-8') as f:
                        report_html_content = f.read()
                else:
                    report_html_content = f"<div style='color: red;'>{tree_result}</div>"
            except Exception as e:
                simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
                tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
                report_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
        else:
            simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
            tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
            report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
        # Summary is derived by substring-matching the stage messages above,
        # so the emoji/keywords in those strings are load-bearing.
        summary_output = f"""
📊 ANALYSIS SUMMARY:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Input: {len(dna_input)} bp
F Gene: {len(processed_sequence)} bp
Validation: {keras_output.split(':')[-1].strip() if ':' in keras_output else keras_output}
Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skipped' in ml_tree_output else '❌ Failed'}
Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""
        return (
            boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
            aligned_file, phy_file, None, None, tree_html_content, report_html_content
        )
    except Exception as e:
        logger.error(f"Pipeline error: {e}")
        error_msg = f"❌ Pipeline Error: {str(e)}"
        return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
439
 
440
async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
    """Read a FASTA upload and feed its sequence through run_pipeline().

    Returns the same 11-tuple as run_pipeline(); failures are reported
    inside the tuple rather than raised.
    """
    try:
        sequence = read_fasta_file(fasta_file_obj)
        if not sequence:
            return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input"
        return run_pipeline(sequence, similarity_score, build_ml_tree)
    except Exception as e:
        logger.error(f"Pipeline from file error: {e}")
        error_msg = f"❌ Error: {str(e)}"
        return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
  # --- Pydantic Models ---
452
class AnalysisRequest(BaseModel):
    """Request body for POST /analyze."""
    sequence: str  # raw DNA sequence (ACTGN; other characters are coerced to N)
    similarity_score: float = 95.0  # similarity threshold (%) for tree matching
    build_ml_tree: bool = False  # also run MAFFT/IQ-TREE phylogenetic placement
456
 
457
class AnalysisResponse(BaseModel):
    """Response body for /analyze and /analyze-file."""
    boundary_output: str  # F-gene boundary detection message
    keras_output: str  # Keras validation/confidence message
    ml_tree_output: str  # phylogenetic placement status message
    tree_analysis_output: str  # similarity-tree analysis status message
    summary_output: str  # human-readable overall summary
    success: bool  # False only when the request handler itself raised
    error_message: Optional[str] = None  # exception text when success is False
465
 
 
473
  "docs": "/docs",
474
  "health": "/health",
475
  "gradio": "/gradio",
476
+ "analyze": "/analyze",
477
+ "analyze_file": "/analyze-file"
478
  }
479
  }
480
 
 
490
  "tree_analyzer": analyzer is not None,
491
  "mafft_available": mafft_available,
492
  "iqtree_available": iqtree_available
493
+ },
494
+ "paths": {
495
+ "base_dir": BASE_DIR,
496
+ "models_dir": MODELS_DIR,
497
+ "hf_cache": "/tmp/hf_cache",
498
+ "models_dir_exists": os.path.exists(MODELS_DIR),
499
+ "hf_cache_exists": os.path.exists("/tmp/hf_cache")
500
  }
501
  }
502
  except Exception as e:
503
+ logger.error(f"Health check error: {e}")
504
  return {"status": "unhealthy", "error": str(e)}
505
 
506
@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_sequence(request: AnalysisRequest):
    """Run the full analysis pipeline on a JSON-supplied sequence."""
    try:
        # run_pipeline returns an 11-tuple; the API exposes the first five
        # (text) fields only.
        outputs = run_pipeline(request.sequence, request.similarity_score, request.build_ml_tree)
        boundary, keras, ml_tree, tree_analysis, summary = outputs[:5]
        return AnalysisResponse(
            boundary_output=boundary or "",
            keras_output=keras or "",
            ml_tree_output=ml_tree or "",
            tree_analysis_output=tree_analysis or "",
            summary_output=summary or "",
            success=True
        )
    except Exception as e:
        logger.error(f"Analyze error: {e}")
        return AnalysisResponse(
            boundary_output="", keras_output="", ml_tree_output="",
            tree_analysis_output="", summary_output="",
            success=False, error_message=str(e)
        )
525
 
526
@app.post("/analyze-file")
async def analyze_file(
    file: UploadFile = File(...),
    similarity_score: float = Form(95.0),
    build_ml_tree: bool = Form(False)
):
    """Run the analysis pipeline on an uploaded FASTA file.

    The upload is spooled to a temporary file under /tmp, analyzed via
    run_pipeline_from_file(), and the temp file is always removed.

    Fix: the cleanup used a bare ``except:`` which also swallows
    SystemExit/KeyboardInterrupt; it now catches OSError only.
    """
    temp_file_path = None
    try:
        # Persist the upload so downstream path-based readers can open it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
            content = await file.read()
            temp_file.write(content)
            temp_file_path = temp_file.name
        result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
        return AnalysisResponse(
            boundary_output=result[0] or "",
            keras_output=result[1] or "",
            ml_tree_output=result[2] or "",
            tree_analysis_output=result[3] or "",
            summary_output=result[4] or "",
            success=True
        )
    except Exception as e:
        logger.error(f"Analyze-file error: {e}")
        return AnalysisResponse(
            boundary_output="", keras_output="", ml_tree_output="",
            tree_analysis_output="", summary_output="",
            success=False, error_message=str(e)
        )
    finally:
        # Best-effort cleanup of the spooled upload.
        if temp_file_path and os.path.exists(temp_file_path):
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass
560
+
561
+ # --- Gradio Interface ---
562
def create_gradio_interface():
    """Build the Gradio Blocks UI for the pipeline.

    Returns the Blocks app on success, or None if construction fails
    (the caller logs the failure and skips mounting).
    """
    try:
        with gr.Blocks(
            title="🧬 Gene Analysis Pipeline",
            theme=gr.themes.Soft(),
            css="""
            .gradio-container { max-width: 1200px !important; }
            .status-box { padding: 10px; border-radius: 5px; margin: 5px 0; }
            .success { background-color: #d4edda; border: 1px solid #c3e6cb; color: #155724; }
            .warning { background-color: #fff3cd; border: 1px solid #ffeaa7; color: #856404; }
            .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
            """
        ) as iface:
            gr.Markdown("# 🧬 Gene Analysis Pipeline")
            # Status panel rendered once at UI build time.
            # NOTE(review): check_tool_availability() is invoked twice below;
            # harmless but could be hoisted to a single call.
            with gr.Row():
                with gr.Column():
                    status_display = gr.HTML(value=f"""
                    <div class="status-box">
                    <h3>🔧 System Status</h3>
                    <p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
                    <p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
                    <p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}</p>
                    <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
                    <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
                    </div>
                    """)
            # Two input modes: pasted text or an uploaded FASTA file.
            with gr.Tabs() as tabs:
                with gr.TabItem("📝 Text Input"):
                    with gr.Row():
                        with gr.Column(scale=2):
                            dna_input = gr.Textbox(
                                label="🧬 DNA Sequence",
                                placeholder="Enter DNA sequence (ATCG format)...",
                                lines=5,
                                info="Paste your DNA sequence here"
                            )
                        with gr.Column(scale=1):
                            similarity_score = gr.Slider(
                                minimum=1,
                                maximum=99,
                                value=95.0,
                                step=1.0,
                                label="🎯 Similarity Threshold (%)",
                                info="Minimum similarity for tree analysis"
                            )
                            build_ml_tree = gr.Checkbox(
                                label="🌲 Build ML Tree",
                                value=False,
                                info="Generate phylogenetic placement (slower)"
                            )
                            analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
                with gr.TabItem("📁 File Upload"):
                    with gr.Row():
                        with gr.Column(scale=2):
                            # NOTE(review): gr.File may not accept an `info`
                            # kwarg in all Gradio versions — confirm against
                            # the pinned gradio release.
                            file_input = gr.File(
                                label="📄 Upload FASTA File",
                                file_types=[".fasta", ".fa", ".fas", ".txt"],
                                info="Upload a FASTA file containing your sequence"
                            )
                        with gr.Column(scale=1):
                            file_similarity_score = gr.Slider(
                                minimum=1,
                                maximum=99,
                                value=95.0,
                                step=1.0,
                                label="🎯 Similarity Threshold (%)"
                            )
                            file_build_ml_tree = gr.Checkbox(
                                label="🌲 Build ML Tree",
                                value=False
                            )
                            analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
            # Shared output widgets for both input modes.
            gr.Markdown("## 📊 Analysis Results")
            with gr.Row():
                with gr.Column():
                    boundary_output = gr.Textbox(
                        label="🎯 Boundary Detection",
                        interactive=False,
                        lines=2
                    )
                    keras_output = gr.Textbox(
                        label="🧠 F Gene Validation",
                        interactive=False,
                        lines=2
                    )
                with gr.Column():
                    ml_tree_output = gr.Textbox(
                        label="🌲 Phylogenetic Placement",
                        interactive=False,
                        lines=2
                    )
                    tree_analysis_output = gr.Textbox(
                        label="🌳 Tree Analysis",
                        interactive=False,
                        lines=2
                    )
                    summary_output = gr.Textbox(
                        label="📋 Summary",
                        interactive=False,
                        lines=8
                    )
            with gr.Row():
                aligned_file = gr.File(label="📄 Alignment File", visible=False)
                tree_file = gr.File(label="🌲 Tree File", visible=False)
            with gr.Tabs():
                with gr.TabItem("🌳 Interactive Tree"):
                    tree_html = gr.HTML(
                        label="Phylogenetic Tree",
                        value="<div style='text-align: center; padding: 20px; color: #666;'>No tree generated yet.</div>"
                    )
                with gr.TabItem("📊 Detailed Report"):
                    report_html = gr.HTML(
                        label="Analysis Report",
                        value="<div style='text-align: center; padding: 20px; color: #666;'>No report generated yet.</div>"
                    )
            # run_pipeline returns 11 values; slots 8-9 are unused here and
            # absorbed by throwaway gr.State() outputs.
            analyze_btn.click(
                fn=run_pipeline,
                inputs=[dna_input, similarity_score, build_ml_tree],
                outputs=[
                    boundary_output, keras_output, ml_tree_output,
                    tree_analysis_output, summary_output,
                    aligned_file, tree_file, gr.State(), gr.State(),
                    tree_html, report_html
                ]
            )
            analyze_file_btn.click(
                fn=run_pipeline_from_file,
                inputs=[file_input, file_similarity_score, file_build_ml_tree],
                outputs=[
                    boundary_output, keras_output, ml_tree_output,
                    tree_analysis_output, summary_output,
                    aligned_file, tree_file, gr.State(), gr.State(),
                    tree_html, report_html
                ]
            )
            gr.Markdown("## 🔬 Example Sequences")
            # Each example: [sequence, similarity threshold, build ML tree].
            example_sequences = [
                ["ATGGACTTCCAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 90.0, False],
                ["ATGAAACAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 85.0, True]
            ]
            gr.Examples(
                examples=example_sequences,
                inputs=[dna_input, similarity_score, build_ml_tree],
                label="Click to load example sequences"
            )
            with gr.Accordion("❓ Help & Information", open=False):
                gr.Markdown("""
                ### 🧬 Gene Analysis Pipeline
                This tool performs comprehensive analysis of F gene sequences:
                **🎯 Boundary Detection**: Identifies F gene regions within your sequence
                **🧠 F Gene Validation**: Validates sequence as F gene using deep learning
                **🌲 Phylogenetic Placement**: Places sequence in reference phylogeny
                **🌳 Tree Analysis**: Finds similar sequences and builds interactive trees
                ### 📋 Input Requirements
                - DNA sequences in ATCG format
                - Minimum 10 bp for basic analysis
                - Minimum 100 bp for phylogenetic placement
                - FASTA files supported for upload
                ### ⚙️ Parameters
                - **Similarity Threshold**: Minimum % similarity for tree analysis (1-99%)
                - **Build ML Tree**: Enable phylogenetic placement (requires MAFFT/IQ-TREE)
                ### 📊 Output Files
                - Alignment files (.fa format)
                - Tree files (.treefile format)
                - Interactive HTML visualizations
                """)
        return iface
    except Exception as e:
        logger.error(f"Failed to create Gradio interface: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
732
 
733
  # --- Application Startup ---
734
def mount_gradio_app():
    """Create the Gradio UI and mount it on the FastAPI app at /gradio.

    Fix: the original did ``app = gr.mount_gradio_app(app, ...)`` without
    declaring ``app`` global. The assignment makes ``app`` function-local,
    so reading it as the first argument raises UnboundLocalError and the
    Gradio UI was never mounted. Declaring ``global app`` restores the
    intended rebinding of the module-level FastAPI app.
    """
    global app  # rebind the module-level FastAPI app with the mounted UI
    try:
        gradio_app = create_gradio_interface()
        if gradio_app:
            app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
            logger.info("✅ Gradio interface mounted at /gradio")
        else:
            logger.error("❌ Failed to create Gradio interface")
    except Exception as e:
        logger.error(f"❌ Failed to mount Gradio app: {e}")

# Initialize Gradio at import time so /gradio is available under uvicorn.
mount_gradio_app()
747
+
748
+ # --- Main Application ---
749
if __name__ == "__main__":
    # CLI entry point: either launch the standalone Gradio UI, or serve the
    # combined FastAPI + mounted-Gradio app via uvicorn.
    import argparse
    parser = argparse.ArgumentParser(description="🧬 Gene Analysis Pipeline")
    parser.add_argument("--host", default="0.0.0.0", help="Host address")
    parser.add_argument("--port", type=int, default=7860, help="Port number")
    parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
    parser.add_argument("--gradio-only", action="store_true", help="Run Gradio interface only")
    args = parser.parse_args()
    if args.gradio_only:
        logger.info("🚀 Starting Gradio interface only...")
        iface = create_gradio_interface()
        if iface:
            iface.launch(
                server_name=args.host,
                server_port=args.port,
                share=False,
                show_error=True
            )
        else:
            logger.error("❌ Failed to create Gradio interface")
            sys.exit(1)
    else:
        logger.info(f"🚀 Starting Gene Analysis Pipeline on {args.host}:{args.port}")
        logger.info("📊 API Documentation: http://localhost:7860/docs")
        logger.info("🧬 Gradio Interface: http://localhost:7860/gradio")
        try:
            # uvicorn's reload mode requires an import string, not an object.
            uvicorn.run(
                "app:app" if args.reload else app,
                host=args.host,
                port=args.port,
                reload=args.reload,
                log_level="info"
            )
        except KeyboardInterrupt:
            logger.info("🛑 Application stopped by user")
        except Exception as e:
            logger.error(f" Application failed: {e}")
            sys.exit(1)