re-type commited on
Commit
574dbbb
·
verified ·
1 Parent(s): 103437c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +585 -320
app.py CHANGED
@@ -22,47 +22,37 @@ from Bio.SeqRecord import SeqRecord
22
  import stat
23
  import time
24
  import asyncio
25
- from fastapi import FastAPI, File, UploadFile, Form, Request
 
 
26
  from fastapi.responses import HTMLResponse
27
- from fastapi.middleware.cors import CORSMiddleware
28
  from pydantic import BaseModel
29
  from typing import Optional
30
  import uvicorn
31
 
32
  # Set event loop policy for Spaces
33
- asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
 
 
 
34
 
35
  # --- FastAPI App Setup ---
36
  app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
37
 
38
- # Add CORS and logging middleware
39
- app.add_middleware(
40
- CORSMiddleware,
41
- allow_origins=["*"],
42
- allow_credentials=True,
43
- allow_methods=["*"],
44
- allow_headers=["*"],
45
- )
46
- @app.middleware("http")
47
- async def log_requests(request: Request, call_next):
48
- logging.debug(f"Request: {request.method} {request.url}")
49
- try:
50
- response = await call_next(request)
51
- logging.debug(f"Response: {response.status_code}")
52
- return response
53
- except Exception as e:
54
- logging.error(f"Request error: {e}", exc_info=True)
55
- raise
56
 
57
- # --- Logging ---
58
- logging.basicConfig(
59
- level=logging.DEBUG,
60
- format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
61
- handlers=[logging.StreamHandler(), logging.FileHandler('/tmp/app.log')]
62
- )
63
- logging.getLogger('uvicorn').setLevel(logging.DEBUG)
64
- logging.getLogger('fastapi').setLevel(logging.DEBUG)
65
- logging.getLogger('gradio').setLevel(logging.DEBUG)
66
 
67
  # --- Global Variables ---
68
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -78,50 +68,100 @@ model_repo = "GGproject10/best_boundary_aware_model"
78
  csv_path = os.path.join(BASE_DIR, "f_cleaned.csv")
79
  hf_token = os.getenv("HF_TOKEN")
80
 
81
- # --- Load Models ---
82
  boundary_model = None
83
  keras_model = None
84
  kmer_to_index = None
85
- try:
86
- boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth", token=hf_token, cache_dir="/tmp/hf_cache")
87
- if os.path.exists(boundary_path):
88
- boundary_model = EnhancedGenePredictor(boundary_path)
89
- logging.info("Boundary model loaded.")
90
- else:
91
- logging.warning("Boundary model not found.")
92
- except Exception as e:
93
- logging.error(f"Failed to load boundary model: {e}", exc_info=True)
94
 
95
- try:
96
- keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras", token=hf_token, cache_dir="/tmp/hf_cache")
97
- kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl", token=hf_token, cache_dir="/tmp/hf_cache")
98
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
99
- keras_model = load_model(keras_path)
100
- with open(kmer_path, "rb") as f:
101
- kmer_to_index = pickle.load(f)
102
- logging.info("Keras model loaded.")
103
- else:
104
- logging.warning("Keras model not found.")
105
- except Exception as e:
106
- logging.error(f"Failed to load Keras model: {e}", exc_info=True)
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- # --- Initialize Tree Analyzer ---
109
- analyzer = None
110
- try:
111
- analyzer = PhylogeneticTreeAnalyzer()
112
- csv_candidates = [csv_path, os.path.join(BASE_DIR, "f cleaned.csv"), "f_cleaned.csv"]
113
- csv_loaded = False
114
- for csv_candidate in csv_candidates:
115
- if os.path.exists(csv_candidate):
116
- if analyzer.load_data(csv_candidate):
117
- logging.info(f"Tree analyzer loaded from: {csv_candidate}")
118
- csv_loaded = True
119
- break
120
- if not csv_loaded:
121
- logging.error("Failed to load CSV data.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  analyzer = None
123
- except Exception as e:
124
- logging.error(f"Failed to initialize tree analyzer: {e}", exc_info=True)
 
125
 
126
  # --- Tool Detection ---
127
  def setup_binary_permissions():
@@ -129,153 +169,301 @@ def setup_binary_permissions():
129
  if os.path.exists(binary):
130
  try:
131
  os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
132
- logging.info(f"Set executable permission on {binary}")
133
  except Exception as e:
134
- logging.warning(f"Failed to set permission on {binary}: {e}")
135
 
136
  def check_tool_availability():
137
  setup_binary_permissions()
 
 
138
  mafft_available = False
139
  mafft_cmd = None
140
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
 
141
  for candidate in mafft_candidates:
142
- if shutil.which(candidate):
143
  try:
144
- result = subprocess.run([candidate, "--help"], capture_output=True, text=True, timeout=5)
 
 
 
 
 
145
  if result.returncode == 0 or "mafft" in result.stderr.lower():
146
  mafft_available = True
147
  mafft_cmd = candidate
148
- logging.info(f"MAFFT found at: {candidate}")
149
  break
150
  except Exception as e:
151
- logging.debug(f"MAFFT test failed for {candidate}: {e}")
 
 
152
  iqtree_available = False
153
  iqtree_cmd = None
154
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
 
155
  for candidate in iqtree_candidates:
156
- if shutil.which(candidate):
157
  try:
158
- result = subprocess.run([candidate, "--help"], capture_output=True, text=True, timeout=5)
 
 
 
 
 
159
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
160
  iqtree_available = True
161
  iqtree_cmd = candidate
162
- logging.info(f"IQ-TREE found at: {candidate}")
163
  break
164
  except Exception as e:
165
- logging.debug(f"IQ-TREE test failed for {candidate}: {e}")
 
166
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
167
 
168
- # --- Pipeline Functions ---
169
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
170
  try:
171
  if len(sequence.strip()) < 100:
172
  return False, "Sequence too short (<100 bp).", None, None
 
173
  query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
174
  query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
175
  aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
176
  output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
 
177
  if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
178
  return False, "Reference alignment or tree not found.", None, None
 
179
  query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
180
  SeqIO.write([query_record], query_fasta, "fasta")
 
181
  with open(aligned_with_query, "w") as output_file:
182
- subprocess.run([mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
 
 
 
183
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
184
  return False, "MAFFT alignment failed.", None, None
185
- subprocess.run([iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH, "-m", "GTR+G", "-pre", output_prefix, "-redo"], capture_output=True, text=True, timeout=1200, check=True)
 
 
 
 
 
186
  treefile = f"{output_prefix}.treefile"
187
  if not os.path.exists(treefile):
188
  return False, "IQ-TREE placement failed.", aligned_with_query, None
 
189
  success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
190
  return True, success_msg, aligned_with_query, treefile
 
191
  except Exception as e:
192
- logging.error(f"Phylogenetic placement failed: {e}", exc_info=True)
193
  return False, f"Error: {str(e)}", None, None
194
  finally:
195
  if 'query_fasta' in locals() and os.path.exists(query_fasta):
196
- os.unlink(query_fasta)
 
 
 
197
 
198
- def build_maximum_likelihood_tree(f_gene_sequence):
199
  try:
200
- mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
201
- status_msg = f"MAFFT: {'OK' if mafft_available else 'Missing'}\nIQ-TREE: {'OK' if iqtree_available else 'Missing'}\n"
202
- if not mafft_available or not iqtree_available:
203
- return False, f"{status_msg}\nInstall: conda install -c bioconda mafft iqtree", None, None
204
- if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
205
- return False, f"{status_msg}\nReference files missing.", None, None
206
- success, message, aligned_file, tree_file = phylogenetic_placement(f_gene_sequence, mafft_cmd, iqtree_cmd)
207
- if success:
208
- if aligned_file:
209
- shutil.copy2(aligned_file, "query_with_references_aligned.fasta")
210
- aligned_file = "query_with_references_aligned.fasta"
211
- if tree_file:
212
- shutil.copy2(tree_file, "query_placement_tree.treefile")
213
- tree_file = "query_placement_tree.treefile"
214
- return True, f"{status_msg}\n{message}", aligned_file, tree_file
215
- return False, f"{status_msg}\n{message}", aligned_file, tree_file
216
  except Exception as e:
217
- logging.error(f"ML tree construction failed: {e}", exc_info=True)
218
- return False, f"Error: {str(e)}", None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
 
220
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
221
  try:
222
  if not analyzer:
223
- return "Tree analyzer not initialized.", None, None
 
224
  if not sequence or len(sequence.strip()) < 10:
225
- return "Invalid sequence.", None, None
 
226
  if not (1 <= matching_percentage <= 99):
227
- return "Matching percentage must be 1-99.", None, None
 
228
  if not analyzer.find_query_sequence(sequence):
229
- return "Sequence not accepted.", None, None
 
230
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
231
  if not matched_ids:
232
- return f"No similar sequences at {matching_percentage}% threshold.", None, None
 
233
  analyzer.build_tree_structure_with_ml_safe(matched_ids)
234
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
 
235
  query_id = analyzer.query_id or f"query_{int(time.time())}"
236
  tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
237
  fig.write_html(tree_html_path)
 
238
  analyzer.matching_percentage = matching_percentage
239
  report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
240
  report_html_path = os.path.join("/tmp", f"detailed_report_{query_id}.html") if report_success else None
241
- return f"Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
242
- except Exception as e:
243
- logging.error(f"Tree analysis failed: {e}", exc_info=True)
244
- return f"Error: {str(e)}", None, None
245
-
246
- def predict_with_keras(sequence):
247
- try:
248
- if not keras_model or not kmer_to_index:
249
- return "Keras model not available."
250
- if len(sequence) < 6:
251
- return "Sequence too short (<6 bp)."
252
- kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
253
- indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
254
- input_arr = np.array([indices])
255
- prediction = keras_model.predict(input_arr, verbose=0)[0]
256
- f_gene_prob = prediction[-1]
257
- percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
258
- return f"{percentage}% F gene"
259
  except Exception as e:
260
- logging.error(f"Keras prediction failed: {e}", exc_info=True)
261
- return f"Error: {str(e)}"
262
 
263
  def read_fasta_file(file_obj):
264
  try:
265
  if file_obj is None:
266
  return ""
 
267
  if isinstance(file_obj, str):
268
  with open(file_obj, "r") as f:
269
  content = f.read()
270
  else:
271
  content = file_obj.read().decode("utf-8")
 
272
  lines = content.strip().split("\n")
273
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
274
  return ''.join(seq_lines)
 
275
  except Exception as e:
276
- logging.error(f"Failed to read FASTA file: {e}", exc_info=True)
277
  return ""
278
 
 
 
 
 
 
 
 
 
 
 
 
279
  # --- Pydantic Models ---
280
  class AnalysisRequest(BaseModel):
281
  sequence: str
@@ -291,107 +479,42 @@ class AnalysisResponse(BaseModel):
291
  success: bool
292
  error_message: Optional[str] = None
293
 
294
- # --- Pipeline Execution ---
295
- async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
296
- try:
297
- dna_input = read_fasta_file(fasta_file_obj)
298
- if not dna_input:
299
- return "Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input"
300
- return run_pipeline(dna_input, similarity_score, build_ml_tree)
301
- except Exception as e:
302
- logging.error(f"Pipeline from file error: {e}", exc_info=True)
303
- return f"Error: {str(e)}", "", "", "", "", None, None, None, None, f"Error: {str(e)}", f"Error: {str(e)}"
304
-
305
- def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
306
- try:
307
- dna_input = dna_input.upper().strip()
308
- if not dna_input:
309
- return "Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
310
- if not re.match('^[ACTGN]+$', dna_input):
311
- dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
312
- processed_sequence = dna_input
313
- boundary_output = ""
314
- if boundary_model:
315
- result = boundary_model.predict_sequence(dna_input)
316
- regions = result['gene_regions']
317
- if regions:
318
- processed_sequence = regions[0]["sequence"]
319
- boundary_output = processed_sequence
320
- else:
321
- boundary_output = "No F gene regions found."
322
- processed_sequence = dna_input
323
- else:
324
- boundary_output = f"Boundary model not available. Using input: {len(dna_input)} bp"
325
- keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "Sequence too short."
326
- aligned_file = None
327
- phy_file = None
328
- ml_tree_output = ""
329
- if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
330
- ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
331
- ml_tree_output = ml_message
332
- aligned_file = ml_aligned
333
- phy_file = ml_tree
334
- elif build_ml_tree:
335
- ml_tree_output = "Sequence too short for placement (<100 bp)."
336
- else:
337
- ml_tree_output = "Phylogenetic placement skipped."
338
- tree_html_file = None
339
- report_html_file = None
340
- tree_html_content = "No tree generated."
341
- report_html_content = "No report generated."
342
- simplified_ml_output = ""
343
- if analyzer and processed_sequence and len(processed_sequence) >= 10:
344
- tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
345
- if tree_html_path and os.path.exists(tree_html_path):
346
- output_dir = os.path.join("/tmp", "output")
347
- os.makedirs(output_dir, exist_ok=True)
348
- safe_seq_name = re.sub(r'[^a-zA-Z0-9_-]', '', processed_sequence[:20])
349
- timestamp = str(int(time.time()))
350
- tree_html_filename = f"tree_{safe_seq_name}_{timestamp}.html"
351
- tree_html_final_path = os.path.join(output_dir, tree_html_filename)
352
- shutil.copy2(tree_html_path, tree_html_final_path)
353
- tree_html_file = tree_html_final_path
354
- with open(tree_html_path, 'r', encoding='utf-8') as f:
355
- tree_html_content = f.read()
356
- os.unlink(tree_html_path)
357
- if report_html_path and os.path.exists(report_html_path):
358
- report_html_filename = f"report_{safe_seq_name}_{timestamp}.html"
359
- report_html_final_path = os.path.join(output_dir, report_html_filename)
360
- shutil.copy2(report_html_path, report_html_final_path)
361
- report_html_file = report_html_final_path
362
- with open(report_html_path, 'r', encoding='utf-8') as f:
363
- report_html_content = f.read()
364
- os.unlink(report_html_path)
365
- simplified_ml_output = tree_result
366
- if not tree_html_file:
367
- tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
368
- if not report_html_file:
369
- report_html_content = f"<div style='color: red;'>{tree_result}</div>"
370
- else:
371
- simplified_ml_output = "Tree analyzer not available." if not analyzer else "Sequence too short (<10 bp)."
372
- tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
373
- report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
374
- summary_output = f"""
375
- ANALYSIS SUMMARY:
376
- Input: {len(dna_input)} bp
377
- F Gene: {len(processed_sequence)} bp
378
- Validation: {keras_output}
379
- Placement: {'OK' if 'successfully' in ml_tree_output else 'Skipped' if 'skipped' in ml_tree_output else 'Failed'}
380
- Tree Analysis: {'OK' if 'Found' in simplified_ml_output else 'Failed'}
381
- Report: {'OK' if report_html_file else 'Failed'}
382
- """
383
- return (
384
- boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
385
- aligned_file, phy_file, tree_html_file, report_html_file, tree_html_content, report_html_content
386
- )
387
- except Exception as e:
388
- logging.error(f"Pipeline error: {e}", exc_info=True)
389
- return f"Error: {str(e)}", "", "", "", "", None, None, None, None, f"Error: {str(e)}", f"Error: {str(e)}"
390
-
391
  # --- FastAPI Endpoints ---
392
  @app.get("/")
393
  async def root():
394
- return {"message": "Gene Analysis Pipeline API", "docs": "/docs"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
  @app.post("/analyze", response_model=AnalysisResponse)
397
  async def analyze_sequence(request: AnalysisRequest):
@@ -406,21 +529,28 @@ async def analyze_sequence(request: AnalysisRequest):
406
  success=True
407
  )
408
  except Exception as e:
409
- logging.error(f"Analyze error: {e}", exc_info=True)
410
  return AnalysisResponse(
411
- boundary_output="", keras_output="", ml_tree_output="", tree_analysis_output="", summary_output="",
 
412
  success=False, error_message=str(e)
413
  )
414
 
415
  @app.post("/analyze-file")
416
- async def analyze_file(file: UploadFile = File(...), similarity_score: float = Form(95.0), build_ml_tree: bool = Form(False)):
 
 
 
 
 
417
  try:
418
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
419
  content = await file.read()
420
  temp_file.write(content)
421
  temp_file_path = temp_file.name
 
422
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
423
- os.unlink(temp_file_path)
424
  return AnalysisResponse(
425
  boundary_output=result[0] or "",
426
  keras_output=result[1] or "",
@@ -430,111 +560,246 @@ async def analyze_file(file: UploadFile = File(...), similarity_score: float = F
430
  success=True
431
  )
432
  except Exception as e:
433
- logging.error(f"Analyze-file error: {e}", exc_info=True)
434
- if 'temp_file_path' in locals():
435
- os.unlink(temp_file_path)
436
  return AnalysisResponse(
437
- boundary_output="", keras_output="", ml_tree_output="", tree_analysis_output="", summary_output="",
 
438
  success=False, error_message=str(e)
439
  )
 
 
 
 
 
 
440
 
441
- @app.get("/health")
442
- async def health_check():
443
  try:
444
- mafft_available, iqtree_available, _, _ = check_tool_availability()
445
- return {
446
- "status": "healthy",
447
- "boundary_model": boundary_model is not None,
448
- "keras_model": keras_model is not None,
449
- "tree_analyzer": analyzer is not None,
450
- "mafft_available": mafft_available,
451
- "iqtree_available": iqtree_available
452
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  except Exception as e:
454
- logging.error(f"Health check error: {e}", exc_info=True)
455
- return {"status": "unhealthy", "error": str(e)}
456
-
457
- # --- Gradio Interface ---
458
- def create_gradio_interface():
459
- with gr.Blocks(title="Gene Analysis Pipeline") as iface:
460
- gr.Markdown("## Gene Analysis Pipeline")
461
- with gr.Row():
462
- dna_input = gr.Textbox(label="DNA Sequence", placeholder="Enter DNA sequence...", lines=4)
463
- fasta_file = gr.File(label="Upload FASTA File", file_types=[".fasta", ".fa"])
464
- similarity_score = gr.Slider(minimum=70.0, maximum=99.0, value=95.0, label="Similarity (%)")
465
- build_ml_tree = gr.Checkbox(label="Phylogenetic Placement")
466
- with gr.Row():
467
- analyze_text_btn = gr.Button("Analyze Text")
468
- analyze_file_btn = gr.Button("Analyze File")
469
- with gr.Tabs():
470
- with gr.TabItem("F Gene"):
471
- f_gene_output = gr.Textbox(label="F Gene Sequence")
472
- with gr.TabItem("Validation"):
473
- keras_output = gr.Textbox(label="Validation Result")
474
- with gr.TabItem("Placement"):
475
- ml_tree_output = gr.Textbox(label="Phylogenetic Placement")
476
- with gr.TabItem("Tree"):
477
- tree_analysis_output = gr.Textbox(label="Tree Analysis")
478
- tree_html_display = gr.HTML(label="Interactive Tree")
479
- with gr.TabItem("Report"):
480
- report_html_display = gr.HTML(label="Report")
481
- with gr.TabItem("Summary"):
482
- summary_output = gr.Textbox(label="Summary")
483
- with gr.Row():
484
- alignment_file = gr.File(label="Alignment")
485
- tree_file = gr.File(label="Tree")
486
- html_tree_file = gr.File(label="Interactive Tree (HTML)")
487
- report_file = gr.File(label="Report (HTML)")
488
- analyze_text_btn.click(
489
- fn=run_pipeline,
490
- inputs=[dna_input, similarity_score, build_ml_tree],
491
- outputs=[f_gene_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
492
- alignment_file, tree_file, html_tree_file, report_file, tree_html_display, report_html_display]
493
- )
494
- analyze_file_btn.click(
495
- fn=run_pipeline_from_file,
496
- inputs=[fasta_file, similarity_score, build_ml_tree],
497
- outputs=[f_gene_output, keras_output, ml_tree_output, tree_analysis_output, summary_output,
498
- alignment_file, tree_file, html_tree_file, report_file, tree_html_display, report_html_display]
499
  )
500
- return iface
501
 
502
- # --- Mount Gradio ---
503
  try:
504
  gradio_app = create_gradio_interface()
505
  app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
506
- logging.info("Gradio mounted at /gradio")
507
  except Exception as e:
508
- logging.error(f"Gradio mounting failed: {e}", exc_info=True)
509
- @app.get("/gradio")
510
- async def gradio_fallback():
511
- health = await health_check()
512
- return HTMLResponse(f"""
513
- <h1>Gradio UI Failed</h1>
514
- <p>Error: Check /tmp/app.log for details.</p>
515
- <p>Health: {health}</p>
516
- <p>Try: <a href="/docs">API Docs</a> | <a href="/health">Health Check</a></p>
517
- """, status_code=503)
518
 
519
- # --- Main Execution ---
520
  if __name__ == "__main__":
521
  try:
522
- logging.info("Starting Gene Analysis Pipeline")
523
- logging.info(f"Boundary Model: {'OK' if boundary_model else 'Missing'}")
524
- logging.info(f"Keras Model: {'OK' if keras_model else 'Missing'}")
525
- logging.info(f"Tree Analyzer: {'OK' if analyzer else 'Missing'}")
526
- mafft_available, iqtree_available, _, _ = check_tool_availability()
527
- logging.info(f"MAFFT: {'OK' if mafft_available else 'Missing'}")
528
- logging.info(f"IQ-TREE: {'OK' if iqtree_available else 'Missing'}")
529
- logging.info("Starting server...")
530
- logging.info("API Docs: http://localhost:8000/docs")
531
- logging.info("Gradio UI: http://localhost:8000/gradio")
 
 
 
 
 
 
 
532
  uvicorn.run(
533
- app,
534
- host="0.0.0.0",
535
- port=8000,
536
- reload=False
 
537
  )
 
538
  except Exception as e:
539
- logging.error(f"Server startup failed: {e}", exc_info=True)
540
- sys.exit(1)
 
 
 
 
 
22
  import stat
23
  import time
24
  import asyncio
25
+
26
+ # FastAPI imports
27
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException
28
  from fastapi.responses import HTMLResponse
 
29
  from pydantic import BaseModel
30
  from typing import Optional
31
  import uvicorn
32
 
33
  # Set event loop policy for Spaces
34
+ try:
35
+ asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
36
+ except Exception:
37
+ pass
38
 
39
  # --- FastAPI App Setup ---
40
  app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
41
 
42
+ # --- Enhanced Logging ---
43
+ log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
44
+ log_handler = logging.StreamHandler()
45
+ log_handler.setFormatter(log_formatter)
46
+
47
+ # File handler with error handling
48
+ try:
49
+ file_handler = logging.FileHandler('/tmp/app.log')
50
+ file_handler.setFormatter(log_formatter)
51
+ logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
52
+ except Exception:
53
+ logging.basicConfig(level=logging.INFO, handlers=[log_handler])
 
 
 
 
 
 
54
 
55
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
56
 
57
  # --- Global Variables ---
58
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
68
  csv_path = os.path.join(BASE_DIR, "f_cleaned.csv")
69
  hf_token = os.getenv("HF_TOKEN")
70
 
71
+ # Initialize models as None
72
  boundary_model = None
73
  keras_model = None
74
  kmer_to_index = None
75
+ analyzer = None
 
 
 
 
 
 
 
 
76
 
77
+ # --- Enhanced Model Loading with Better Error Handling ---
78
+ def load_models_safely():
79
+ global boundary_model, keras_model, kmer_to_index, analyzer
80
+
81
+ # Load Boundary Model
82
+ try:
83
+ if hf_token:
84
+ logger.info("Attempting to load boundary model from Hugging Face...")
85
+ boundary_path = hf_hub_download(
86
+ repo_id=model_repo,
87
+ filename="best_boundary_aware_model.pth",
88
+ token=hf_token,
89
+ cache_dir="/tmp/hf_cache"
90
+ )
91
+ if os.path.exists(boundary_path):
92
+ boundary_model = EnhancedGenePredictor(boundary_path)
93
+ logger.info("✅ Boundary model loaded successfully")
94
+ else:
95
+ logger.warning("❌ Boundary model file not found after download")
96
+ else:
97
+ logger.warning("❌ HF_TOKEN not found, skipping boundary model")
98
+ except Exception as e:
99
+ logger.error(f"❌ Failed to load boundary model: {e}")
100
+ boundary_model = None
101
 
102
+ # Load Keras Model
103
+ try:
104
+ if hf_token:
105
+ logger.info("Attempting to load Keras model from Hugging Face...")
106
+ keras_path = hf_hub_download(
107
+ repo_id=model_repo,
108
+ filename="best_model.keras",
109
+ token=hf_token,
110
+ cache_dir="/tmp/hf_cache"
111
+ )
112
+ kmer_path = hf_hub_download(
113
+ repo_id=model_repo,
114
+ filename="kmer_to_index.pkl",
115
+ token=hf_token,
116
+ cache_dir="/tmp/hf_cache"
117
+ )
118
+
119
+ if os.path.exists(keras_path) and os.path.exists(kmer_path):
120
+ keras_model = load_model(keras_path)
121
+ with open(kmer_path, "rb") as f:
122
+ kmer_to_index = pickle.load(f)
123
+ logger.info("✅ Keras model loaded successfully")
124
+ else:
125
+ logger.warning("❌ Keras model files not found after download")
126
+ else:
127
+ logger.warning("❌ HF_TOKEN not found, skipping Keras model")
128
+ except Exception as e:
129
+ logger.error(f"❌ Failed to load Keras model: {e}")
130
+ keras_model = None
131
+ kmer_to_index = None
132
+
133
+ # Initialize Tree Analyzer
134
+ try:
135
+ logger.info("Initializing tree analyzer...")
136
+ analyzer = PhylogeneticTreeAnalyzer()
137
+ csv_candidates = [
138
+ csv_path,
139
+ os.path.join(BASE_DIR, "f cleaned.csv"),
140
+ "f_cleaned.csv",
141
+ os.path.join(BASE_DIR, "data", "f_cleaned.csv")
142
+ ]
143
+
144
+ csv_loaded = False
145
+ for csv_candidate in csv_candidates:
146
+ if os.path.exists(csv_candidate):
147
+ try:
148
+ if analyzer.load_data(csv_candidate):
149
+ logger.info(f"✅ Tree analyzer loaded from: {csv_candidate}")
150
+ csv_loaded = True
151
+ break
152
+ except Exception as e:
153
+ logger.warning(f"Failed to load CSV from {csv_candidate}: {e}")
154
+ continue
155
+
156
+ if not csv_loaded:
157
+ logger.error("❌ Failed to load CSV data from any location")
158
+ analyzer = None
159
+ except Exception as e:
160
+ logger.error(f"❌ Failed to initialize tree analyzer: {e}")
161
  analyzer = None
162
+
163
+ # Load models at startup
164
+ load_models_safely()
165
 
166
  # --- Tool Detection ---
167
  def setup_binary_permissions():
 
169
  if os.path.exists(binary):
170
  try:
171
  os.chmod(binary, os.stat(binary).st_mode | stat.S_IEXEC)
172
+ logger.info(f"Set executable permission on {binary}")
173
  except Exception as e:
174
+ logger.warning(f"Failed to set permission on {binary}: {e}")
175
 
176
  def check_tool_availability():
177
  setup_binary_permissions()
178
+
179
+ # Check MAFFT
180
  mafft_available = False
181
  mafft_cmd = None
182
  mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
183
+
184
  for candidate in mafft_candidates:
185
+ if shutil.which(candidate) or os.path.exists(candidate):
186
  try:
187
+ result = subprocess.run(
188
+ [candidate, "--help"],
189
+ capture_output=True,
190
+ text=True,
191
+ timeout=5
192
+ )
193
  if result.returncode == 0 or "mafft" in result.stderr.lower():
194
  mafft_available = True
195
  mafft_cmd = candidate
196
+ logger.info(f"MAFFT found at: {candidate}")
197
  break
198
  except Exception as e:
199
+ logger.debug(f"MAFFT test failed for {candidate}: {e}")
200
+
201
+ # Check IQ-TREE
202
  iqtree_available = False
203
  iqtree_cmd = None
204
  iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
205
+
206
  for candidate in iqtree_candidates:
207
+ if shutil.which(candidate) or os.path.exists(candidate):
208
  try:
209
+ result = subprocess.run(
210
+ [candidate, "--help"],
211
+ capture_output=True,
212
+ text=True,
213
+ timeout=5
214
+ )
215
  if result.returncode == 0 or "iqtree" in result.stderr.lower():
216
  iqtree_available = True
217
  iqtree_cmd = candidate
218
+ logger.info(f"IQ-TREE found at: {candidate}")
219
  break
220
  except Exception as e:
221
+ logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
222
+
223
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
224
 
225
+ # --- Pipeline Functions (keeping your original logic) ---
def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
    """Place a query sequence on the reference tree using MAFFT and IQ-TREE.

    The query is written to a FASTA file in ``QUERY_OUTPUT_DIR``, added to the
    reference alignment with ``mafft --add --reorder`` and the combined
    alignment is passed to IQ-TREE together with the reference tree (``-g``)
    under the ``GTR+G`` model.

    Args:
        sequence: Query nucleotide sequence. All whitespace is removed and
            the sequence is upper-cased before it is used.
        mafft_cmd: MAFFT executable (name or path).
        iqtree_cmd: IQ-TREE executable (name or path).

    Returns:
        A 4-tuple ``(success, message, aligned_path, treefile_path)``.
        ``aligned_path`` is set once MAFFT has produced output and
        ``treefile_path`` only when IQ-TREE wrote its ``.treefile``.
        Failures are reported through ``success=False`` and ``message``;
        this function does not raise.
    """
    query_fasta = None
    try:
        # Validate and write the *same* normalised string. Previously the
        # length check used the stripped sequence while the raw one (with any
        # embedded newlines/spaces) ended up in the FASTA record.
        cleaned = "".join((sequence or "").split()).upper()
        if len(cleaned) < 100:
            return False, "Sequence too short (<100 bp).", None, None

        if not (os.path.exists(ALIGNMENT_PATH) and os.path.exists(TREE_PATH)):
            return False, "Reference alignment or tree not found.", None, None

        query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
        query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
        aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
        output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")

        query_record = SeqRecord(Seq(cleaned), id=query_id, description="")
        SeqIO.write([query_record], query_fasta, "fasta")

        # MAFFT prints the merged alignment on stdout, so stream it to a file.
        with open(aligned_with_query, "w") as output_file:
            subprocess.run(
                [mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH],
                stdout=output_file,
                stderr=subprocess.PIPE,
                text=True,
                timeout=600,
                check=True,
            )

        if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
            return False, "MAFFT alignment failed.", None, None

        subprocess.run(
            [
                iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
                "-m", "GTR+G", "-pre", output_prefix, "-redo",
            ],
            capture_output=True,
            text=True,
            timeout=1200,
            check=True,
        )

        treefile = f"{output_prefix}.treefile"
        if not os.path.exists(treefile):
            return False, "IQ-TREE placement failed.", aligned_with_query, None

        success_msg = (
            f"Placement completed!\nQuery ID: {query_id}\n"
            f"Alignment: {os.path.basename(aligned_with_query)}\n"
            f"Tree: {os.path.basename(treefile)}"
        )
        return True, success_msg, aligned_with_query, treefile

    except subprocess.CalledProcessError as e:
        # str(e) only carries the exit status; append the end of the tool's
        # stderr so the caller can see why MAFFT / IQ-TREE failed.
        stderr_tail = (e.stderr or "").strip()[-500:]
        logger.error(f"Phylogenetic placement failed: {e}; stderr: {stderr_tail}")
        detail = f"\n{stderr_tail}" if stderr_tail else ""
        return False, f"Error: {str(e)}{detail}", None, None
    except Exception as e:
        logger.exception(f"Phylogenetic placement failed: {e}")
        return False, f"Error: {str(e)}", None, None
    finally:
        # Best-effort removal of the temporary query FASTA.
        if query_fasta and os.path.exists(query_fasta):
            try:
                os.unlink(query_fasta)
            except OSError as cleanup_error:
                logger.debug(f"Could not remove {query_fasta}: {cleanup_error}")
271
 
def predict_with_keras(sequence):
    """Score a sequence with the Keras model and return a confidence string.

    The sequence is split into overlapping 6-mers, each 6-mer is mapped to an
    integer through ``kmer_to_index`` (unknown 6-mers map to ``0``) and the
    resulting index array is passed to ``keras_model.predict``.

    Args:
        sequence: Nucleotide sequence. Whitespace is removed and the string is
            upper-cased before k-mers are extracted.

    Returns:
        A human-readable string: ``"<n>% F gene confidence"`` on success, or
        a message describing why no prediction was made. Never raises.
    """
    try:
        if not keras_model or not kmer_to_index:
            return " Keras model not available."

        # Normalise before the k-mer lookup; otherwise lowercase or wrapped
        # input silently maps every k-mer to index 0.
        # NOTE(review): assumes the vocabulary keys are upper-case, as the
        # pipeline already upper-cases its input -- confirm.
        sequence = "".join((sequence or "").split()).upper()
        if len(sequence) < 6:
            return "❌ Sequence too short (<6 bp)."

        kmers = [sequence[i:i + 6] for i in range(len(sequence) - 5)]
        indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
        input_arr = np.array([indices])

        prediction = keras_model.predict(input_arr, verbose=0)[0]
        f_gene_prob = prediction[-1]
        # NOTE(review): "+ 5" adds five percentage points before truncation
        # and clamping; kept as-is, but confirm it is intentional and not a
        # rounding offset that should have been 0.5.
        percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))

        return f"{percentage}% F gene confidence"
    except Exception as e:
        logger.exception(f"Keras prediction failed: {e}")
        return f"Error: {str(e)}"
292
+
def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
    """Run the complete analysis pipeline on a raw DNA string.

    Stages, in order: input normalisation, boundary prediction, Keras
    validation, optional phylogenetic placement, tree analysis, summary.

    Args:
        dna_input: Raw nucleotide text. Whitespace is removed, the text is
            upper-cased and every character outside ``ACTGN`` becomes ``N``.
        similarity_score: Threshold forwarded to ``analyze_sequence_for_tree``.
        build_ml_tree: When true, attempt MAFFT/IQ-TREE placement.

    Returns:
        An 11-tuple ``(boundary_output, keras_output, ml_tree_output,
        tree_analysis_output, summary_output, aligned_file, tree_file, None,
        None, tree_html, report_html)``. Errors are reported inside the tuple;
        the function does not raise.
    """
    try:
        # Strip *all* whitespace, not just the ends: sequences pasted with
        # line wraps would otherwise have each newline turned into an "N".
        dna_input = "".join((dna_input or "").upper().split())
        if not dna_input:
            return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"

        # Replace anything that is not a recognised base with N.
        if not re.fullmatch(r"[ACTGN]+", dna_input):
            dna_input = "".join(c if c in "ACTGN" else "N" for c in dna_input)

        processed_sequence = dna_input

        # --- Boundary prediction -------------------------------------------
        if boundary_model:
            try:
                result = boundary_model.predict_sequence(dna_input)
                regions = result['gene_regions']
                if regions:
                    processed_sequence = regions[0]["sequence"]
                    boundary_output = f"✅ F gene region found: {len(processed_sequence)} bp"
                else:
                    boundary_output = "⚠️ No F gene regions found."
                    processed_sequence = dna_input
            except Exception as e:
                boundary_output = f"❌ Boundary prediction error: {str(e)}"
                processed_sequence = dna_input
        else:
            boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"

        # --- Keras validation ----------------------------------------------
        if processed_sequence and len(processed_sequence) >= 6:
            keras_output = predict_with_keras(processed_sequence)
        else:
            keras_output = "❌ Sequence too short."

        # --- Phylogenetic placement ----------------------------------------
        aligned_file = None
        phy_file = None
        # Track the outcome explicitly. The summary used to search the message
        # for "✅", which the success message never contains, so successful
        # placements were reported as failed.
        placement_status = "❌ Failed"

        if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
            try:
                mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
                if mafft_available and iqtree_available:
                    ml_success, ml_tree_output, aligned_file, phy_file = phylogenetic_placement(
                        processed_sequence, mafft_cmd, iqtree_cmd
                    )
                    if ml_success:
                        placement_status = "✅ OK"
                else:
                    ml_tree_output = "❌ MAFFT or IQ-TREE not available"
            except Exception as e:
                ml_tree_output = f"❌ ML tree error: {str(e)}"
        elif build_ml_tree:
            ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
        else:
            ml_tree_output = "⚠️ Phylogenetic placement skipped."
            placement_status = "⚠️ Skipped"

        # --- Tree analysis ---------------------------------------------------
        tree_html_content = "No tree generated."
        report_html_content = "No report generated."

        if analyzer and processed_sequence and len(processed_sequence) >= 10:
            try:
                tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(
                    processed_sequence, similarity_score
                )
                simplified_ml_output = tree_result

                if tree_html_path and os.path.exists(tree_html_path):
                    with open(tree_html_path, 'r', encoding='utf-8') as f:
                        tree_html_content = f.read()
                else:
                    tree_html_content = f"<div style='color: red;'>{tree_result}</div>"

                if report_html_path and os.path.exists(report_html_path):
                    with open(report_html_path, 'r', encoding='utf-8') as f:
                        report_html_content = f.read()
                else:
                    report_html_content = f"<div style='color: red;'>{tree_result}</div>"
            except Exception as e:
                simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
                tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
                report_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
        else:
            simplified_ml_output = (
                "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
            )
            tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
            report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"

        # --- Summary ---------------------------------------------------------
        validation = keras_output.split(':')[-1].strip() if ':' in keras_output else keras_output
        tree_status = '✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'
        summary_output = "\n".join([
            "",
            "📊 ANALYSIS SUMMARY:",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
            f"Input: {len(dna_input)} bp",
            f"F Gene: {len(processed_sequence)} bp",
            f"Validation: {validation}",
            f"Placement: {placement_status}",
            f"Tree Analysis: {tree_status}",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
            "",
        ])

        return (
            boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
            aligned_file, phy_file, None, None, tree_html_content, report_html_content
        )

    except Exception as e:
        logger.exception(f"Pipeline error: {e}")
        error_msg = f"❌ Pipeline Error: {str(e)}"
        return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
400
 
401
+ # Keep your other functions (analyze_sequence_for_tree, build_maximum_likelihood_tree, etc.)
def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
    """Build the interactive similarity tree and report for a query sequence.

    Uses the module-level ``analyzer`` to register the query, find similar
    sequences at the requested threshold, build the tree and write the
    interactive tree HTML to ``/tmp``.

    Args:
        sequence: Query nucleotide sequence (leading/trailing whitespace is
            removed before validation and before it is given to the analyzer).
        matching_percentage: Similarity threshold; must be within 1-99.

    Returns:
        A 3-tuple ``(message, tree_html_path, report_html_path)``. Both paths
        are ``None`` when the analysis did not run to completion;
        ``report_html_path`` is also ``None`` when the analyzer reports that
        no detailed report was generated. Never raises.
    """
    try:
        if not analyzer:
            return "Tree analyzer not initialized.", None, None

        # Validate and forward the same (stripped) value; previously only the
        # length check saw the stripped sequence.
        sequence = (sequence or "").strip()
        if len(sequence) < 10:
            return "Invalid sequence.", None, None

        if not (1 <= matching_percentage <= 99):
            return "Matching percentage must be 1-99.", None, None

        if not analyzer.find_query_sequence(sequence):
            return "Sequence not accepted.", None, None

        matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
        if not matched_ids:
            return f"No similar sequences at {matching_percentage}% threshold.", None, None

        analyzer.build_tree_structure_with_ml_safe(matched_ids)
        fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)

        # Fall back to a timestamp when the analyzer did not assign an id.
        query_id = analyzer.query_id or f"query_{int(time.time())}"
        tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
        fig.write_html(tree_html_path)

        analyzer.matching_percentage = matching_percentage
        report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
        # NOTE(review): presumes generate_detailed_report writes to this exact
        # path -- confirm against the analyzer implementation.
        report_html_path = (
            os.path.join("/tmp", f"detailed_report_{query_id}.html") if report_success else None
        )

        return (
            f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.",
            tree_html_path,
            report_html_path,
        )

    except Exception as e:
        logger.exception(f"Tree analysis failed: {e}")
        return f"Error: {str(e)}", None, None
436
 
def read_fasta_file(file_obj):
    """Return the sequence data of a FASTA source as one concatenated string.

    Header lines (starting with ``>``) are dropped and all remaining lines are
    stripped and joined, so a multi-record file yields the concatenation of
    every record.

    Args:
        file_obj: ``None``, a filesystem path (``str`` or ``os.PathLike``), or
            an object with a ``read()`` method returning ``bytes`` or ``str``.

    Returns:
        The joined sequence text, or ``""`` when ``file_obj`` is ``None`` or
        reading fails (the failure is logged).
    """
    try:
        if file_obj is None:
            return ""

        if isinstance(file_obj, (str, os.PathLike)):
            # utf-8-sig transparently drops a leading BOM, which would
            # otherwise hide the first ">" header marker.
            with open(file_obj, "r", encoding="utf-8-sig") as f:
                content = f.read()
        else:
            content = file_obj.read()
            # Binary handles return bytes; text handles already return str.
            if isinstance(content, (bytes, bytearray)):
                content = bytes(content).decode("utf-8-sig")
            else:
                content = content.lstrip("\ufeff")

        stripped_lines = (line.strip() for line in content.splitlines())
        return "".join(
            line for line in stripped_lines if line and not line.startswith(">")
        )

    except Exception as e:
        logger.error(f"Failed to read FASTA file: {e}")
        return ""
455
 
async def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
    """Read a FASTA source and run the analysis pipeline on its sequence.

    Args:
        fasta_file_obj: Path or file-like object accepted by
            ``read_fasta_file``.
        similarity_score: Threshold forwarded to ``run_pipeline``.
        build_ml_tree: Whether ``run_pipeline`` should attempt placement.

    Returns:
        The same 11-tuple that ``run_pipeline`` returns. When the file yields
        no sequence, or an unexpected error occurs, a tuple of the same shape
        carrying the error text is returned instead. Never raises.
    """
    try:
        dna_input = read_fasta_file(fasta_file_obj)
        if not dna_input:
            return "❌ Failed to read FASTA file", "", "", "", "", None, None, None, None, "No input", "No input"

        # run_pipeline is synchronous and may run external tools for many
        # minutes; run it in the default executor so the event loop that
        # awaits this coroutine stays responsive.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, run_pipeline, dna_input, similarity_score, build_ml_tree
        )
    except Exception as e:
        logger.exception(f"Pipeline from file error: {e}")
        error_msg = f"❌ Error: {str(e)}"
        return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
466
+
467
  # --- Pydantic Models ---
468
  class AnalysisRequest(BaseModel):
469
  sequence: str
 
479
  success: bool
480
  error_message: Optional[str] = None
481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  # --- FastAPI Endpoints ---
@app.get("/")
async def root():
    """Return a small service descriptor listing the main routes."""
    route_map = dict(
        docs="/docs",
        health="/health",
        gradio="/gradio",
        analyze="/analyze",
        analyze_file="/analyze-file",
    )
    descriptor = {"message": "🧬 Gene Analysis Pipeline API", "status": "running"}
    descriptor["endpoints"] = route_map
    return descriptor
496
+
@app.get("/health")
async def health_check():
    """Report which models and external tools are currently available.

    The external tools are probed on every call through
    ``check_tool_availability``. Any exception is logged and turned into an
    ``{"status": "unhealthy", ...}`` payload rather than propagated.
    """
    try:
        has_mafft, has_iqtree, _mafft_cmd, _iqtree_cmd = check_tool_availability()

        components = {
            "boundary_model": boundary_model is not None,
            "keras_model": keras_model is not None,
            "tree_analyzer": analyzer is not None,
            "mafft_available": has_mafft,
            "iqtree_available": has_iqtree,
        }

        if hf_token:
            token_advice = "OK"
        else:
            token_advice = "Set HF_TOKEN environment variable"

        if has_mafft and has_iqtree:
            tools_advice = "OK"
        else:
            tools_advice = "Install MAFFT and IQ-TREE"

        return {
            "status": "healthy",
            "components": components,
            "recommendations": {
                "hf_token": token_advice,
                "bioinformatics_tools": tools_advice,
            },
        }
    except Exception as e:
        logger.error(f"Health check error: {e}")
        return {"status": "unhealthy", "error": str(e)}
518
 
519
  @app.post("/analyze", response_model=AnalysisResponse)
520
  async def analyze_sequence(request: AnalysisRequest):
 
529
  success=True
530
  )
531
  except Exception as e:
532
+ logger.error(f"Analyze error: {e}")
533
  return AnalysisResponse(
534
+ boundary_output="", keras_output="", ml_tree_output="",
535
+ tree_analysis_output="", summary_output="",
536
  success=False, error_message=str(e)
537
  )
538
 
539
  @app.post("/analyze-file")
540
+ async def analyze_file(
541
+ file: UploadFile = File(...),
542
+ similarity_score: float = Form(95.0),
543
+ build_ml_tree: bool = Form(False)
544
+ ):
545
+ temp_file_path = None
546
  try:
547
  with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", dir="/tmp") as temp_file:
548
  content = await file.read()
549
  temp_file.write(content)
550
  temp_file_path = temp_file.name
551
+
552
  result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
553
+
554
  return AnalysisResponse(
555
  boundary_output=result[0] or "",
556
  keras_output=result[1] or "",
 
560
  success=True
561
  )
562
  except Exception as e:
563
+ logger.error(f"Analyze-file error: {e}")
 
 
564
  return AnalysisResponse(
565
+ boundary_output="", keras_output="", ml_tree_output="",
566
+ tree_analysis_output="", summary_output="",
567
  success=False, error_message=str(e)
568
  )
569
+ finally:
570
+ if temp_file_path and os.path.exists(temp_file_path):
571
+ try:
572
+ os.unlink(temp_file_path)
573
+ except:
574
+ pass
575
 
576
+ # --- Enhanced Gradio Interface ---
# Custom CSS injected into the Gradio page.
_GRADIO_CSS = """
.gradio-container { max-width: 1200px !important; }
.status-box { padding: 10px; border-radius: 5px; margin: 5px 0; }
.success { background-color: #d4edda; border: 1px solid #c3e6cb; color: #155724; }
.warning { background-color: #fff3cd; border: 1px solid #ffeaa7; color: #856404; }
.error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
"""

# Markdown shown below the results with sample inputs and usage tips.
_GRADIO_EXAMPLES_MD = """
## 💡 Examples

Try these sample sequences:

**Short F Gene Sequence:**
```
ATGGAGTTGCCACACCATCACAGAGGCCTCGAGATGCCAAGTCGTTAACC
```

**Medium Length Sequence:**
```
ATGGAGTTGCCACACCATCACAGAGGCCTCGAGATGCCAAGTCGTTAACCCTACTAAGCTCCCTGTCTGACATACTTGATGTGGAGGCTATAGATATTATCAATCAAGCAGTGACCATTCTGAAGATGAATGGACCCAACACCACCTACATATACCCTGACAAACTGGAAAATCTGGCAATGCTGACATTGGATGAACAACTTGAGAGGGTGATGATTATCAATGCCACCATCCAAGAGACAGATAATAATTACAACAACATTATTAGAAAATACACAAGCAATGATGACCTTGAACAAGATGAAGAGATGAAACGGAAAATACCAGAGGAAAAGACTAAGGGATCCGGATTGATCCACAACATGAAGAGGAAGAAGCACTACGACCTGACCATGACCATGAAAAAGCACGAGACACTAACCATGAACACCTTGACAATGATCATGACTTTGGACATGCAAGAGGCCAAATTGAAGGACTTGATGACTACAACCAACACCACATCCGTGGCCACCTCAAGGAAGTCTTTGACACACAAGCGCAACGCCAAGCTGACCATGACCTACATCCAAGCCAACACGGTGAACACCGTGGACATGATGAAGAACACAACATCCAAGGACACAGACAAGATGATGAAGAACACAATGACCTCCTACAACACCATGACCACAATGATGAACACCGTGACAATGATGAAGAACACCATCTCCAAGAACACAAGGAAGATGAAAAACACAACGATCCACAATGCCATGAACATGATGAACCCTCTGACAAACCTGAACAATATTATCAAGAACACAAACATGAACAACCTGGACAAGCTGATGAACACCATCTCCAAGAACACAAGGAAGATGAAAAACACAACGATCCACAATGCCATGAACATGATGAACCCTCTGACAAACCTGAACAATATTATCAAGAACACAAACATGAACAACCTGGACAAGCTGATGAACACCATCTCCAAGAACACAAGGAAGATG
```

**Tips:**
- Use sequences at least 100 bp for phylogenetic placement
- Higher similarity thresholds (95-99%) provide more specific results
- Phylogenetic placement is computationally intensive
"""


def _system_status_html():
    """Render the component-availability panel shown at the top of the UI."""
    # Probe the external tools once rather than once per status line.
    mafft_ok, iqtree_ok = check_tool_availability()[:2]
    return f"""
<div class="status-box">
<h3>🔧 System Status</h3>
<p>🤖 Boundary Model: {'✅ Loaded' if boundary_model else '❌ Missing'}</p>
<p>🧠 Keras Model: {'✅ Loaded' if keras_model else '❌ Missing'}</p>
<p>🌳 Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Missing'}</p>
<p>🧬 MAFFT: {'✅ Available' if mafft_ok else '❌ Missing'}</p>
<p>🌲 IQ-TREE: {'✅ Available' if iqtree_ok else '❌ Missing'}</p>
</div>
"""


def create_gradio_interface():
    """Build the Gradio UI for the pipeline.

    Returns:
        A ``gr.Blocks`` app wired to ``run_pipeline`` /
        ``run_pipeline_from_file``. If construction fails, the error is logged
        and a minimal ``gr.Interface`` that only reports the error text is
        returned instead, so the function does not raise.
    """
    try:
        with gr.Blocks(
            title="🧬 Gene Analysis Pipeline",
            theme=gr.themes.Soft(),
            css=_GRADIO_CSS,
        ) as iface:

            gr.Markdown("# 🧬 Gene Analysis Pipeline")

            # Status display (evaluated once, when the interface is built).
            with gr.Row():
                with gr.Column():
                    gr.HTML(value=_system_status_html())

            # Input section
            with gr.Row():
                with gr.Column(scale=2):
                    dna_input = gr.Textbox(
                        label="🧬 DNA Sequence",
                        placeholder="Enter DNA sequence (ATCG)...",
                        lines=4,
                        info="Paste your DNA sequence here",
                    )
                with gr.Column(scale=1):
                    # NOTE(review): confirm gr.File accepts `info` in the
                    # pinned Gradio version.
                    fasta_file = gr.File(
                        label="📁 Upload FASTA File",
                        file_types=[".fasta", ".fa", ".txt"],
                        info="Or upload a FASTA file",
                    )

            # Parameters
            with gr.Row():
                similarity_score = gr.Slider(
                    minimum=70.0,
                    maximum=99.0,
                    value=95.0,
                    label="🎯 Similarity Threshold (%)",
                    info="Minimum similarity for phylogenetic analysis",
                )
                build_ml_tree = gr.Checkbox(
                    label="🌲 Enable Phylogenetic Placement",
                    value=False,
                    info="Computationally intensive",
                )

            # Action buttons
            with gr.Row():
                analyze_text_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
                analyze_file_btn = gr.Button("📁 Analyze File", variant="secondary")
                clear_btn = gr.Button("🗑️ Clear", variant="stop")

            # Results section
            gr.Markdown("## 📊 Results")

            with gr.Tabs():
                with gr.TabItem("📋 Summary"):
                    summary_output = gr.Textbox(label="Analysis Summary", lines=8)

                with gr.TabItem("🎯 F Gene Detection"):
                    f_gene_output = gr.Textbox(label="F Gene Sequence Detection")
                    keras_output = gr.Textbox(label="Validation Result")

                with gr.TabItem("🌲 Phylogenetic Analysis"):
                    ml_tree_output = gr.Textbox(label="Phylogenetic Placement")
                    tree_analysis_output = gr.Textbox(label="Tree Analysis")

                with gr.TabItem("🌳 Interactive Tree"):
                    tree_html_display = gr.HTML(label="Interactive Tree Visualization")

                with gr.TabItem("📄 Detailed Report"):
                    report_html_display = gr.HTML(label="Analysis Report")

                with gr.TabItem("📁 Download Files"):
                    # NOTE(review): these start hidden and nothing in this
                    # function toggles their visibility -- confirm intent.
                    with gr.Row():
                        aligned_file_output = gr.File(label="Aligned Sequences", visible=False)
                        tree_file_output = gr.File(label="Phylogenetic Tree", visible=False)
                        custom_file_1 = gr.File(label="Additional Output 1", visible=False)
                        custom_file_2 = gr.File(label="Additional Output 2", visible=False)

            # Components, in the order of the 11-tuple returned by the pipeline.
            result_outputs = [
                f_gene_output, keras_output, ml_tree_output, tree_analysis_output,
                summary_output, aligned_file_output, tree_file_output,
                custom_file_1, custom_file_2, tree_html_display, report_html_display,
            ]

            # Event handlers
            def run_analysis_text(dna_input_val, similarity_val, build_ml_val):
                # The textbox value may be None as well as empty.
                if not dna_input_val or not dna_input_val.strip():
                    return "❌ Please enter a DNA sequence", "", "", "", "", None, None, None, None, "", ""
                return run_pipeline(dna_input_val, similarity_val, build_ml_val)

            def run_analysis_file(file_obj, similarity_val, build_ml_val):
                if file_obj is None:
                    return "❌ Please upload a file", "", "", "", "", None, None, None, None, "", ""
                try:
                    # asyncio.run creates a fresh loop and cleans it up
                    # afterwards, so no closed loop is left installed as the
                    # worker thread's current event loop.
                    import asyncio
                    return asyncio.run(
                        run_pipeline_from_file(file_obj, similarity_val, build_ml_val)
                    )
                except Exception as e:
                    error_msg = f"❌ Error processing file: {str(e)}"
                    return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg

            def clear_all():
                return ("", None, 95.0, False, "", "", "", "", "", None, None, None, None, "", "")

            # Wire up the interface
            analyze_text_btn.click(
                fn=run_analysis_text,
                inputs=[dna_input, similarity_score, build_ml_tree],
                outputs=result_outputs,
            )

            analyze_file_btn.click(
                fn=run_analysis_file,
                inputs=[fasta_file, similarity_score, build_ml_tree],
                outputs=result_outputs,
            )

            clear_btn.click(
                fn=clear_all,
                outputs=[dna_input, fasta_file, similarity_score, build_ml_tree] + result_outputs,
            )

            # Example section
            gr.Markdown(_GRADIO_EXAMPLES_MD)

        return iface

    except Exception as e:
        logger.exception(f"Failed to create Gradio interface: {e}")
        # Capture the text now: Python unbinds `e` when the except clause
        # ends, so a lambda closing over `e` would raise NameError when the
        # fallback interface is actually used.
        error_text = f"Error creating interface: {e}"
        # Fallback minimal interface
        return gr.Interface(
            fn=lambda x: error_text,
            inputs=gr.Textbox(label="Input"),
            outputs=gr.Textbox(label="Error"),
            title="Gene Analysis Pipeline - Error",
        )
 
762
 
763
+ # --- Mount Gradio App ---
# Build the Gradio UI and serve it under /gradio on the FastAPI app. A failure
# here is logged (with traceback) but does not stop the module from loading,
# so the REST endpoints remain defined.
try:
    gradio_app = create_gradio_interface()
    app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
except Exception as e:
    logger.exception(f" Failed to mount Gradio interface: {e}")
else:
    logger.info("Gradio interface mounted successfully")
 
 
 
 
 
 
 
 
 
770
 
771
+ # --- Main Function ---
if __name__ == "__main__":
    try:
        # Startup banner: one line per component, printed before the tool probe.
        hf_state = '✅ Set' if hf_token else 'Missing'
        boundary_state = '✅ Loaded' if boundary_model else '❌ Missing'
        keras_state = '✅ Loaded' if keras_model else 'Missing'
        analyzer_state = '✅ Loaded' if analyzer else 'Missing'
        for banner_line in (
            "🧬 Gene Analysis Pipeline Starting...",
            f"📍 Working Directory: {BASE_DIR}",
            f"🔑 HF Token: {hf_state}",
            f"🤖 Boundary Model: {boundary_state}",
            f"🧠 Keras Model: {keras_state}",
            f"🌳 Tree Analyzer: {analyzer_state}",
        ):
            print(banner_line)

        # External tool status, including the resolved command when found.
        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
        if mafft_available:
            print(f"🧬 MAFFT: ✅ Available - {mafft_cmd}")
        else:
            print("🧬 MAFFT: ❌ Missing - Not found")
        if iqtree_available:
            print(f"🌲 IQ-TREE: ✅ Available - {iqtree_cmd}")
        else:
            print("🌲 IQ-TREE: ❌ Missing - Not found")

        print("\n🚀 Starting server...")
        print("📱 FastAPI docs: http://localhost:7860/docs")
        print("🎨 Gradio interface: http://localhost:7860/gradio")

        # Blocks until the server exits.
        server_options = dict(host="0.0.0.0", port=7860, log_level="info", access_log=True)
        uvicorn.run(app, **server_options)

    except KeyboardInterrupt:
        print("\n👋 Server stopped by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f" Failed to start server: {e}")
        print(f"❌ Server startup failed: {e}")
        sys.exit(1)