re-type committed on
Commit
74167c4
·
verified ·
1 Parent(s): e856e28

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +713 -407
app.py CHANGED
@@ -1,833 +1,1139 @@
1
- # app.py
2
- import gradio as gr
3
- import torch
4
- import pickle
5
- import subprocess
6
- import pandas as pd
7
- import os
8
- import re
9
- import logging
10
- import numpy as np
11
- from predictor import GenePredictor # Kept for potential future use, but not loaded
12
- from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
14
  import tempfile
15
  import shutil
16
- import stat
17
  from pathlib import Path
18
- from huggingface_hub import hf_hub_download
19
- from tensorflow.keras.preprocessing.sequence import pad_sequences
20
 
# --- Global Variables ---
# Default locations of the external alignment / tree-inference executables.
MAFFT_PATH = "mafft/mafftdir/bin/mafft"  # Update this path as needed
IQTREE_PATH = "iqtree/bin/iqtree2"  # Update this path as needed

# --- Logging ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- Paths ---
model_repo = "GGproject10/best_boundary_aware_model"
csv_path = "f cleaned.csv"
classifier_model_dir = "model"  # Directory for second model files

# Get HF token from environment (if available); None when the variable is unset.
hf_token = os.environ.get("HF_TOKEN")

# --- Load Models ---
# Every model handle starts out as None; the loading code further down fills
# in whatever could actually be loaded.
boundary_model = None  # Disabled as per request
keras_model = None
kmer_to_index = None
classifier_model = None
classifier_kmer_to_index = None
classifier_maxlen = None

# Note: Boundary Model is disabled as per user request
logging.info("Boundary Model is currently disabled. Input will be used directly for verification and tree analysis.")
 
 
 
 
 
 
 
 
 
 
 
46
 
# Try to load Keras model from Hugging Face Hub.
# The model and its k-mer vocabulary are loaded into temporaries and only
# published to the module globals together. Previously `keras_model` was set
# before the pickle was read, so a failing vocabulary load left a model
# without its index (and the UI reporting the model as available).
try:
    keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras", token=hf_token)
    kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl", token=hf_token)
    if os.path.exists(keras_path) and os.path.exists(kmer_path):
        _loaded_keras_model = load_model(keras_path)
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # this is only safe while `model_repo` is a trusted repository.
        with open(kmer_path, "rb") as f:
            _loaded_kmer_to_index = pickle.load(f)
        keras_model, kmer_to_index = _loaded_keras_model, _loaded_kmer_to_index
        logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
    else:
        logging.warning("Keras model or kmer files not found after download")
except Exception as e:
    # Best effort: the app keeps running without the validation model.
    logging.error(f"Failed to load Keras model from HF Hub: {e}")
60
 
# Try to load classifier model (second model)
# NOTE(review): TensorFlow is already imported at the top of this file, so
# setting TF_CPP_MIN_LOG_LEVEL here is probably too late to silence its
# start-up logging; it would have to be set before the first TF import.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
try:
    classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
    classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
    classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
    missing_files = [
        os.path.basename(required_path)
        for required_path in (classifier_path, classifier_kmer_path, classifier_maxlen_path)
        if not os.path.exists(required_path)
    ]
    if missing_files:
        logging.warning(f"Classifier model files not found: {', '.join(missing_files)}")
    else:
        # Load all three artefacts into temporaries first so the globals are
        # only updated when every piece loaded; a failure half-way no longer
        # leaves `classifier_model` set without its vocabulary / max length.
        _loaded_classifier = load_model(classifier_path)
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only ship trusted files in `classifier_model_dir`.
        with open(classifier_kmer_path, "rb") as f:
            _loaded_classifier_index = pickle.load(f)
        with open(classifier_maxlen_path, "r") as f:
            _loaded_classifier_maxlen = int(f.read().strip())
        classifier_model = _loaded_classifier
        classifier_kmer_to_index = _loaded_classifier_index
        classifier_maxlen = _loaded_classifier_maxlen
        logging.info("Classifier model loaded successfully.")
except Exception as e:
    logging.error(f"Failed to load classifier model: {e}")
    logging.warning("Falling back to existing Keras model for validation.")

# Class names indexed by the classifier's arg-max output position.
LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
# --- Initialize Tree Analyzer ---
# `analyzer` ends up as a ready PhylogeneticTreeAnalyzer, or None when the CSV
# is missing, cannot be loaded, or construction raises.
analyzer = None
try:
    analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
    if not os.path.exists(csv_path):
        logging.error(f"CSV file not found: {csv_path}")
        analyzer = None
    elif not analyzer.load_data(csv_path):
        logging.error("Failed to load CSV data for tree analyzer")
        analyzer = None
    else:
        logging.info("Tree analyzer initialized successfully")
        # Training is optional: a failure is logged and the analyzer is kept.
        try:
            training_ok = analyzer.train_ai_model()
            if not training_ok:
                logging.warning("AI model training failed; proceeding with basic analysis.")
        except Exception as e:
            logging.warning(f"AI model training failed: {e}")
except Exception as e:
    logging.error(f"Failed to initialize tree analyzer: {e}")
    analyzer = None
110
 
111
  # --- Enhanced Tool Detection ---
def check_and_fix_executable_permissions(filepath):
    """Make sure ``filepath`` is executable, adding execute bits if needed.

    Returns:
        True when the path exists and is already executable, or when the
        ``chmod`` call adding user/group execute bits completed.
        False when the path does not exist or any step raised.
    """
    try:
        if not os.path.exists(filepath):
            return False
        if os.access(filepath, os.X_OK):
            return True

        logging.info(f"File {filepath} is not executable, attempting to fix permissions...")
        execute_bits = stat.S_IEXEC | stat.S_IXUSR | stat.S_IXGRP
        existing_mode = os.stat(filepath).st_mode
        os.chmod(filepath, existing_mode | execute_bits)
        logging.info(f"Fixed permissions for {filepath}")
        return True
    except Exception as err:
        logging.error(f"Failed to fix permissions for {filepath}: {err}")
        return False
 
 
127
 
def _find_tool(candidates, label):
    """Return the first usable entry of ``candidates`` for tool ``label``, else None."""
    for candidate in candidates:
        if not candidate:
            continue
        if os.path.dirname(candidate):
            # Explicit path: it must be a regular file. The previous
            # os.path.exists() test also accepted directories, which always
            # look "executable" to os.access().
            if not os.path.isfile(candidate):
                continue
            if not candidate.startswith(("/usr/", "/opt/")):
                # Bundled binaries may have lost their execute bit.
                check_and_fix_executable_permissions(candidate)
            if os.access(candidate, os.X_OK):
                logging.info(f"Found {label} at: {candidate}")
                return candidate
        elif shutil.which(candidate) is not None:
            # Bare command names are resolved through PATH only, which is how
            # subprocess will look them up later. A same-named file or folder
            # in the working directory (e.g. the bundled "mafft/" directory)
            # no longer counts as a hit.
            logging.info(f"Found {label} in PATH: {candidate}")
            return candidate
    return None


def enhanced_check_tool_availability():
    """Enhanced check for MAFFT and IQ-TREE availability with permission fixing.

    Candidates are tried in order: the configured bundled path first, then
    bare command names and common install locations.

    Returns:
        Tuple ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)``;
        each ``*_cmd`` is the matching candidate string or None.
    """
    mafft_candidates = [
        MAFFT_PATH,
        'mafft',
        '/usr/bin/mafft',
        '/usr/local/bin/mafft',
        '/opt/homebrew/bin/mafft',
        '/usr/local/homebrew/bin/mafft',
        'mafft.bat',
    ]
    iqtree_candidates = [
        IQTREE_PATH,
        'iqtree2',
        'iqtree',
        '/usr/bin/iqtree2',
        '/usr/local/bin/iqtree2',
        '/usr/bin/iqtree',
        '/usr/local/bin/iqtree',
        '/opt/homebrew/bin/iqtree2',
        'iqtree2.exe',
        'iqtree.exe',
    ]

    mafft_cmd = _find_tool(mafft_candidates, "MAFFT")
    iqtree_cmd = _find_tool(iqtree_candidates, "IQ-TREE")
    return mafft_cmd is not None, iqtree_cmd is not None, mafft_cmd, iqtree_cmd
186
 
def get_installation_instructions():
    """Get detailed installation instructions based on the current system.

    On Linux the distribution is guessed from ``/etc/os-release``; macOS and
    Windows get their own text; anything else (including unrecognised Linux
    distributions) gets the general instructions.

    Returns:
        A multi-line, human-readable instruction string.
    """
    import platform

    system = platform.system().lower()
    if system == "linux":
        try:
            with open('/etc/os-release', 'r') as f:
                os_info = f.read().lower()
        except (OSError, UnicodeDecodeError):
            # Was a bare `except:`, which also swallowed KeyboardInterrupt and
            # SystemExit. An unreadable file just means "unknown distribution".
            os_info = ""
        if 'ubuntu' in os_info or 'debian' in os_info:
            return """
📦 INSTALLATION INSTRUCTIONS (Ubuntu/Debian):
1. Update package list: sudo apt-get update
2. Install MAFFT and IQ-TREE: sudo apt-get install mafft iqtree
3. Verify installation: mafft --version, iqtree2 --version
Alternative using Conda: conda install -c bioconda mafft iqtree
"""
        if 'centos' in os_info or 'rhel' in os_info or 'fedora' in os_info:
            return """
📦 INSTALLATION INSTRUCTIONS (CentOS/RHEL/Fedora):
1. Install EPEL repository (CentOS/RHEL): sudo yum install epel-release
2. Install packages: sudo yum install mafft iqtree
3. Verify installation: mafft --version, iqtree2 --version
"""
    elif system == "darwin":
        return """
📦 INSTALLATION INSTRUCTIONS (macOS):
Using Homebrew: 1. Install Homebrew: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
2. Install MAFFT and IQ-TREE: brew install mafft iqtree
3. Verify installation: mafft --version, iqtree2 --version
Using Conda: conda install -c bioconda mafft iqtree
"""
    elif system == "windows":
        return """
📦 INSTALLATION INSTRUCTIONS (Windows):
Option 1 - Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree
Option 2 - Manual: 1. Download MAFFT: https://mafft.cbrc.jp/alignment/software/
2. Download IQ-TREE: http://www.iqtree.org/
3. Add to PATH
"""
    return """
📦 GENERAL INSTALLATION INSTRUCTIONS:
Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree
Manual: 1. MAFFT: https://mafft.cbrc.jp/alignment/software/
2. IQ-TREE: http://www.iqtree.org/
"""
 
234
 
def run_mafft_alignment_improved(input_fasta, output_fasta, mafft_cmd):
    """Run MAFFT alignment with improved permission and error handling.

    Args:
        input_fasta: Path of the unaligned multi-FASTA input.
        output_fasta: Path the alignment (MAFFT's stdout) is written to.
        mafft_cmd: MAFFT executable, either a path or a bare command name
            that is resolved through PATH.

    Returns:
        ``(True, output_fasta)`` on success, otherwise ``(False, message)``.
    """
    try:
        # Resolve the command the way the OS will. The old os.access(mafft_cmd)
        # test failed for bare names such as "mafft" that are only reachable
        # through PATH, so PATH installs were rejected as "not executable".
        resolved_cmd = shutil.which(mafft_cmd)
        if resolved_cmd is None and os.path.exists(mafft_cmd):
            if not os.access(mafft_cmd, os.X_OK):
                logging.warning(f"MAFFT executable {mafft_cmd} is not executable")
                if not check_and_fix_executable_permissions(mafft_cmd):
                    return False, f"Cannot make {mafft_cmd} executable"
            resolved_cmd = os.path.abspath(mafft_cmd)
        if resolved_cmd is None:
            return False, f"MAFFT executable not found: {mafft_cmd}"

        # Smoke test so a broken binary is reported before the long run.
        try:
            test_result = subprocess.run([resolved_cmd, '--version'], capture_output=True, text=True, timeout=10)
            if test_result.returncode != 0:
                return False, f"MAFFT version check failed: {test_result.stderr}"
        except Exception as e:
            return False, f"MAFFT version check failed: {str(e)}"

        cmd = [resolved_cmd, '--auto', '--quiet', '--thread', '2', input_fasta]
        logging.info(f"Running MAFFT: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd())

        if result.returncode != 0:
            error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
            logging.error(f"MAFFT failed: {error_msg}")
            return False, f"MAFFT error: {error_msg}"

        # The alignment is taken from stdout and written out by us.
        with open(output_fasta, 'w') as f:
            f.write(result.stdout)
        logging.info(f"MAFFT alignment completed: {output_fasta}")

        if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
            return True, output_fasta
        return False, "MAFFT completed but output file is empty"

    except subprocess.TimeoutExpired:
        logging.error("MAFFT timeout")
        return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
    except PermissionError as e:
        logging.error(f"Permission error running MAFFT: {e}")
        return False, f"Permission denied: {mafft_cmd}. Please check file permissions."
    except FileNotFoundError:
        return False, f"MAFFT executable not found: {mafft_cmd}"
    except Exception as e:
        logging.error(f"MAFFT execution failed: {e}")
        return False, f"MAFFT execution failed: {str(e)}"
274
-
def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
    """Run IQ-TREE with enhanced options and error handling.

    Args:
        aligned_fasta: Path of the aligned FASTA passed to ``-s``.
        output_prefix: Value passed to ``--prefix``; the tree is expected at
            ``<output_prefix>.treefile``.
        iqtree_cmd: IQ-TREE executable, either a path or a bare command name
            that is resolved through PATH.

    Returns:
        ``(True, tree_file_path)`` on success, otherwise ``(False, message)``.
    """
    try:
        # Resolve the command the way the OS will. The old os.access(iqtree_cmd)
        # test failed for bare names such as "iqtree2" that are only reachable
        # through PATH, so PATH installs were rejected as "not executable".
        resolved_cmd = shutil.which(iqtree_cmd)
        if resolved_cmd is None and os.path.exists(iqtree_cmd):
            if not os.access(iqtree_cmd, os.X_OK):
                logging.warning(f"IQ-TREE executable {iqtree_cmd} is not executable")
                if not check_and_fix_executable_permissions(iqtree_cmd):
                    return False, f"Cannot make {iqtree_cmd} executable"
            resolved_cmd = os.path.abspath(iqtree_cmd)
        if resolved_cmd is None:
            return False, f"IQ-TREE executable not found: {iqtree_cmd}"

        # Smoke test so a broken binary is reported before the long run.
        try:
            test_result = subprocess.run([resolved_cmd, '--version'], capture_output=True, text=True, timeout=10)
            if test_result.returncode != 0:
                return False, f"IQ-TREE version check failed: {test_result.stderr}"
        except Exception as e:
            return False, f"IQ-TREE version check failed: {str(e)}"

        cmd = [
            resolved_cmd,
            '-s', aligned_fasta,
            '-m', 'MFP',
            '-bb', '1000',
            '-alrt', '1000',
            '-nt', 'AUTO',
            '--prefix', output_prefix,
            '-redo',
            '--quiet',
        ]
        logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200, cwd=os.getcwd())

        if result.returncode != 0:
            error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
            logging.error(f"IQ-TREE failed: {error_msg}")
            return False, f"IQ-TREE error: {error_msg}"

        tree_file = f"{output_prefix}.treefile"
        if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
            logging.info(f"IQ-TREE analysis completed: {tree_file}")
            return True, tree_file
        logging.error("IQ-TREE completed but tree file not found or empty")
        return False, "Tree file not generated or empty"

    except subprocess.TimeoutExpired:
        logging.error("IQ-TREE timeout")
        return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
    except PermissionError as e:
        logging.error(f"Permission error running IQ-TREE: {e}")
        return False, f"Permission denied: {iqtree_cmd}. Please check file permissions."
    except FileNotFoundError:
        return False, f"IQ-TREE executable not found: {iqtree_cmd}"
    except Exception as e:
        logging.error(f"IQ-TREE execution failed: {e}")
        return False, f"IQ-TREE execution failed: {str(e)}"
314
-
def _newick_label(name):
    """Return ``name`` as a Newick leaf label, single-quoted when it needs it."""
    label = str(name)
    if any(ch in "()[]':;, \t\n" for ch in label):
        # Inside a quoted label a literal quote is written as two quotes.
        return "'" + label.replace("'", "''") + "'"
    return label


def create_simple_neighbor_joining_tree(sequences_dict):
    """Create a simple placeholder tree when ML tools are not available.

    Despite the name, no distances are computed: every sequence name becomes a
    leaf of one star-shaped tree with a fixed branch length of 0.1. The tree
    is written to ``simple_tree.nwk`` in the current working directory.

    Args:
        sequences_dict: Mapping whose keys are the sequence names.

    Returns:
        ``(tree_file, message)`` on success, ``(None, message)`` when fewer
        than two sequences are given or writing fails.
    """
    try:
        seq_names = list(sequences_dict.keys())
        if len(seq_names) < 2:
            return None, "Need at least 2 sequences for tree construction"

        # Every sequence gets a leaf. The earlier version silently kept only
        # the first five names and wrote them unquoted, so names containing
        # Newick punctuation produced an unparsable file.
        leaves = ",".join(f"{_newick_label(name)}:0.1" for name in seq_names)
        tree_str = f"({leaves});"

        tree_file = "simple_tree.nwk"
        with open(tree_file, 'w') as f:
            f.write(tree_str)

        return tree_file, "Simple distance-based tree created"

    except Exception as e:
        return None, f"Simple tree creation failed: {str(e)}"
333
 
def _copy_reference_records(ref_path, out_handle, skip_id):
    """Copy FASTA records from ``ref_path`` to ``out_handle``, skipping ``skip_id``."""
    skipping = False
    with open(ref_path, 'r') as ref_file:
        for line in ref_file:
            if line.startswith(">"):
                header_tokens = line[1:].split()
                skipping = bool(header_tokens) and header_tokens[0] == skip_id
            if skipping:
                continue
            # Guarantee a newline so the last record cannot be glued to
            # whatever is written next.
            out_handle.write(line if line.endswith("\n") else line + "\n")


def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
    """Create a multi-FASTA file with query sequence and reference sequences.

    The query is written first. References come from
    ``f_gene_sequences_aligned.fasta`` when that file exists; otherwise up to
    20 rows of ``analyzer.data`` with a ``sequence`` longer than 50 characters
    are used.

    Args:
        query_sequence: Sequence text of the query record.
        query_id: FASTA identifier for the query record.

    Returns:
        Path of the temporary FASTA file (the caller deletes it), or None on
        failure.
    """
    temp_path = None
    try:
        # The context manager closes the handle even when a write fails.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as temp_fasta:
            temp_path = temp_fasta.name
            temp_fasta.write(f">{query_id}\n{query_sequence}\n")

            ref_fasta_path = "f_gene_sequences_aligned.fasta"
            if os.path.exists(ref_fasta_path):
                # build_maximum_likelihood_tree copies its alignment, query
                # included, over this reference file. A stale query record is
                # skipped so the new input does not carry a duplicate ID.
                _copy_reference_records(ref_fasta_path, temp_fasta, query_id)
                logging.info(f"Added reference sequences from {ref_fasta_path}")
            elif analyzer and hasattr(analyzer, 'data'):
                count = 0
                for idx, row in analyzer.data.iterrows():
                    if 'sequence' in row and len(str(row['sequence'])) > 50:
                        seq_id = row.get('id', f"Ref_{count}")
                        sequence = str(row['sequence']).upper()
                        temp_fasta.write(f">{seq_id}\n{sequence}\n")
                        count += 1
                        if count >= 20:
                            break
                logging.info(f"Added {count} reference sequences from CSV")

        return temp_path

    except Exception as e:
        logging.error(f"Failed to create multi-FASTA: {e}")
        # Do not leave a half-written temp file behind (it is delete=False).
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError:
                pass
        return None
361
-
def build_maximum_likelihood_tree(f_gene_sequence):
    """Build a maximum likelihood tree for the query via MAFFT and IQ-TREE.

    Steps: locate both tools, write the query plus references to a temporary
    multi-FASTA, align it with MAFFT, infer the tree with IQ-TREE, then copy
    the alignment and tree to the fixed names
    ``f_gene_sequences_aligned.fasta`` / ``f_gene_sequences.phy.treefile``.

    Returns:
        ``(success, message, aligned_fasta_path, tree_file_path)``. The paths
        are None for steps that did not complete; the alignment path is still
        returned when only the IQ-TREE step failed.
    """
    try:
        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = enhanced_check_tool_availability()

        mafft_line = f"✅ MAFFT found: {mafft_cmd}\n" if mafft_available else "❌ MAFFT not found\n"
        iqtree_line = f"✅ IQ-TREE found: {iqtree_cmd}\n" if iqtree_available else "❌ IQ-TREE not found\n"
        status_msg = "🔍 Checking dependencies...\n" + mafft_line + iqtree_line

        if not (mafft_available and iqtree_available):
            instructions = get_installation_instructions()
            return False, f"{status_msg}\n{instructions}", None, None

        output_dir = "ml_tree_output"
        os.makedirs(output_dir, exist_ok=True)

        # Step 1: query + reference sequences in one temporary FASTA.
        logging.info("Creating multi-FASTA file...")
        multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
        if not multi_fasta:
            return False, f"{status_msg}❌ Failed to create input FASTA", None, None

        # Step 2: alignment. The temporary input is removed whatever MAFFT reports.
        logging.info("Running MAFFT alignment...")
        aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
        mafft_success, mafft_result = run_mafft_alignment_improved(multi_fasta, aligned_fasta, mafft_cmd)
        os.unlink(multi_fasta)
        if not mafft_success:
            return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None

        # Step 3: tree inference.
        logging.info("Running IQ-TREE analysis...")
        tree_prefix = os.path.join(output_dir, "ml_tree")
        iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
        if not iqtree_success:
            return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None

        tree_file = iqtree_result
        log_file = f"{tree_prefix}.log"

        # Step 4: publish the results under the fixed file names.
        for produced, published in (
            (aligned_fasta, "f_gene_sequences_aligned.fasta"),
            (tree_file, "f_gene_sequences.phy.treefile"),
        ):
            if os.path.exists(produced):
                shutil.copy2(produced, published)

        success_msg = (
            f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
            f"- Alignment: {os.path.basename(aligned_fasta)}\n"
            f"- Tree: {os.path.basename(tree_file)}\n"
        )

        # Report the first "Best-fit model:" line of the IQ-TREE log, if any.
        if os.path.exists(log_file):
            try:
                with open(log_file, 'r') as f:
                    log_content = f.read()
                best_fit_line = next(
                    (entry for entry in log_content.split('\n') if "Best-fit model:" in entry),
                    None,
                )
                if best_fit_line is not None:
                    success_msg += f"- {best_fit_line.strip()}\n"
            except Exception as e:
                logging.warning(f"Could not read log file: {e}")

        logging.info("Maximum likelihood tree construction completed")
        return True, success_msg, aligned_fasta, tree_file

    except Exception as e:
        logging.error(f"ML tree construction failed: {e}")
        return False, f"ML tree construction failed: {str(e)}", None, None
413
 
 
def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
    """Build the interactive similarity tree for ``sequence`` via the analyzer.

    The tree HTML is also saved under ``output/`` with a file name derived
    from the first 20 characters of the sequence and the threshold.

    Args:
        sequence: Query sequence handed to the module-level ``analyzer``.
        matching_percentage: Similarity threshold; must lie in 1..99.

    Returns:
        The full HTML document of the tree on success; otherwise a plain
        message starting with "Error:", "No similar sequences found" or
        "Tree analysis error:".
    """
    try:
        # Input validation, in the same order the messages are reported.
        if not analyzer:
            return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
        if not sequence:
            return "Error: Please provide a sequence."
        if not (1 <= matching_percentage <= 99):
            return "Error: Matching percentage must be between 1 and 99."

        if not analyzer.find_query_sequence(sequence):
            return "Error: Invalid query sequence or sequence not found in dataset."

        analyzer.matching_percentage = matching_percentage

        similar_ids, achieved_percentage = analyzer.find_similar_sequences(matching_percentage)
        if not similar_ids:
            return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
        logging.info(f"Found {len(similar_ids)} similar sequences at {achieved_percentage:.1f}% similarity")

        if not analyzer.build_tree_structure(similar_ids):
            return "Error: Failed to build tree structure."

        figure = analyzer.create_interactive_tree(similar_ids, achieved_percentage)
        if not figure:
            return "Error: Failed to create tree visualization."

        html_content = figure.to_html(full_html=True, include_plotlyjs='cdn')

        # Persist a copy; anything outside [a-zA-Z0-9] becomes "_" in the name.
        output_dir = "output"
        os.makedirs(output_dir, exist_ok=True)
        name_stub = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
        html_filename = os.path.join(output_dir, f"tree_{name_stub}_{matching_percentage}.html")
        with open(html_filename, "w", encoding='utf-8') as out_file:
            out_file.write(html_content)
        logging.info(f"Tree HTML saved to {html_filename}")

        return html_content

    except Exception as e:
        import traceback

        error_msg = f"Tree analysis error: {str(e)}"
        logging.error(error_msg)
        logging.error(f"Full traceback: {traceback.format_exc()}")
        return error_msg
451
 
 
def predict_with_keras(sequence):
    """Keras prediction for initial sequence processing.

    The sequence is split into overlapping 6-mers, mapped through
    ``kmer_to_index`` (unknown k-mers map to 0) and fed to ``keras_model``.

    Args:
        sequence: Nucleotide string; at least 6 characters are required.

    Returns:
        The model's scores for the single input, each rounded to three
        decimals and joined with ", "; or an explanatory message when the
        model is unavailable, the sequence is too short or prediction fails.
    """
    try:
        if not keras_model or not kmer_to_index:
            return f"Keras model not available. Input sequence: {sequence[:100]}..."

        if len(sequence) < 6:
            return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."

        kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
        indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]

        input_arr = np.array([indices])
        prediction = keras_model.predict(input_arr, verbose=0)[0]

        # Flatten first so a per-position output of shape (L, 1) is handled
        # as well. NOTE(review): the model's output shape is not visible
        # here; confirm against the model definition.
        scores = np.asarray(prediction, dtype=float).ravel()
        # The values used to be joined with '' and ran together into one
        # unreadable string of digits; separate them.
        return ", ".join(str(round(float(score), 3)) for score in scores)
    except Exception as e:
        logging.error(f"Keras prediction failed: {e}")
        return f"Keras prediction failed: {str(e)}"
468
 
def classify_sequence(sequence):
    """Classify ``sequence`` with the classifier model, or a Keras-model fallback.

    Returns:
        Dict with keys ``status`` ("success", "warning" or "error"),
        ``message``, ``confidence`` (float or None) and ``predicted_label``
        (a ``LABELS`` entry, "Unknown" in the fallback path, or None).
    """

    def _outcome(status, message, confidence=None, predicted_label=None):
        # One place to build the result so every branch has the same keys.
        return {
            "status": status,
            "message": message,
            "confidence": confidence,
            "predicted_label": predicted_label,
        }

    try:
        classifier_ready = (
            classifier_model and classifier_kmer_to_index and classifier_maxlen is not None
        )
        if not classifier_ready:
            if not (keras_model and kmer_to_index):
                return _outcome("error", "No classification model available.")

            # Fallback: threshold the validation model's top score.
            logging.warning("Using Keras model as fallback for classification.")
            if len(sequence) < 6:
                return _outcome(
                    "error",
                    "Sequence too short for k-mer prediction (minimum 6 nucleotides).",
                )
            windows = [sequence[i:i+6] for i in range(len(sequence)-5)]
            encoded_windows = [kmer_to_index.get(window, 0) for window in windows]
            scores = keras_model.predict(np.array([encoded_windows]), verbose=0)[0]
            top_score = float(np.max(scores))
            if top_score > 0.5:  # Simple threshold-based fallback
                return _outcome("success", "F gene detected (fallback)", top_score, "F")
            return _outcome("warning", "Uncertain classification (fallback)", top_score, "Unknown")

        if len(sequence) < 1500:
            return _outcome("error", "Sequence too short. Must be at least 1500 bases.")

        # NOTE(review): range(len - 5 + 1) yields one more window than the
        # fallback path; that last window has only 5 characters. Kept as-is
        # because the tokenisation the model was trained with is not visible
        # here - confirm before changing.
        tokens = [sequence[i:i+6] for i in range(len(sequence)-5+1)]
        encoded = [classifier_kmer_to_index.get(token, 0) for token in tokens]
        padded = pad_sequences([encoded], maxlen=classifier_maxlen, padding='post')
        pred = classifier_model.predict(padded, verbose=0)
        label = LABELS[int(np.argmax(pred))]
        confidence = float(np.max(pred))

        if label == "F":
            return _outcome("success", "F gene detected.", confidence, label)
        if label == "Random":
            return _outcome(
                "error",
                "Unidentified sequence detected. Make sure you're entering the F gene of the NDV.",
                confidence,
                label,
            )
        return _outcome(
            "error",
            "No F-gene detected. Please enter an NDV's F gene.",
            confidence,
            label,
        )
    except Exception as e:
        logging.error(f"Classifier prediction failed: {e}")
        return _outcome("error", f"Prediction failed: {str(e)}")
543
 
def read_fasta_file(file_obj):
    """Read FASTA content and return its sequence lines joined into one string.

    Header lines (starting with ``>``) are dropped; the sequence lines of all
    records are concatenated.

    Args:
        file_obj: None, a path (``str`` or ``os.PathLike``), an upload object
            exposing the path as ``.name``, a file-like object whose
            ``read()`` returns ``bytes`` or ``str``, or raw FASTA text.

    Returns:
        The concatenated sequence, or "" for None input or on any error.
    """
    try:
        if file_obj is None:
            return ""

        if isinstance(file_obj, (str, os.PathLike)):
            # A path used to fall through to str(file_obj), so the path text
            # itself was returned as the "sequence". A string that is not an
            # existing file is still treated as FASTA text, as before.
            if os.path.isfile(file_obj):
                with open(file_obj, "r") as f:
                    content = f.read()
            else:
                content = str(file_obj)
        elif hasattr(file_obj, 'name'):
            with open(file_obj.name, "r") as f:
                content = f.read()
        elif hasattr(file_obj, "read"):
            raw = file_obj.read()
            # Text-mode objects return str; only bytes need decoding.
            content = raw.decode("utf-8") if isinstance(raw, (bytes, bytearray)) else str(raw)
        else:
            content = str(file_obj)

        stripped_lines = (line.strip() for line in content.strip().split("\n"))
        # Test the stripped line so an indented header is skipped as well.
        return ''.join(line for line in stripped_lines if not line.startswith(">"))
    except Exception as e:
        logging.error(f"Failed to read FASTA file: {e}")
        return ""
560
 
 
def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
    """Run the pipeline on the sequence read from an uploaded FASTA file.

    Returns:
        The 13-element tuple produced by ``run_pipeline``, or a tuple of the
        same shape carrying an error message in its first and last slots.
    """

    def _failure(headline, status):
        # Same layout as run_pipeline's result: 9 text slots, 3 file slots,
        # then the status line.
        return (headline, "", "", "", "", "", "", "", "", None, None, None, status)

    try:
        dna_input = read_fasta_file(fasta_file_obj)
        if dna_input:
            return run_pipeline(dna_input, similarity_score, build_ml_tree)
        return _failure("Failed to read FASTA file", "No input sequence")
    except Exception as e:
        error_msg = f"Pipeline error: {str(e)}"
        logging.error(error_msg)
        return _failure(error_msg, error_msg)
572
 
def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
    """Run the full pipeline with direct input to verification and ML tree.

    Args:
        dna_input: Raw sequence text. FASTA header lines and whitespace are
            removed; any remaining character outside ``ACTGN`` becomes ``N``.
        similarity_score: Similarity threshold (%) for the simplified tree.
        build_ml_tree: Whether to run the MAFFT + IQ-TREE step.

    Returns:
        A 13-tuple: ``(boundary_output, keras_output, classifier_status,
        classifier_message, classifier_label, classifier_confidence,
        ml_tree_output, simplified_ml_output, tree_html_content,
        aligned_file, phy_file, html_file, summary)``.
    """
    try:
        # Drop FASTA headers and all whitespace first. A pasted multi-line
        # sequence used to have every newline turned into an "N" below.
        raw_text = (dna_input or "").upper()
        dna_input = ''.join(
            ''.join(line.split())
            for line in raw_text.splitlines()
            if not line.lstrip().startswith('>')
        )
        if not dna_input:
            return "Empty input", "", "", "", "", "", "", "", "", None, None, None, "No input provided"

        if not re.match('^[ACTGN]+$', dna_input):
            dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
            logging.info("DNA sequence sanitized")

        # Step 1: Direct input (Boundary Model disabled)
        processed_sequence = dna_input
        boundary_output = "Boundary Model disabled. Using raw input: " + str(len(dna_input)) + " bp"
        logging.info("Using raw input directly for verification and tree analysis")

        # Step 2: Keras Prediction (Verification)
        keras_output = ""
        if processed_sequence and len(processed_sequence) >= 6:
            keras_output = predict_with_keras(processed_sequence)

        # Step 3: Classifier Prediction
        classifier_result = classify_sequence(processed_sequence)
        classifier_status = classifier_result["status"]
        classifier_message = classifier_result["message"]
        classifier_label = classifier_result["predicted_label"]
        classifier_confidence = classifier_result["confidence"]

        # Step 4: Maximum Likelihood Tree
        aligned_file = None
        phy_file = None
        ml_tree_output = ""

        if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
            try:
                logging.info("Starting maximum likelihood tree construction...")
                ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
                ml_tree_output = ml_message
                if ml_success:
                    aligned_file = ml_aligned
                    phy_file = ml_tree
            except Exception as e:
                ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
                logging.error(f"ML Tree failed: {e}")
        elif build_ml_tree:
            ml_tree_output = "❌ F gene sequence too short for ML tree construction (minimum 50 bp)"
        else:
            ml_tree_output = "ML tree construction skipped (not requested)"

        # Step 5: ML Simplified Tree
        html_file = None
        tree_html_content = "No tree generated"
        simplified_ml_output = ""

        if analyzer and processed_sequence and len(processed_sequence) >= 10:
            try:
                logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")

                tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)

                # analyze_sequence_for_tree reports failures as plain text with
                # one of these prefixes. Only "Error:" used to be recognised,
                # so the other two were shown as a successfully generated tree.
                failure_prefixes = ("Error:", "No similar sequences found", "Tree analysis error:")
                if tree_result and not tree_result.startswith(failure_prefixes):
                    tree_html_content = tree_result
                    simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"

                    output_dir = "output"
                    if os.path.isdir(output_dir):
                        html_paths = [
                            os.path.join(output_dir, entry)
                            for entry in os.listdir(output_dir)
                            if entry.endswith('.html')
                        ]
                        if html_paths:
                            # os.listdir order is arbitrary; take the most
                            # recently written file, i.e. the tree just saved.
                            html_file = max(html_paths, key=os.path.getmtime)
                            simplified_ml_output += f"\n- Tree file: {os.path.basename(html_file)}"

                    if analyzer.find_query_sequence(processed_sequence):
                        matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
                        simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
                        simplified_ml_output += f"\n- Similarity threshold: {perc:.1f}%"
                else:
                    simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
                    tree_html_content = f"<p>Error: {tree_result}</p>"

            except Exception as e:
                logging.error(f"Simplified ML tree analysis failed: {e}")
                simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
                tree_html_content = f"<p>Error: {str(e)}</p>"
        elif not analyzer:
            simplified_ml_output = "❌ Tree analyzer not available"
        else:
            simplified_ml_output = "❌ F gene sequence too short for tree analysis (minimum 10 bp)"

        # Return all results
        return (
            boundary_output,
            keras_output,
            classifier_status,
            classifier_message,
            classifier_label,
            classifier_confidence,
            ml_tree_output,
            simplified_ml_output,
            tree_html_content,
            aligned_file,
            phy_file,
            html_file,
            f"Pipeline completed. Input length: {len(processed_sequence)} bp"
        )

    except Exception as e:
        error_msg = f"Pipeline execution failed: {str(e)}"
        logging.error(error_msg)
        import traceback
        logging.error(f"Full traceback: {traceback.format_exc()}")
        return (
            error_msg, "", "", "", "", "", "", "", f"<p>Error: {error_msg}</p>",
            None, None, None, error_msg
        )
682
 
683
  # --- Gradio Interface ---
684
  def create_interface():
685
  """Create the Gradio interface with enhanced layout and features"""
 
 
686
  custom_css = """
687
- .gradio-container { max-width: 1200px !important; }
688
- .tab-nav button { font-size: 16px !important; }
689
- .output-html { height: 600px !important; overflow: auto; }
 
 
 
 
 
 
 
690
  """
 
691
  with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
692
  gr.Markdown("""
693
  # 🧬 F Gene Analysis Pipeline
694
 
695
- This tool analyzes input sequences directly (Boundary Model disabled):
696
- - **Gene Validation**: Validates with machine learning.
697
- - **Gene Classification**: Classifies sequence type (F gene or other).
698
- - **Phylogenetic Analysis**: Builds maximum likelihood and simplified trees.
 
699
 
700
  **Instructions:**
701
- 1. Enter your sequence or upload a FASTA file
702
- 2. Adjust similarity threshold (1-99%)
703
- 3. Choose whether to build ML tree (requires MAFFT & IQ-TREE)
704
- 4. Click "Run Analysis" to start
705
  """)
706
 
707
  with gr.Tab("🔬 Analysis Pipeline"):
708
  with gr.Row():
709
  with gr.Column(scale=2):
 
710
  gr.Markdown("### Input Sequence")
711
- dna_input = gr.Textbox(label="DNA Sequence", placeholder="Enter your DNA sequence here (ATCG format)...", lines=5, max_lines=10)
712
- fasta_file = gr.File(label="Or Upload FASTA File", file_types=[".fasta", ".fa", ".fas", ".txt"])
 
 
 
 
 
 
 
 
 
 
713
  with gr.Row():
714
- similarity_score = gr.Slider(minimum=1, maximum=99, value=95.0, step=1.0, label="Similarity Threshold (%)", info="Minimum similarity for phylogenetic analysis")
715
- build_ml_tree = gr.Checkbox(label="Build ML Tree", value=False, info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716
  with gr.Row():
717
  run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
718
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
 
719
  with gr.Column(scale=1):
 
720
  gr.Markdown("### Analysis Status")
721
- status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
 
 
 
 
 
 
 
722
  gr.Markdown("### Available Models")
723
  model_status = []
724
- model_status.append("❌ Boundary Detection Model (Disabled)") # Reflect disabled state
 
 
 
 
725
  if keras_model:
726
  model_status.append("✅ Gene Validation Model")
727
  else:
728
  model_status.append("❌ Gene Validation Model")
729
- if classifier_model:
730
- model_status.append("✅ Gene Classification Model")
731
- else:
732
- model_status.append("❌ Gene Classification Model")
733
  if analyzer:
734
  model_status.append("✅ Tree Analysis Module")
735
  else:
736
  model_status.append("❌ Tree Analysis Module")
 
737
  gr.Markdown("\n".join(model_status))
738
 
739
  with gr.Tab("📊 Results"):
740
  with gr.Row():
741
  with gr.Column():
742
- boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False, value="Boundary Model disabled. Using raw input.")
743
- keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
744
- classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
745
- classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
746
- classifier_label = gr.Textbox(label="🏷️ Predicted Label", lines=1, interactive=False)
747
- classifier_confidence = gr.Textbox(label="📊 Confidence Score", lines=1, interactive=False)
 
 
 
 
 
 
 
748
  with gr.Column():
749
- ml_tree_output = gr.Textbox(label="🌳 Maximum Likelihood Tree", lines=5, interactive=False)
750
- simplified_ml_output = gr.Textbox(label="📈 Simplified Phylogenetic Analysis", lines=3, interactive=False)
 
 
 
 
 
 
 
 
 
 
 
751
  gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
752
- tree_html = gr.HTML(label="Interactive Tree", value="<p>No tree generated yet. Run analysis to see results.</p>")
 
 
 
 
 
753
  gr.Markdown("### 📁 Download Results")
754
  with gr.Row():
755
- aligned_file = gr.File(label="Aligned Sequences (FASTA)", interactive=False)
756
- phy_file = gr.File(label="Phylogenetic Tree File", interactive=False)
757
- html_file = gr.File(label="Interactive Tree (HTML)", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
758
 
759
  with gr.Tab("ℹ️ Help & Info"):
760
  gr.Markdown("""
761
  ## About This Tool
762
 
763
  ### F Gene Analysis Pipeline
764
- - **🎯 F Gene Extraction**: Disabled; uses raw input directly.
765
- - **🔍 Gene Validation**: Validates with k-mer based machine learning.
766
- - **🧬 Gene Classification**: Classifies sequences (F gene or other).
767
- - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768
 
769
  ### Input Requirements
770
- - DNA Sequences: ATCG format, minimum 50 bp.
771
- - FASTA Files: Standard format.
772
- - Similarity Threshold: 1-99%.
773
 
774
  ### Dependencies
775
- **For ML Trees:**
 
776
  ```bash
777
- # Ubuntu/Debian: sudo apt-get install mafft iqtree
778
- # macOS: brew install mafft iqtree
779
- # Conda: conda install -c bioconda mafft iqtree
 
 
 
 
 
780
  ```
781
 
 
 
 
 
 
782
  ### Troubleshooting
783
- - *"No similar sequences"*: Lower similarity threshold.
784
- - *"Sequence too short"*: Provide >50 bp.
785
- - *"MAFFT/IQ-TREE not found"*: Install dependencies.
786
- - *"Model not available"*: Check model files.
 
 
 
 
 
 
 
 
 
 
 
787
  """)
788
 
 
 
 
 
 
 
 
789
  def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
 
790
  if file_obj is not None:
791
  return run_pipeline_from_file(file_obj, sim_score, build_tree)
792
  else:
793
- return run_pipeline(dna_seq, sim_score, build_tree)
794
-
795
  def clear_inputs():
796
  return "", None, 95.0, False, "Ready to analyze"
797
 
 
798
  run_btn.click(
799
  fn=run_analysis_combined,
800
  inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
801
  outputs=[
802
- boundary_output, keras_output, classifier_status, classifier_message,
803
- classifier_label, classifier_confidence, ml_tree_output, simplified_ml_output,
804
- tree_html, aligned_file, phy_file, html_file, status_display
805
  ]
806
  )
 
807
  clear_btn.click(
808
  fn=clear_inputs,
809
  outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
810
  )
811
 
 
 
812
  example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
 
813
  def load_example():
814
  example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
815
  return example_seq, "Example F gene sequence loaded"
816
- example_btn.click(fn=load_example, outputs=[dna_input, status_display])
 
 
 
 
817
 
818
  return iface
819
 
820
  # --- Main Execution ---
821
  if __name__ == "__main__":
 
822
  interface = create_interface()
 
 
823
  interface.launch(
824
- server_name="0.0.0.0",
825
- server_port=7860,
826
- share=False,
827
- debug=True,
828
- show_error=True,
829
- max_threads=4,
830
- auth=None,
831
- ssl_verify=False,
832
- quiet=False
833
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import ml_simplified_tree
2
  import tempfile
3
  import shutil
4
+ import sys
5
  from pathlib import Path
6
+
7
+
8
 
9
  # --- Global Variables ---
10
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
 
 
 
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12
 
13
  # --- Paths ---
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ # Model repository and file paths
17
  model_repo = "GGproject10/best_boundary_aware_model"
18
  csv_path = "f cleaned.csv"
19
+
20
 
21
  # Get HF token from environment (if available)
22
  hf_token = os.getenv("HF_TOKEN")
23
+ boundary_model = None
 
 
24
  keras_model = None
25
  kmer_to_index = None
 
 
 
26
 
27
+
28
+
29
+
30
+ # Try to load boundary model from Hugging Face Hub
31
+ try:
32
+ boundary_path = hf_hub_download(
33
+ repo_id=model_repo,
34
+ filename="best_boundary_aware_model.pth",
35
+ token=hf_token
36
+ )
37
+ if os.path.exists(boundary_path):
38
+ boundary_model = GenePredictor(boundary_path)
39
+ logging.info("Boundary model loaded successfully from Hugging Face Hub.")
40
 
41
  # Try to load Keras model from Hugging Face Hub
42
  try:
43
+ keras_path = hf_hub_download(
44
+ repo_id=model_repo,
45
+ filename="best_model.keras",
46
+ token=hf_token
47
+ )
48
+ kmer_path = hf_hub_download(
49
+ repo_id=model_repo,
50
+ filename="kmer_to_index.pkl",
51
+ token=hf_token
52
+ )
53
+
54
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
55
  keras_model = load_model(keras_path)
56
  with open(kmer_path, "rb") as f:
 
 
 
 
57
  except Exception as e:
58
  logging.error(f"Failed to load Keras model from HF Hub: {e}")
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
 
80
  # --- Initialize Tree Analyzer ---
81
  analyzer = None
82
  try:
 
83
  if os.path.exists(csv_path):
84
  if analyzer.load_data(csv_path):
85
  logging.info("Tree analyzer initialized successfully")
86
+ # Try to train AI model (optional)
87
  try:
88
  if not analyzer.train_ai_model():
89
  logging.warning("AI model training failed; proceeding with basic analysis.")
 
 
 
 
 
 
 
 
 
 
90
  analyzer = None
91
 
92
  # --- Enhanced Tool Detection ---
93
+ def check_tool_availability():
94
+ """Enhanced check for MAFFT and IQ-TREE availability with multiple fallback options"""
95
+
96
+ # Check MAFFT
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
 
 
 
111
  mafft_available = False
112
  mafft_cmd = None
113
+
114
+ # Try multiple MAFFT locations
115
  mafft_candidates = [
116
  MAFFT_PATH,
117
  'mafft',
118
  '/usr/bin/mafft',
119
  '/usr/local/bin/mafft',
120
+ 'mafft.bat', # Windows
121
+
122
+
123
  ]
124
+
125
  for candidate in mafft_candidates:
126
+ if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
  mafft_available = True
136
  mafft_cmd = candidate
137
+ logging.info(f"Found MAFFT at: {candidate}")
138
  break
139
+
140
+ # Check IQ-TREE
141
  iqtree_available = False
142
  iqtree_cmd = None
143
+
144
+ # Try multiple IQ-TREE locations and names
145
  iqtree_candidates = [
146
  IQTREE_PATH,
147
  'iqtree2',
 
 
148
  '/usr/local/bin/iqtree2',
149
  '/usr/bin/iqtree',
150
  '/usr/local/bin/iqtree',
151
+ 'iqtree2.exe', # Windows
152
+ 'iqtree.exe', # Windows
153
+
154
  ]
155
+
156
  for candidate in iqtree_candidates:
157
+ if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
  iqtree_available = True
167
  iqtree_cmd = candidate
168
+ logging.info(f"Found IQ-TREE at: {candidate}")
169
  break
170
+
171
  return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
172
 
173
+ def install_dependencies_guide():
174
+ """Provide installation guidance for missing dependencies"""
175
+ guide = """
176
+ 🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
177
+
178
+ For MAFFT:
179
+ - Ubuntu/Debian: sudo apt-get install mafft
180
+ - CentOS/RHEL: sudo yum install mafft
181
+ - macOS: brew install mafft
182
+ - Windows: Download from https://mafft.cbrc.jp/alignment/software/
183
+
184
+ For IQ-TREE:
185
+ - Ubuntu/Debian: sudo apt-get install iqtree
186
+ - CentOS/RHEL: sudo yum install iqtree
187
+ - macOS: brew install iqtree
188
+ - Windows: Download from http://www.iqtree.org/
189
+
190
+ Alternative: Use conda/mamba:
191
+ - conda install -c bioconda mafft iqtree
192
+
193
+ Docker option:
194
+ - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
195
+ - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
  """
240
+ return guide
241
 
242
+ def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
243
+ """Run MAFFT alignment with enhanced error handling"""
244
  try:
245
+ # MAFFT command with more robust options
246
+ cmd = [
247
+ mafft_cmd,
248
+ '--auto', # Automatic strategy selection
249
+ '--quiet', # Reduce output verbosity
250
+ input_fasta
251
+ ]
252
+
253
+
254
+
255
+
256
  logging.info(f"Running MAFFT: {' '.join(cmd)}")
257
+
258
+ # Run MAFFT with enhanced error handling
259
+ result = subprocess.run(
260
+ cmd,
261
+ capture_output=True,
262
+ text=True,
263
+ timeout=600, # Increased timeout to 10 minutes
264
+ cwd=os.getcwd() # Ensure working directory is set
265
+ )
266
+
267
  if result.returncode == 0:
268
+ # Write aligned sequences to output file
269
  with open(output_fasta, 'w') as f:
270
  f.write(result.stdout)
271
  logging.info(f"MAFFT alignment completed: {output_fasta}")
272
+
273
+ # Verify output file
274
  if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
275
  return True, output_fasta
276
  else:
 
 
277
  error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
278
  logging.error(f"MAFFT failed: {error_msg}")
279
  return False, f"MAFFT error: {error_msg}"
280
+
281
  except subprocess.TimeoutExpired:
282
  logging.error("MAFFT timeout")
283
  return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
284
+
285
+
286
+
287
  except FileNotFoundError:
288
  return False, f"MAFFT executable not found: {mafft_cmd}"
289
  except Exception as e:
 
 
 
290
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
291
  """Run IQ-TREE with enhanced options and error handling"""
292
  try:
293
+ # Enhanced IQ-TREE command
294
+ cmd = [
295
+ iqtree_cmd,
296
+ '-s', aligned_fasta,
297
+ '-m', 'MFP', # ModelFinder Plus for automatic model selection
298
+ '-bb', '1000', # Bootstrap replicates
299
+ '-alrt', '1000', # SH-aLRT test
300
+ '-nt', 'AUTO', # Auto detect threads
301
+ '--prefix', output_prefix,
302
+ '-redo', # Overwrite existing files
303
+ '--quiet' # Reduce verbosity
304
+ ]
305
+
306
  logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
307
+
308
+ # Run IQ-TREE with enhanced error handling
309
+ result = subprocess.run(
310
+ cmd,
311
+ capture_output=True,
312
+ text=True,
313
+ timeout=1200, # 20 minute timeout for larger datasets
314
+ cwd=os.getcwd()
315
+ )
316
+
317
  if result.returncode == 0:
318
  tree_file = f"{output_prefix}.treefile"
319
  if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
 
 
 
 
 
 
320
  error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
321
  logging.error(f"IQ-TREE failed: {error_msg}")
322
  return False, f"IQ-TREE error: {error_msg}"
323
+
324
  except subprocess.TimeoutExpired:
325
  logging.error("IQ-TREE timeout")
326
  return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
327
+
328
+
329
+
330
  except FileNotFoundError:
331
  return False, f"IQ-TREE executable not found: {iqtree_cmd}"
332
  except Exception as e:
 
 
 
333
  def create_simple_neighbor_joining_tree(sequences_dict):
334
  """Create a simple distance-based tree when ML tools are not available"""
335
  try:
336
+ # This is a simplified implementation
337
+ # In a real scenario, you'd want to use a proper NJ implementation
338
  import random
339
+
340
  seq_names = list(sequences_dict.keys())
341
  n_seqs = len(seq_names)
342
+
343
  if n_seqs < 2:
344
  return None, "Need at least 2 sequences for tree construction"
345
+
346
+ # Create a simple Newick tree structure
347
  if n_seqs == 2:
348
  tree_str = f"({seq_names[0]}:0.1,{seq_names[1]}:0.1);"
349
  else:
350
+ # Simple clustering approach
351
  tree_str = "(" + ",".join([f"{name}:0.1" for name in seq_names[:5]]) + ");"
352
+
353
+ # Save to temporary file
354
  tree_file = "simple_tree.nwk"
355
  with open(tree_file, 'w') as f:
356
  f.write(tree_str)
357
+
358
  return tree_file, "Simple distance-based tree created"
359
+
360
  except Exception as e:
361
  return None, f"Simple tree creation failed: {str(e)}"
362
 
363
  def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
364
  """Create a multi-FASTA file with query sequence and reference sequences"""
365
  try:
366
+ # Create temporary FASTA file
367
  temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
368
+
369
+ # Add query sequence
370
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
371
+
372
+ # Add reference sequences from existing aligned FASTA if available
373
  ref_fasta_path = "f_gene_sequences_aligned.fasta"
374
  if os.path.exists(ref_fasta_path):
375
  with open(ref_fasta_path, 'r') as ref_file:
376
  temp_fasta.write(ref_file.read())
377
  logging.info(f"Added reference sequences from {ref_fasta_path}")
378
  else:
379
+ # If no reference file, try to create from CSV data
380
  if analyzer and hasattr(analyzer, 'data'):
381
  count = 0
382
  for idx, row in analyzer.data.iterrows():
 
 
383
  sequence = str(row['sequence']).upper()
384
  temp_fasta.write(f">{seq_id}\n{sequence}\n")
385
  count += 1
386
+ if count >= 20: # Limit to prevent too large datasets
387
  break
388
  logging.info(f"Added {count} reference sequences from CSV")
389
+
390
  temp_fasta.close()
391
  return temp_fasta.name
392
+
393
  except Exception as e:
394
  logging.error(f"Failed to create multi-FASTA: {e}")
395
  return None
 
396
  def build_maximum_likelihood_tree(f_gene_sequence):
397
  """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
398
  try:
399
+ # Check tool availability with enhanced detection
400
+ mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
401
+
402
+ # Prepare status message
403
  status_msg = "🔍 Checking dependencies...\n"
404
+
405
+ if not mafft_available:
406
+ status_msg += "❌ MAFFT not found\n"
407
+ else:
408
+ status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
409
+
410
+ if not iqtree_available:
411
+ status_msg += "❌ IQ-TREE not found\n"
412
+ else:
413
+ status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
414
+
415
+ # If neither tool is available, provide installation guide
416
+ if not mafft_available and not iqtree_available:
417
+ guide = install_dependencies_guide()
418
+ return False, f"{status_msg}\n{guide}", None, None
419
+
420
+ # If only one tool is missing, provide specific guidance
421
+ if not mafft_available:
422
+ return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None
423
+
424
+ if not iqtree_available:
425
+ status_msg += "\n⚠️ IQ-TREE not available. Attempting simple tree construction...\n"
426
+
427
+ # Try to create a simple tree as fallback
428
+ multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
429
+ if multi_fasta:
430
+ # Read sequences
431
+ sequences = {}
432
+ current_seq = ""
433
+ current_name = ""
434
+
435
+ with open(multi_fasta, 'r') as f:
436
+ for line in f:
437
+ line = line.strip()
438
+ if line.startswith('>'):
439
+ if current_name and current_seq:
440
+ sequences[current_name] = current_seq
441
+ current_name = line[1:]
442
+ current_seq = ""
443
+ else:
444
+ current_seq += line
445
+ if current_name and current_seq:
446
+ sequences[current_name] = current_seq
447
+
448
+ simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
449
+ os.unlink(multi_fasta)
450
+
451
+ if simple_tree:
452
+ return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
453
+ else:
454
+ return False, f"{status_msg}❌ {simple_msg}", None, None
455
+ else:
456
+ return False, f"{status_msg}❌ Failed to create input sequences", None, None
457
+
458
+ # Both tools available - proceed with full ML analysis
459
+ # Create output directory
460
  output_dir = "ml_tree_output"
461
  os.makedirs(output_dir, exist_ok=True)
462
+
463
+ # Step 1: Create multi-FASTA file with query and reference sequences
464
  logging.info("Creating multi-FASTA file...")
465
  multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
466
  if not multi_fasta:
467
  return False, f"{status_msg}❌ Failed to create input FASTA", None, None
468
+
469
+ # Step 2: Run MAFFT alignment
470
  logging.info("Running MAFFT alignment...")
471
  aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
472
+ mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
473
+
474
+ # Clean up temporary file
475
  os.unlink(multi_fasta)
476
+
477
  if not mafft_success:
478
  return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
479
+
480
+ # Step 3: Run IQ-TREE analysis
481
  logging.info("Running IQ-TREE analysis...")
482
  tree_prefix = os.path.join(output_dir, "ml_tree")
483
  iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
484
+
485
  if not iqtree_success:
486
  return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
487
+
488
+ # Step 4: Prepare output files
489
  tree_file = iqtree_result
490
  log_file = f"{tree_prefix}.log"
491
+
492
+ # Copy to standard names for compatibility
493
  standard_aligned = "f_gene_sequences_aligned.fasta"
494
  standard_tree = "f_gene_sequences.phy.treefile"
495
+
496
  if os.path.exists(aligned_fasta):
497
  shutil.copy2(aligned_fasta, standard_aligned)
498
  if os.path.exists(tree_file):
499
  shutil.copy2(tree_file, standard_tree)
500
+
501
+ success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
502
+ success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
503
+ success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
504
+
505
  if os.path.exists(log_file):
506
  try:
507
  with open(log_file, 'r') as f:
508
  log_content = f.read()
509
+ # Extract model information
510
  if "Best-fit model:" in log_content:
511
  model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
512
  if model_lines:
513
  success_msg += f"- {model_lines[0].strip()}\n"
514
  except Exception as e:
515
  logging.warning(f"Could not read log file: {e}")
516
+
517
  logging.info("Maximum likelihood tree construction completed")
518
  return True, success_msg, aligned_fasta, tree_file
519
+
520
  except Exception as e:
521
  logging.error(f"ML tree construction failed: {e}")
522
  return False, f"ML tree construction failed: {str(e)}", None, None
523
 
524
+ # --- Tree Analysis Function (Based on old Gradio API) ---
525
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
526
+ """
527
+ Analyze sequence and create phylogenetic tree using the working Gradio API pattern
528
+ """
529
  try:
530
  if not analyzer:
531
  return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
532
+
533
  if not sequence:
534
  return "Error: Please provide a sequence."
535
+
536
  if not (1 <= matching_percentage <= 99):
537
  return "Error: Matching percentage must be between 1 and 99."
538
+
539
+ # Find query sequence
540
  if not analyzer.find_query_sequence(sequence):
541
  return "Error: Invalid query sequence or sequence not found in dataset."
542
+
543
+ # Set matching percentage
544
  analyzer.matching_percentage = matching_percentage
545
+
546
+ # Find similar sequences
547
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
548
+
549
  if not matched_ids:
550
  return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
551
+
552
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
553
+
554
+ # Build tree structure
555
  tree_structure = analyzer.build_tree_structure(matched_ids)
556
  if not tree_structure:
557
  return "Error: Failed to build tree structure."
558
+
559
+ # Create interactive tree
560
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
561
  if not fig:
562
  return "Error: Failed to create tree visualization."
563
+
564
+ # Generate HTML content
565
  html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
566
+
567
+ # Save to output folder
568
  output_dir = "output"
569
  os.makedirs(output_dir, exist_ok=True)
570
+
571
+ # Create a safe filename
572
  safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
573
  html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
574
+
575
  with open(html_filename, "w", encoding='utf-8') as f:
576
  f.write(html_content)
577
+
578
  logging.info(f"Tree HTML saved to {html_filename}")
579
+
580
  return html_content
581
+
582
  except Exception as e:
583
  error_msg = f"Tree analysis error: {str(e)}"
584
  logging.error(error_msg)
 
585
  logging.error(f"Full traceback: {traceback.format_exc()}")
586
  return error_msg
587
 
588
+ # --- Keras Prediction ---
589
  def predict_with_keras(sequence):
590
+
591
  try:
592
  if not keras_model or not kmer_to_index:
593
  return f"Keras model not available. Input sequence: {sequence[:100]}..."
594
+
595
  if len(sequence) < 6:
596
  return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
597
+
598
+ # Generate k-mers
599
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
600
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
601
+
602
+ # Prepare input
603
  input_arr = np.array([indices])
604
  prediction = keras_model.predict(input_arr, verbose=0)[0]
605
+
606
+ # Format prediction as probabilities/scores (not a sequence)
607
  result = ''.join([str(round(p, 3)) for p in prediction])
608
  return result
609
  except Exception as e:
610
  logging.error(f"Keras prediction failed: {e}")
611
  return f"Keras prediction failed: {str(e)}"
612
 
613
+ # --- FASTA Reader ---
614
+
615
+
616
+
617
+
618
+
619
+
620
+
621
+
622
+
623
+
624
+
625
+
626
+
627
+
628
+
629
+
630
+
631
+
632
+
633
+
634
+
635
+
636
+
637
+
638
+
639
+
640
+
641
+
642
+
643
+
644
+
645
+
646
+
647
+
648
+
649
+
650
+
651
+
652
+
653
+
654
+
655
+
656
+
657
+
658
+
659
+
660
+
661
+
662
+
663
+
664
+
665
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
 
667
  def read_fasta_file(file_obj):
668
+
669
  try:
670
  if file_obj is None:
671
  return ""
672
+
673
+ # Handle file object
674
  if hasattr(file_obj, 'name'):
675
  with open(file_obj.name, "r") as f:
676
  content = f.read()
677
  else:
678
  content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
679
+
680
  lines = content.strip().split("\n")
681
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
682
  return ''.join(seq_lines)
 
683
  logging.error(f"Failed to read FASTA file: {e}")
684
  return ""
685
 
686
+ # --- Full Pipeline ---
687
  def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
688
+
689
  try:
690
  dna_input = read_fasta_file(fasta_file_obj)
691
  if not dna_input:
692
+ return "Failed to read FASTA file", "", "", "", "", None, None, None, "No input sequence"
693
  return run_pipeline(dna_input, similarity_score, build_ml_tree)
694
  except Exception as e:
695
  error_msg = f"Pipeline error: {str(e)}"
696
  logging.error(error_msg)
697
+ return error_msg, "", "", "", "", None, None, None, error_msg
698
 
699
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
700
+
701
  try:
702
+ # Clean input
703
  dna_input = dna_input.upper().strip()
704
  if not dna_input:
705
+ return "Empty input", "", "", "", "", None, None, None, "No input provided"
706
+
707
+ # Sanitize DNA sequence
708
  if not re.match('^[ACTGN]+$', dna_input):
709
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
710
  logging.info("DNA sequence sanitized")
711
 
712
+ # Step 1: Boundary Prediction - Extract F gene sequence
713
+ processed_sequence = dna_input # This will be the sequence used for downstream analysis
714
+ boundary_output = ""
715
+
716
+ if boundary_model:
717
+ try:
718
+ predictions, probs, confidence = boundary_model.predict(dna_input)
719
+ regions = boundary_model.extract_gene_regions(predictions, dna_input)
720
+ if regions:
721
+ processed_sequence = regions[0]["sequence"] # Use the extracted gene region
722
+ boundary_output = processed_sequence # Output the actual F gene sequence
723
+ logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
724
+ else:
725
+ boundary_output = f"No F gene regions found in input sequence"
726
+ processed_sequence = dna_input
727
+ logging.warning("No gene regions found, using full sequence")
728
+ logging.info("Boundary model prediction completed")
729
+ except Exception as e:
730
+ logging.error(f"Boundary model failed: {e}")
731
+ boundary_output = f"Boundary model error: {str(e)}"
732
+ processed_sequence = dna_input # Fall back to original sequence
733
+ else:
734
+ boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
735
+ processed_sequence = dna_input
736
 
737
+ # Step 2: Keras Prediction (F gene validation)
738
  keras_output = ""
739
  if processed_sequence and len(processed_sequence) >= 6:
740
  keras_prediction = predict_with_keras(processed_sequence)
741
+ # Interpret keras prediction as F gene validation
742
+ if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
743
+ # You might want to add logic here to interpret the prediction scores
744
+ # For now, just show the prediction
745
+ keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
746
+ else:
747
+ keras_output = keras_prediction
748
+ else:
749
+ keras_output = "Skipped: sequence too short for F gene validation"
750
+
751
+ # Step 3: Maximum Likelihood Tree (MAFFT + IQ-TREE)
752
+
753
+
754
+
755
+
756
+
757
 
 
 
 
 
 
 
758
 
 
759
  aligned_file = None
760
  phy_file = None
761
  ml_tree_output = ""
762
+
763
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
764
  try:
765
  logging.info("Starting maximum likelihood tree construction...")
766
  ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
767
+
768
  if ml_success:
769
  ml_tree_output = ml_message
770
  aligned_file = ml_aligned
771
  phy_file = ml_tree
772
  else:
773
+ ml_tree_output = ml_message # This now includes detailed error information
774
+
775
  except Exception as e:
776
  ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
777
  logging.error(f"ML Tree failed: {e}")
 
 
778
  else:
779
  ml_tree_output = "ML tree construction skipped (not requested)"
780
 
781
+ # Step 4: ML Simplified Tree (using the existing approach)
782
  html_file = None
783
  tree_html_content = "No tree generated"
784
  simplified_ml_output = ""
785
+
786
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
787
  try:
788
  logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
789
+
790
+ # Use the existing tree analysis function with user-specified similarity
791
  tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
792
+
793
  if tree_result and not tree_result.startswith("Error:"):
794
+ # Success - we have HTML content
795
  tree_html_content = tree_result
796
  simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
797
+
798
+ # Check if HTML file was created
799
  output_dir = "output"
800
  if os.path.exists(output_dir):
801
  html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
802
  if html_files:
803
+ html_file = os.path.join(output_dir, html_files[-1]) # Get the latest
804
  simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
805
+
806
+ # Count sequences analyzed
807
  if analyzer.find_query_sequence(processed_sequence):
808
  matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
809
  simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
 
810
  else:
811
  simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
812
  tree_html_content = f"<p>Error: {tree_result}</p>"
813
+
814
  except Exception as e:
815
  logging.error(f"Simplified ML tree analysis failed: {e}")
816
  simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
 
 
 
 
 
 
817
 
818
  # Return all results
819
  return (
820
+ boundary_output, # F gene extraction result
821
+ keras_output, # F gene validation result
822
+ ml_tree_output, # ML tree construction status
823
+ simplified_ml_output, # Simplified tree analysis status
824
+ tree_html_content, # HTML content for tree display
825
+ aligned_file, # Path to aligned FASTA file
826
+ phy_file, # Path to phylogenetic tree file
827
+ html_file, # Path to HTML tree file
828
+ f"Pipeline completed. F gene length: {len(processed_sequence)} bp" # Summary
829
+
830
+
831
+
832
+
833
  )
834
+
835
  except Exception as e:
836
  error_msg = f"Pipeline execution failed: {str(e)}"
837
  logging.error(error_msg)
838
  import traceback
839
  logging.error(f"Full traceback: {traceback.format_exc()}")
840
  return (
841
+ error_msg, "", "", "", f"<p>Error: {error_msg}</p>",
842
  None, None, None, error_msg
843
  )
844
 
845
  # --- Gradio Interface ---
846
  def create_interface():
847
  """Create the Gradio interface with enhanced layout and features"""
848
+
849
+ # Custom CSS for better styling
850
  custom_css = """
851
+ .gradio-container {
852
+ max-width: 1200px !important;
853
+ }
854
+ .tab-nav button {
855
+ font-size: 16px !important;
856
+ }
857
+ .output-html {
858
+ height: 600px !important;
859
+ overflow: auto;
860
+ }
861
  """
862
+
863
  with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
864
  gr.Markdown("""
865
  # 🧬 F Gene Analysis Pipeline
866
 
867
+ This tool provides comprehensive analysis of F genes including:
868
+ - **Gene Boundary Detection**: Extract F gene sequences from larger genomic sequences
869
+ - **Gene Validation**: Validate extracted sequences using machine learning
870
+ - **Phylogenetic Analysis**: Build maximum likelihood trees and simplified phylogenetic trees
871
+
872
 
873
  **Instructions:**
874
+ 1. Enter your sequence directly or upload a FASTA file
875
+ 2. Adjust similarity threshold for phylogenetic analysis (1-99%)
876
+ 3. Choose whether to build maximum likelihood trees (requires MAFFT & IQ-TREE)
877
+ 4. Click "Run Analysis" to start the pipeline
878
  """)
879
 
880
  with gr.Tab("🔬 Analysis Pipeline"):
881
  with gr.Row():
882
  with gr.Column(scale=2):
883
+ # Input section
884
  gr.Markdown("### Input Sequence")
885
+ dna_input = gr.Textbox(
886
+ label="DNA Sequence",
887
+ placeholder="Enter your DNA sequence here (ATCG format)...",
888
+ lines=5,
889
+ max_lines=10
890
+ )
891
+
892
+ fasta_file = gr.File(
893
+ label="Or Upload FASTA File",
894
+ file_types=[".fasta", ".fa", ".fas", ".txt"]
895
+ )
896
+
897
  with gr.Row():
898
+ similarity_score = gr.Slider(
899
+ minimum=1,
900
+ maximum=99,
901
+ value=95.0,
902
+ step=1.0,
903
+ label="Similarity Threshold (%)",
904
+ info="Minimum similarity for phylogenetic analysis"
905
+ )
906
+
907
+ build_ml_tree = gr.Checkbox(
908
+ label="Build ML Tree",
909
+ value=False,
910
+ info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)"
911
+ )
912
+
913
+ # Action buttons
914
  with gr.Row():
915
  run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
916
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
917
+
918
  with gr.Column(scale=1):
919
+ # Status and info
920
  gr.Markdown("### Analysis Status")
921
+ status_display = gr.Textbox(
922
+ label="Status",
923
+ value="Ready to analyze",
924
+ interactive=False,
925
+ lines=3
926
+ )
927
+
928
+ # Model status
929
  gr.Markdown("### Available Models")
930
  model_status = []
931
+ if boundary_model:
932
+ model_status.append("✅ Boundary Detection Model")
933
+ else:
934
+ model_status.append("❌ Boundary Detection Model")
935
+
936
  if keras_model:
937
  model_status.append("✅ Gene Validation Model")
938
  else:
939
  model_status.append("❌ Gene Validation Model")
940
+
941
+
942
+
943
+
944
  if analyzer:
945
  model_status.append("✅ Tree Analysis Module")
946
  else:
947
  model_status.append("❌ Tree Analysis Module")
948
+
949
  gr.Markdown("\n".join(model_status))
950
 
951
  with gr.Tab("📊 Results"):
952
  with gr.Row():
953
  with gr.Column():
954
+ # Text outputs
955
+ boundary_output = gr.Textbox(
956
+ label="🎯 F Gene Extraction",
957
+ lines=5,
958
+ interactive=False
959
+ )
960
+
961
+ keras_output = gr.Textbox(
962
+ label="🔍 Gene Validation",
963
+ lines=3,
964
+ interactive=False
965
+ )
966
+
967
  with gr.Column():
968
+ ml_tree_output = gr.Textbox(
969
+ label="🌳 Maximum Likelihood Tree",
970
+ lines=5,
971
+ interactive=False
972
+ )
973
+
974
+ simplified_ml_output = gr.Textbox(
975
+ label="📈 Simplified Phylogenetic Analysis",
976
+ lines=3,
977
+ interactive=False
978
+ )
979
+
980
+ # Tree visualization
981
  gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
982
+ tree_html = gr.HTML(
983
+ label="Interactive Tree",
984
+ value="<p>No tree generated yet. Run analysis to see results.</p>"
985
+ )
986
+
987
+ # File downloads
988
  gr.Markdown("### 📁 Download Results")
989
  with gr.Row():
990
+ aligned_file = gr.File(
991
+ label="Aligned Sequences (FASTA)",
992
+ interactive=False
993
+ )
994
+
995
+ phy_file = gr.File(
996
+ label="Phylogenetic Tree File",
997
+ interactive=False
998
+ )
999
+
1000
+ html_file = gr.File(
1001
+ label="Interactive Tree (HTML)",
1002
+ interactive=False
1003
+ )
1004
 
1005
  with gr.Tab("ℹ️ Help & Info"):
1006
  gr.Markdown("""
1007
  ## About This Tool
1008
 
1009
  ### F Gene Analysis Pipeline
1010
+ This comprehensive pipeline analyzes F genes through multiple computational approaches:
1011
+
1012
+ #### 🎯 Gene Boundary Detection
1013
+ - Uses deep learning to identify and extract F gene sequences from larger genomic sequences
1014
+ - Provides confidence scores for detected boundaries
1015
+ - Automatically trims sequences to focus on the F gene region
1016
+
1017
+ #### 🔍 Gene Validation
1018
+ - Employs k-mer based machine learning models to validate extracted sequences
1019
+ - Provides probability scores indicating likelihood of being a genuine F gene
1020
+ - Uses 6-mer frequency patterns for classification
1021
+
1022
+ #### 🌳 Phylogenetic Analysis
1023
+
1024
+ **Maximum Likelihood Trees:**
1025
+ - Requires MAFFT (sequence alignment) and IQ-TREE (phylogenetic reconstruction)
1026
+ - Performs model selection and bootstrap analysis
1027
+ - Generates publication-quality phylogenetic trees
1028
+ - Provides detailed evolutionary analysis
1029
+
1030
+ **Simplified Trees:**
1031
+ - Uses built-in algorithms for quick phylogenetic analysis
1032
+ - Interactive visualization with similarity-based clustering
1033
+ - Faster alternative when external tools are not available
1034
 
1035
  ### Input Requirements
1036
+ - **DNA Sequences**: ATCG format, minimum 50 bp for meaningful analysis
1037
+ - **FASTA Files**: Standard FASTA format with single or multiple sequences
1038
+ - **Similarity Threshold**: 1-99% for controlling phylogenetic analysis sensitivity
1039
 
1040
  ### Dependencies
1041
+
1042
+ **Required for ML Trees:**
1043
  ```bash
1044
+ # Ubuntu/Debian
1045
+ sudo apt-get install mafft iqtree
1046
+
1047
+ # macOS
1048
+ brew install mafft iqtree
1049
+
1050
+ # Conda
1051
+ conda install -c bioconda mafft iqtree
1052
  ```
1053
 
1054
+ ### Output Files
1055
+ - **Aligned FASTA**: Multiple sequence alignment in FASTA format
1056
+ - **Tree File**: Newick format phylogenetic tree
1057
+ - **HTML Tree**: Interactive visualization for web browsers
1058
+
1059
  ### Troubleshooting
1060
+
1061
+ **Common Issues:**
1062
+ - *"No similar sequences found"*: Lower the similarity threshold
1063
+ - *"Sequence too short"*: Provide sequences longer than 50 bp
1064
+ - *"MAFFT/IQ-TREE not found"*: Install required dependencies
1065
+ - *"Model not available"*: Check model files are properly downloaded
1066
+
1067
+ **Performance Tips:**
1068
+ - Use sequences between 100-2000 bp for optimal performance
1069
+ - Limit to <50 sequences for faster tree construction
1070
+ - Lower similarity thresholds find more distant relatives
1071
+ - Higher thresholds focus on closely related sequences
1072
+
1073
+ ### Citation
1074
+ If you use this tool in your research, please cite the appropriate methods and tools used.
1075
  """)
1076
 
1077
+ # Event handlers
1078
+ def run_analysis_text(dna_seq, sim_score, build_tree):
1079
+ return run_pipeline(dna_seq, sim_score, build_tree)
1080
+
1081
+ def run_analysis_file(file_obj, sim_score, build_tree):
1082
+ return run_pipeline_from_file(file_obj, sim_score, build_tree)
1083
+
1084
  def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
1085
+ # Priority: file upload over text input
1086
  if file_obj is not None:
1087
  return run_pipeline_from_file(file_obj, sim_score, build_tree)
1088
  else:
 
 
1089
  def clear_inputs():
1090
  return "", None, 95.0, False, "Ready to analyze"
1091
 
1092
+ # Connect events
1093
  run_btn.click(
1094
  fn=run_analysis_combined,
1095
  inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
1096
  outputs=[
1097
+ boundary_output, keras_output, ml_tree_output,
1098
+ simplified_ml_output, tree_html, aligned_file,
1099
+ phy_file, html_file, status_display
1100
  ]
1101
  )
1102
+
1103
  clear_btn.click(
1104
  fn=clear_inputs,
1105
  outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
1106
  )
1107
 
1108
+ # Example data loading
1109
+ gr.Markdown("### 🧪 Example Data")
1110
  example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
1111
+
1112
  def load_example():
1113
  example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
1114
  return example_seq, "Example F gene sequence loaded"
1115
+
1116
+ example_btn.click(
1117
+ fn=load_example,
1118
+ outputs=[dna_input, status_display]
1119
+ )
1120
 
1121
  return iface
1122
 
1123
  # --- Main Execution ---
1124
  if __name__ == "__main__":
1125
+ # Initialize and launch interface
1126
  interface = create_interface()
1127
+
1128
+ # Launch with enhanced configuration
1129
  interface.launch(
1130
+ server_name="0.0.0.0", # Allow external connections
1131
+ server_port=7860, # Default Gradio port
1132
+ share=False, # Set to True for public sharing
1133
+ debug=True, # Enable debug mode
1134
+ show_error=True, # Show detailed errors
1135
+ max_threads=4, # Limit concurrent threads
1136
+ auth=None, # Add authentication if needed: ("username", "password")
1137
+ ssl_verify=False, # For development environments
1138
+ quiet=False # Show startup messages
1139
  )