re-type commited on
Commit
b32e104
·
verified ·
1 Parent(s): 6a65f2a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +316 -508
app.py CHANGED
@@ -13,8 +13,10 @@ from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
14
  import tempfile
15
  import shutil
16
- import sys
17
  from pathlib import Path
 
 
18
 
19
  # --- Global Variables ---
20
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
@@ -24,11 +26,10 @@ IQTREE_PATH = "iqtree/bin/iqtree2" # Update this path as needed
24
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
25
 
26
  # --- Paths ---
27
- from huggingface_hub import hf_hub_download
28
-
29
  # Model repository and file paths
30
  model_repo = "GGproject10/best_boundary_aware_model"
31
  csv_path = "f cleaned.csv"
 
32
 
33
  # Get HF token from environment (if available)
34
  hf_token = os.getenv("HF_TOKEN")
@@ -37,14 +38,13 @@ hf_token = os.getenv("HF_TOKEN")
37
  boundary_model = None
38
  keras_model = None
39
  kmer_to_index = None
 
 
 
40
 
41
  # Try to load boundary model from Hugging Face Hub
42
  try:
43
- boundary_path = hf_hub_download(
44
- repo_id=model_repo,
45
- filename="best_boundary_aware_model.pth",
46
- token=hf_token
47
- )
48
  if os.path.exists(boundary_path):
49
  boundary_model = GenePredictor(boundary_path)
50
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
@@ -55,17 +55,8 @@ except Exception as e:
55
 
56
  # Try to load Keras model from Hugging Face Hub
57
  try:
58
- keras_path = hf_hub_download(
59
- repo_id=model_repo,
60
- filename="best_model.keras",
61
- token=hf_token
62
- )
63
- kmer_path = hf_hub_download(
64
- repo_id=model_repo,
65
- filename="kmer_to_index.pkl",
66
- token=hf_token
67
- )
68
-
69
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
70
  keras_model = load_model(keras_path)
71
  with open(kmer_path, "rb") as f:
@@ -76,6 +67,26 @@ try:
76
  except Exception as e:
77
  logging.error(f"Failed to load Keras model from HF Hub: {e}")
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # --- Initialize Tree Analyzer ---
80
  analyzer = None
81
  try:
@@ -83,7 +94,6 @@ try:
83
  if os.path.exists(csv_path):
84
  if analyzer.load_data(csv_path):
85
  logging.info("Tree analyzer initialized successfully")
86
- # Try to train AI model (optional)
87
  try:
88
  if not analyzer.train_ai_model():
89
  logging.warning("AI model training failed; proceeding with basic analysis.")
@@ -100,34 +110,52 @@ except Exception as e:
100
  analyzer = None
101
 
102
  # --- Enhanced Tool Detection ---
103
- def check_tool_availability():
104
- """Enhanced check for MAFFT and IQ-TREE availability with multiple fallback options"""
105
-
106
- # Check MAFFT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  mafft_available = False
108
  mafft_cmd = None
109
-
110
- # Try multiple MAFFT locations
111
  mafft_candidates = [
112
  MAFFT_PATH,
113
  'mafft',
114
  '/usr/bin/mafft',
115
  '/usr/local/bin/mafft',
116
- 'mafft.bat', # Windows
 
 
117
  ]
118
-
119
  for candidate in mafft_candidates:
120
- if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
 
 
 
 
 
 
 
 
121
  mafft_available = True
122
  mafft_cmd = candidate
123
- logging.info(f"Found MAFFT at: {candidate}")
124
  break
125
-
126
- # Check IQ-TREE
127
  iqtree_available = False
128
  iqtree_cmd = None
129
-
130
- # Try multiple IQ-TREE locations and names
131
  iqtree_candidates = [
132
  IQTREE_PATH,
133
  'iqtree2',
@@ -136,74 +164,95 @@ def check_tool_availability():
136
  '/usr/local/bin/iqtree2',
137
  '/usr/bin/iqtree',
138
  '/usr/local/bin/iqtree',
139
- 'iqtree2.exe', # Windows
140
- 'iqtree.exe', # Windows
 
141
  ]
142
-
143
  for candidate in iqtree_candidates:
144
- if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
 
 
 
 
 
 
 
 
145
  iqtree_available = True
146
  iqtree_cmd = candidate
147
- logging.info(f"Found IQ-TREE at: {candidate}")
148
  break
149
-
150
- return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
151
-
152
- def install_dependencies_guide():
153
- """Provide installation guidance for missing dependencies"""
154
- guide = """
155
- 🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
156
 
157
- For MAFFT:
158
- - Ubuntu/Debian: sudo apt-get install mafft
159
- - CentOS/RHEL: sudo yum install mafft
160
- - macOS: brew install mafft
161
- - Windows: Download from https://mafft.cbrc.jp/alignment/software/
162
-
163
- For IQ-TREE:
164
- - Ubuntu/Debian: sudo apt-get install iqtree
165
- - CentOS/RHEL: sudo yum install iqtree
166
- - macOS: brew install iqtree
167
- - Windows: Download from http://www.iqtree.org/
168
-
169
- Alternative: Use conda/mamba:
170
- - conda install -c bioconda mafft iqtree
171
 
172
- Docker option:
173
- - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
174
- - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  """
176
- return guide
177
 
178
- def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
179
- """Run MAFFT alignment with enhanced error handling"""
180
  try:
181
- # MAFFT command with more robust options
182
- cmd = [
183
- mafft_cmd,
184
- '--auto', # Automatic strategy selection
185
- '--quiet', # Reduce output verbosity
186
- input_fasta
187
- ]
188
-
 
 
 
189
  logging.info(f"Running MAFFT: {' '.join(cmd)}")
190
-
191
- # Run MAFFT with enhanced error handling
192
- result = subprocess.run(
193
- cmd,
194
- capture_output=True,
195
- text=True,
196
- timeout=600, # Increased timeout to 10 minutes
197
- cwd=os.getcwd() # Ensure working directory is set
198
- )
199
-
200
  if result.returncode == 0:
201
- # Write aligned sequences to output file
202
  with open(output_fasta, 'w') as f:
203
  f.write(result.stdout)
204
  logging.info(f"MAFFT alignment completed: {output_fasta}")
205
-
206
- # Verify output file
207
  if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
208
  return True, output_fasta
209
  else:
@@ -212,10 +261,12 @@ def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
212
  error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
213
  logging.error(f"MAFFT failed: {error_msg}")
214
  return False, f"MAFFT error: {error_msg}"
215
-
216
  except subprocess.TimeoutExpired:
217
  logging.error("MAFFT timeout")
218
  return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
 
 
 
219
  except FileNotFoundError:
220
  return False, f"MAFFT executable not found: {mafft_cmd}"
221
  except Exception as e:
@@ -225,30 +276,19 @@ def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
225
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
226
  """Run IQ-TREE with enhanced options and error handling"""
227
  try:
228
- # Enhanced IQ-TREE command
229
- cmd = [
230
- iqtree_cmd,
231
- '-s', aligned_fasta,
232
- '-m', 'MFP', # ModelFinder Plus for automatic model selection
233
- '-bb', '1000', # Bootstrap replicates
234
- '-alrt', '1000', # SH-aLRT test
235
- '-nt', 'AUTO', # Auto detect threads
236
- '--prefix', output_prefix,
237
- '-redo', # Overwrite existing files
238
- '--quiet' # Reduce verbosity
239
- ]
240
-
241
  logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
242
-
243
- # Run IQ-TREE with enhanced error handling
244
- result = subprocess.run(
245
- cmd,
246
- capture_output=True,
247
- text=True,
248
- timeout=1200, # 20 minute timeout for larger datasets
249
- cwd=os.getcwd()
250
- )
251
-
252
  if result.returncode == 0:
253
  tree_file = f"{output_prefix}.treefile"
254
  if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
@@ -261,10 +301,12 @@ def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
261
  error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
262
  logging.error(f"IQ-TREE failed: {error_msg}")
263
  return False, f"IQ-TREE error: {error_msg}"
264
-
265
  except subprocess.TimeoutExpired:
266
  logging.error("IQ-TREE timeout")
267
  return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
 
 
 
268
  except FileNotFoundError:
269
  return False, f"IQ-TREE executable not found: {iqtree_cmd}"
270
  except Exception as e:
@@ -274,50 +316,33 @@ def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
274
  def create_simple_neighbor_joining_tree(sequences_dict):
275
  """Create a simple distance-based tree when ML tools are not available"""
276
  try:
277
- # This is a simplified implementation
278
- # In a real scenario, you'd want to use a proper NJ implementation
279
  import random
280
-
281
  seq_names = list(sequences_dict.keys())
282
  n_seqs = len(seq_names)
283
-
284
  if n_seqs < 2:
285
  return None, "Need at least 2 sequences for tree construction"
286
-
287
- # Create a simple Newick tree structure
288
  if n_seqs == 2:
289
  tree_str = f"({seq_names[0]}:0.1,{seq_names[1]}:0.1);"
290
  else:
291
- # Simple clustering approach
292
  tree_str = "(" + ",".join([f"{name}:0.1" for name in seq_names[:5]]) + ");"
293
-
294
- # Save to temporary file
295
  tree_file = "simple_tree.nwk"
296
  with open(tree_file, 'w') as f:
297
  f.write(tree_str)
298
-
299
  return tree_file, "Simple distance-based tree created"
300
-
301
  except Exception as e:
302
  return None, f"Simple tree creation failed: {str(e)}"
303
 
304
  def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
305
  """Create a multi-FASTA file with query sequence and reference sequences"""
306
  try:
307
- # Create temporary FASTA file
308
  temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
309
-
310
- # Add query sequence
311
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
312
-
313
- # Add reference sequences from existing aligned FASTA if available
314
  ref_fasta_path = "f_gene_sequences_aligned.fasta"
315
  if os.path.exists(ref_fasta_path):
316
  with open(ref_fasta_path, 'r') as ref_file:
317
  temp_fasta.write(ref_file.read())
318
  logging.info(f"Added reference sequences from {ref_fasta_path}")
319
  else:
320
- # If no reference file, try to create from CSV data
321
  if analyzer and hasattr(analyzer, 'data'):
322
  count = 0
323
  for idx, row in analyzer.data.iterrows():
@@ -326,13 +351,11 @@ def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
326
  sequence = str(row['sequence']).upper()
327
  temp_fasta.write(f">{seq_id}\n{sequence}\n")
328
  count += 1
329
- if count >= 20: # Limit to prevent too large datasets
330
  break
331
  logging.info(f"Added {count} reference sequences from CSV")
332
-
333
  temp_fasta.close()
334
  return temp_fasta.name
335
-
336
  except Exception as e:
337
  logging.error(f"Failed to create multi-FASTA: {e}")
338
  return None
@@ -340,189 +363,86 @@ def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
340
  def build_maximum_likelihood_tree(f_gene_sequence):
341
  """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
342
  try:
343
- # Check tool availability with enhanced detection
344
- mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
345
-
346
- # Prepare status message
347
  status_msg = "🔍 Checking dependencies...\n"
348
-
349
- if not mafft_available:
350
- status_msg += "❌ MAFFT not found\n"
351
- else:
352
- status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
353
-
354
- if not iqtree_available:
355
- status_msg += "❌ IQ-TREE not found\n"
356
- else:
357
- status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
358
-
359
- # If neither tool is available, provide installation guide
360
- if not mafft_available and not iqtree_available:
361
- guide = install_dependencies_guide()
362
- return False, f"{status_msg}\n{guide}", None, None
363
-
364
- # If only one tool is missing, provide specific guidance
365
- if not mafft_available:
366
- return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None
367
-
368
- if not iqtree_available:
369
- status_msg += "\n⚠️ IQ-TREE not available. Attempting simple tree construction...\n"
370
-
371
- # Try to create a simple tree as fallback
372
- multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
373
- if multi_fasta:
374
- # Read sequences
375
- sequences = {}
376
- current_seq = ""
377
- current_name = ""
378
-
379
- with open(multi_fasta, 'r') as f:
380
- for line in f:
381
- line = line.strip()
382
- if line.startswith('>'):
383
- if current_name and current_seq:
384
- sequences[current_name] = current_seq
385
- current_name = line[1:]
386
- current_seq = ""
387
- else:
388
- current_seq += line
389
- if current_name and current_seq:
390
- sequences[current_name] = current_seq
391
-
392
- simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
393
- os.unlink(multi_fasta)
394
-
395
- if simple_tree:
396
- return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
397
- else:
398
- return False, f"{status_msg}❌ {simple_msg}", None, None
399
- else:
400
- return False, f"{status_msg}❌ Failed to create input sequences", None, None
401
-
402
- # Both tools available - proceed with full ML analysis
403
- # Create output directory
404
  output_dir = "ml_tree_output"
405
  os.makedirs(output_dir, exist_ok=True)
406
-
407
- # Step 1: Create multi-FASTA file with query and reference sequences
408
  logging.info("Creating multi-FASTA file...")
409
  multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
410
  if not multi_fasta:
411
  return False, f"{status_msg}❌ Failed to create input FASTA", None, None
412
-
413
- # Step 2: Run MAFFT alignment
414
  logging.info("Running MAFFT alignment...")
415
  aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
416
- mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
417
-
418
- # Clean up temporary file
419
  os.unlink(multi_fasta)
420
-
421
  if not mafft_success:
422
  return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
423
-
424
- # Step 3: Run IQ-TREE analysis
425
  logging.info("Running IQ-TREE analysis...")
426
  tree_prefix = os.path.join(output_dir, "ml_tree")
427
  iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
428
-
429
  if not iqtree_success:
430
  return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
431
-
432
- # Step 4: Prepare output files
433
  tree_file = iqtree_result
434
  log_file = f"{tree_prefix}.log"
435
-
436
- # Copy to standard names for compatibility
437
  standard_aligned = "f_gene_sequences_aligned.fasta"
438
  standard_tree = "f_gene_sequences.phy.treefile"
439
-
440
  if os.path.exists(aligned_fasta):
441
  shutil.copy2(aligned_fasta, standard_aligned)
442
  if os.path.exists(tree_file):
443
  shutil.copy2(tree_file, standard_tree)
444
-
445
- success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
446
- success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
447
- success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
448
-
449
  if os.path.exists(log_file):
450
  try:
451
  with open(log_file, 'r') as f:
452
  log_content = f.read()
453
- # Extract model information
454
  if "Best-fit model:" in log_content:
455
  model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
456
  if model_lines:
457
  success_msg += f"- {model_lines[0].strip()}\n"
458
  except Exception as e:
459
  logging.warning(f"Could not read log file: {e}")
460
-
461
  logging.info("Maximum likelihood tree construction completed")
462
  return True, success_msg, aligned_fasta, tree_file
463
-
464
  except Exception as e:
465
  logging.error(f"ML tree construction failed: {e}")
466
  return False, f"ML tree construction failed: {str(e)}", None, None
467
 
468
- # --- Tree Analysis Function (Based on old Gradio API) ---
469
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
470
- """
471
- Analyze sequence and create phylogenetic tree using the working Gradio API pattern
472
- """
473
  try:
474
  if not analyzer:
475
  return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
476
-
477
  if not sequence:
478
  return "Error: Please provide a sequence."
479
-
480
  if not (1 <= matching_percentage <= 99):
481
  return "Error: Matching percentage must be between 1 and 99."
482
-
483
- # Find query sequence
484
  if not analyzer.find_query_sequence(sequence):
485
  return "Error: Invalid query sequence or sequence not found in dataset."
486
-
487
- # Set matching percentage
488
  analyzer.matching_percentage = matching_percentage
489
-
490
- # Find similar sequences
491
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
492
-
493
  if not matched_ids:
494
  return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
495
-
496
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
497
-
498
- # Build tree structure
499
  tree_structure = analyzer.build_tree_structure(matched_ids)
500
  if not tree_structure:
501
  return "Error: Failed to build tree structure."
502
-
503
- # Create interactive tree
504
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
505
  if not fig:
506
  return "Error: Failed to create tree visualization."
507
-
508
- # Generate HTML content
509
  html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
510
-
511
- # Save to output folder
512
  output_dir = "output"
513
  os.makedirs(output_dir, exist_ok=True)
514
-
515
- # Create a safe filename
516
  safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
517
  html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
518
-
519
  with open(html_filename, "w", encoding='utf-8') as f:
520
  f.write(html_content)
521
-
522
  logging.info(f"Tree HTML saved to {html_filename}")
523
-
524
  return html_content
525
-
526
  except Exception as e:
527
  error_msg = f"Tree analysis error: {str(e)}"
528
  logging.error(error_msg)
@@ -530,43 +450,87 @@ def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
530
  logging.error(f"Full traceback: {traceback.format_exc()}")
531
  return error_msg
532
 
533
- # --- Keras Prediction ---
534
  def predict_with_keras(sequence):
 
535
  try:
536
  if not keras_model or not kmer_to_index:
537
  return f"Keras model not available. Input sequence: {sequence[:100]}..."
538
-
539
  if len(sequence) < 6:
540
  return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
541
-
542
- # Generate k-mers
543
  kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
544
  indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
545
-
546
- # Prepare input
547
  input_arr = np.array([indices])
548
  prediction = keras_model.predict(input_arr, verbose=0)[0]
549
-
550
- # Format prediction as probabilities/scores (not a sequence)
551
  result = ''.join([str(round(p, 3)) for p in prediction])
552
  return result
553
  except Exception as e:
554
  logging.error(f"Keras prediction failed: {e}")
555
  return f"Keras prediction failed: {str(e)}"
556
 
557
- # --- FASTA Reader ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  def read_fasta_file(file_obj):
 
559
  try:
560
  if file_obj is None:
561
  return ""
562
-
563
- # Handle file object
564
  if hasattr(file_obj, 'name'):
565
  with open(file_obj.name, "r") as f:
566
  content = f.read()
567
  else:
568
  content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
569
-
570
  lines = content.strip().split("\n")
571
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
572
  return ''.join(seq_lines)
@@ -574,86 +538,75 @@ def read_fasta_file(file_obj):
574
  logging.error(f"Failed to read FASTA file: {e}")
575
  return ""
576
 
577
- # --- Full Pipeline ---
578
  def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
 
579
  try:
580
  dna_input = read_fasta_file(fasta_file_obj)
581
  if not dna_input:
582
- return "Failed to read FASTA file", "", "", "", "", None, None, None, "No input sequence"
583
  return run_pipeline(dna_input, similarity_score, build_ml_tree)
584
  except Exception as e:
585
  error_msg = f"Pipeline error: {str(e)}"
586
  logging.error(error_msg)
587
- return error_msg, "", "", "", "", None, None, None, error_msg
588
 
589
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
 
590
  try:
591
- # Clean input
592
  dna_input = dna_input.upper().strip()
593
  if not dna_input:
594
- return "Empty input", "", "", "", "", None, None, None, "No input provided"
595
-
596
- # Sanitize DNA sequence
597
  if not re.match('^[ACTGN]+$', dna_input):
598
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
599
  logging.info("DNA sequence sanitized")
600
 
601
- # Step 1: Boundary Prediction - Extract F gene sequence
602
- processed_sequence = dna_input # This will be the sequence used for downstream analysis
603
  boundary_output = ""
604
-
605
  if boundary_model:
606
  try:
607
  predictions, probs, confidence = boundary_model.predict(dna_input)
608
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
609
  if regions:
610
- processed_sequence = regions[0]["sequence"] # Use the extracted gene region
611
- boundary_output = processed_sequence # Output the actual F gene sequence
612
  logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
613
  else:
614
  boundary_output = f"No F gene regions found in input sequence"
615
- processed_sequence = dna_input
616
  logging.warning("No gene regions found, using full sequence")
617
- logging.info("Boundary model prediction completed")
618
  except Exception as e:
619
  logging.error(f"Boundary model failed: {e}")
620
  boundary_output = f"Boundary model error: {str(e)}"
621
- processed_sequence = dna_input # Fall back to original sequence
622
  else:
623
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
624
- processed_sequence = dna_input
625
 
626
- # Step 2: Keras Prediction (F gene validation)
627
  keras_output = ""
628
  if processed_sequence and len(processed_sequence) >= 6:
629
  keras_prediction = predict_with_keras(processed_sequence)
630
- # Interpret keras prediction as F gene validation
631
- if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
632
- # You might want to add logic here to interpret the prediction scores
633
- # For now, just show the prediction
634
- keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
635
- else:
636
- keras_output = keras_prediction
637
- else:
638
- keras_output = "Skipped: sequence too short for F gene validation"
639
 
640
- # Step 3: Maximum Likelihood Tree (MAFFT + IQ-TREE)
 
 
 
 
 
 
 
641
  aligned_file = None
642
  phy_file = None
643
  ml_tree_output = ""
644
-
645
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
646
  try:
647
  logging.info("Starting maximum likelihood tree construction...")
648
  ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
649
-
650
  if ml_success:
651
  ml_tree_output = ml_message
652
  aligned_file = ml_aligned
653
  phy_file = ml_tree
654
  else:
655
- ml_tree_output = ml_message # This now includes detailed error information
656
-
657
  except Exception as e:
658
  ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
659
  logging.error(f"ML Tree failed: {e}")
@@ -662,32 +615,23 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
662
  else:
663
  ml_tree_output = "ML tree construction skipped (not requested)"
664
 
665
- # Step 4: ML Simplified Tree (using the existing approach)
666
  html_file = None
667
  tree_html_content = "No tree generated"
668
  simplified_ml_output = ""
669
-
670
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
671
  try:
672
  logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
673
-
674
- # Use the existing tree analysis function with user-specified similarity
675
  tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
676
-
677
  if tree_result and not tree_result.startswith("Error:"):
678
- # Success - we have HTML content
679
  tree_html_content = tree_result
680
  simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
681
-
682
- # Check if HTML file was created
683
  output_dir = "output"
684
  if os.path.exists(output_dir):
685
  html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
686
  if html_files:
687
- html_file = os.path.join(output_dir, html_files[-1]) # Get the latest
688
  simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
689
-
690
- # Count sequences analyzed
691
  if analyzer.find_query_sequence(processed_sequence):
692
  matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
693
  simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
@@ -695,7 +639,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
695
  else:
696
  simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
697
  tree_html_content = f"<p>Error: {tree_result}</p>"
698
-
699
  except Exception as e:
700
  logging.error(f"Simplified ML tree analysis failed: {e}")
701
  simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
@@ -708,264 +651,141 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
708
 
709
  # Return all results
710
  return (
711
- boundary_output, # F gene extraction result
712
- keras_output, # F gene validation result
713
- ml_tree_output, # ML tree construction status
714
- simplified_ml_output, # Simplified tree analysis status
715
- tree_html_content, # HTML content for tree display
716
- aligned_file, # Path to aligned FASTA file
717
- phy_file, # Path to phylogenetic tree file
718
- html_file, # Path to HTML tree file
719
- f"Pipeline completed. F gene length: {len(processed_sequence)} bp" # Summary
 
 
 
 
720
  )
721
-
722
  except Exception as e:
723
  error_msg = f"Pipeline execution failed: {str(e)}"
724
  logging.error(error_msg)
725
  import traceback
726
  logging.error(f"Full traceback: {traceback.format_exc()}")
727
  return (
728
- error_msg, "", "", "", f"<p>Error: {error_msg}</p>",
729
  None, None, None, error_msg
730
  )
731
 
732
  # --- Gradio Interface ---
733
  def create_interface():
734
  """Create the Gradio interface with enhanced layout and features"""
735
-
736
- # Custom CSS for better styling
737
  custom_css = """
738
- .gradio-container {
739
- max-width: 1200px !important;
740
- }
741
- .tab-nav button {
742
- font-size: 16px !important;
743
- }
744
- .output-html {
745
- height: 600px !important;
746
- overflow: auto;
747
- }
748
  """
749
-
750
  with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
751
  gr.Markdown("""
752
  # 🧬 F Gene Analysis Pipeline
753
 
754
  This tool provides comprehensive analysis of F genes including:
755
- - **Gene Boundary Detection**: Extract F gene sequences from larger genomic sequences
756
- - **Gene Validation**: Validate extracted sequences using machine learning
757
- - **Phylogenetic Analysis**: Build maximum likelihood trees and simplified phylogenetic trees
 
758
 
759
  **Instructions:**
760
- 1. Enter your sequence directly or upload a FASTA file
761
- 2. Adjust similarity threshold for phylogenetic analysis (1-99%)
762
- 3. Choose whether to build maximum likelihood trees (requires MAFFT & IQ-TREE)
763
- 4. Click "Run Analysis" to start the pipeline
764
  """)
765
 
766
  with gr.Tab("🔬 Analysis Pipeline"):
767
  with gr.Row():
768
  with gr.Column(scale=2):
769
- # Input section
770
  gr.Markdown("### Input Sequence")
771
- dna_input = gr.Textbox(
772
- label="DNA Sequence",
773
- placeholder="Enter your DNA sequence here (ATCG format)...",
774
- lines=5,
775
- max_lines=10
776
- )
777
-
778
- fasta_file = gr.File(
779
- label="Or Upload FASTA File",
780
- file_types=[".fasta", ".fa", ".fas", ".txt"]
781
- )
782
-
783
  with gr.Row():
784
- similarity_score = gr.Slider(
785
- minimum=1,
786
- maximum=99,
787
- value=95.0,
788
- step=1.0,
789
- label="Similarity Threshold (%)",
790
- info="Minimum similarity for phylogenetic analysis"
791
- )
792
-
793
- build_ml_tree = gr.Checkbox(
794
- label="Build ML Tree",
795
- value=False,
796
- info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)"
797
- )
798
-
799
- # Action buttons
800
  with gr.Row():
801
  run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
802
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
803
-
804
  with gr.Column(scale=1):
805
- # Status and info
806
  gr.Markdown("### Analysis Status")
807
- status_display = gr.Textbox(
808
- label="Status",
809
- value="Ready to analyze",
810
- interactive=False,
811
- lines=3
812
- )
813
-
814
- # Model status
815
  gr.Markdown("### Available Models")
816
  model_status = []
817
  if boundary_model:
818
  model_status.append("✅ Boundary Detection Model")
819
  else:
820
  model_status.append("❌ Boundary Detection Model")
821
-
822
  if keras_model:
823
  model_status.append("✅ Gene Validation Model")
824
  else:
825
  model_status.append("❌ Gene Validation Model")
826
-
 
 
 
827
  if analyzer:
828
  model_status.append("✅ Tree Analysis Module")
829
  else:
830
  model_status.append("❌ Tree Analysis Module")
831
-
832
  gr.Markdown("\n".join(model_status))
833
 
834
  with gr.Tab("📊 Results"):
835
  with gr.Row():
836
  with gr.Column():
837
- # Text outputs
838
- boundary_output = gr.Textbox(
839
- label="🎯 F Gene Extraction",
840
- lines=5,
841
- interactive=False
842
- )
843
-
844
- keras_output = gr.Textbox(
845
- label="🔍 Gene Validation",
846
- lines=3,
847
- interactive=False
848
- )
849
-
850
  with gr.Column():
851
- ml_tree_output = gr.Textbox(
852
- label="🌳 Maximum Likelihood Tree",
853
- lines=5,
854
- interactive=False
855
- )
856
-
857
- simplified_ml_output = gr.Textbox(
858
- label="📈 Simplified Phylogenetic Analysis",
859
- lines=3,
860
- interactive=False
861
- )
862
-
863
- # Tree visualization
864
  gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
865
- tree_html = gr.HTML(
866
- label="Interactive Tree",
867
- value="<p>No tree generated yet. Run analysis to see results.</p>"
868
- )
869
-
870
- # File downloads
871
  gr.Markdown("### 📁 Download Results")
872
  with gr.Row():
873
- aligned_file = gr.File(
874
- label="Aligned Sequences (FASTA)",
875
- interactive=False
876
- )
877
-
878
- phy_file = gr.File(
879
- label="Phylogenetic Tree File",
880
- interactive=False
881
- )
882
-
883
- html_file = gr.File(
884
- label="Interactive Tree (HTML)",
885
- interactive=False
886
- )
887
 
888
  with gr.Tab("ℹ️ Help & Info"):
889
  gr.Markdown("""
890
  ## About This Tool
891
 
892
  ### F Gene Analysis Pipeline
893
- This comprehensive pipeline analyzes F genes through multiple computational approaches:
894
-
895
- #### 🎯 Gene Boundary Detection
896
- - Uses deep learning to identify and extract F gene sequences from larger genomic sequences
897
- - Provides confidence scores for detected boundaries
898
- - Automatically trims sequences to focus on the F gene region
899
-
900
- #### 🔍 Gene Validation
901
- - Employs k-mer based machine learning models to validate extracted sequences
902
- - Provides probability scores indicating likelihood of being a genuine F gene
903
- - Uses 6-mer frequency patterns for classification
904
-
905
- #### 🌳 Phylogenetic Analysis
906
-
907
- **Maximum Likelihood Trees:**
908
- - Requires MAFFT (sequence alignment) and IQ-TREE (phylogenetic reconstruction)
909
- - Performs model selection and bootstrap analysis
910
- - Generates publication-quality phylogenetic trees
911
- - Provides detailed evolutionary analysis
912
-
913
- **Simplified Trees:**
914
- - Uses built-in algorithms for quick phylogenetic analysis
915
- - Interactive visualization with similarity-based clustering
916
- - Faster alternative when external tools are not available
917
 
918
  ### Input Requirements
919
- - **DNA Sequences**: ATCG format, minimum 50 bp for meaningful analysis
920
- - **FASTA Files**: Standard FASTA format with single or multiple sequences
921
- - **Similarity Threshold**: 1-99% for controlling phylogenetic analysis sensitivity
922
 
923
  ### Dependencies
924
-
925
- **Required for ML Trees:**
926
  ```bash
927
- # Ubuntu/Debian
928
- sudo apt-get install mafft iqtree
929
-
930
- # macOS
931
- brew install mafft iqtree
932
-
933
- # Conda
934
- conda install -c bioconda mafft iqtree
935
  ```
936
 
937
- ### Output Files
938
- - **Aligned FASTA**: Multiple sequence alignment in FASTA format
939
- - **Tree File**: Newick format phylogenetic tree
940
- - **HTML Tree**: Interactive visualization for web browsers
941
-
942
  ### Troubleshooting
943
-
944
- **Common Issues:**
945
- - *"No similar sequences found"*: Lower the similarity threshold
946
- - *"Sequence too short"*: Provide sequences longer than 50 bp
947
- - *"MAFFT/IQ-TREE not found"*: Install required dependencies
948
- - *"Model not available"*: Check model files are properly downloaded
949
-
950
- **Performance Tips:**
951
- - Use sequences between 100-2000 bp for optimal performance
952
- - Limit to <50 sequences for faster tree construction
953
- - Lower similarity thresholds find more distant relatives
954
- - Higher thresholds focus on closely related sequences
955
-
956
- ### Citation
957
- If you use this tool in your research, please cite the appropriate methods and tools used.
958
  """)
959
 
960
- # Event handlers
961
- def run_analysis_text(dna_seq, sim_score, build_tree):
962
- return run_pipeline(dna_seq, sim_score, build_tree)
963
-
964
- def run_analysis_file(file_obj, sim_score, build_tree):
965
- return run_pipeline_from_file(file_obj, sim_score, build_tree)
966
-
967
  def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
968
- # Priority: file upload over text input
969
  if file_obj is not None:
970
  return run_pipeline_from_file(file_obj, sim_score, build_tree)
971
  else:
@@ -974,51 +794,39 @@ def create_interface():
974
  def clear_inputs():
975
  return "", None, 95.0, False, "Ready to analyze"
976
 
977
- # Connect events
978
  run_btn.click(
979
  fn=run_analysis_combined,
980
  inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
981
  outputs=[
982
- boundary_output, keras_output, ml_tree_output,
983
- simplified_ml_output, tree_html, aligned_file,
984
- phy_file, html_file, status_display
985
  ]
986
  )
987
-
988
  clear_btn.click(
989
  fn=clear_inputs,
990
  outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
991
  )
992
 
993
- # Example data loading
994
- gr.Markdown("### 🧪 Example Data")
995
  example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
996
-
997
  def load_example():
998
  example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
999
  return example_seq, "Example F gene sequence loaded"
1000
-
1001
- example_btn.click(
1002
- fn=load_example,
1003
- outputs=[dna_input, status_display]
1004
- )
1005
 
1006
  return iface
1007
 
1008
  # --- Main Execution ---
1009
  if __name__ == "__main__":
1010
- # Initialize and launch interface
1011
  interface = create_interface()
1012
-
1013
- # Launch with enhanced configuration
1014
  interface.launch(
1015
- server_name="0.0.0.0", # Allow external connections
1016
- server_port=7860, # Default Gradio port
1017
- share=False, # Set to True for public sharing
1018
- debug=True, # Enable debug mode
1019
- show_error=True, # Show detailed errors
1020
- max_threads=4, # Limit concurrent threads
1021
- auth=None, # Add authentication if needed: ("username", "password")
1022
- ssl_verify=False, # For development environments
1023
- quiet=False # Show startup messages
1024
  )
 
13
  import ml_simplified_tree
14
  import tempfile
15
  import shutil
16
+ import stat
17
  from pathlib import Path
18
+ from huggingface_hub import hf_hub_download
19
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
20
 
21
  # --- Global Variables ---
22
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
 
26
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
27
 
28
  # --- Paths ---
 
 
29
  # Model repository and file paths
30
  model_repo = "GGproject10/best_boundary_aware_model"
31
  csv_path = "f cleaned.csv"
32
+ classifier_model_dir = "model" # Directory for second model files
33
 
34
  # Get HF token from environment (if available)
35
  hf_token = os.getenv("HF_TOKEN")
 
38
  boundary_model = None
39
  keras_model = None
40
  kmer_to_index = None
41
+ classifier_model = None
42
+ classifier_kmer_to_index = None
43
+ classifier_maxlen = None
44
 
45
  # Try to load boundary model from Hugging Face Hub
46
  try:
47
+ boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth", token=hf_token)
 
 
 
 
48
  if os.path.exists(boundary_path):
49
  boundary_model = GenePredictor(boundary_path)
50
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
 
55
 
56
  # Try to load Keras model from Hugging Face Hub
57
  try:
58
+ keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras", token=hf_token)
59
+ kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl", token=hf_token)
 
 
 
 
 
 
 
 
 
60
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
61
  keras_model = load_model(keras_path)
62
  with open(kmer_path, "rb") as f:
 
67
  except Exception as e:
68
  logging.error(f"Failed to load Keras model from HF Hub: {e}")
69
 
70
+ # Try to load classifier model (second model)
71
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
72
+ try:
73
+ classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
74
+ classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
75
+ classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
76
+ if os.path.exists(classifier_path) and os.path.exists(classifier_kmer_path) and os.path.exists(classifier_maxlen_path):
77
+ classifier_model = load_model(classifier_path)
78
+ with open(classifier_kmer_path, "rb") as f:
79
+ classifier_kmer_to_index = pickle.load(f)
80
+ with open(classifier_maxlen_path, "r") as f:
81
+ classifier_maxlen = int(f.read().strip())
82
+ logging.info("Classifier model loaded successfully.")
83
+ else:
84
+ logging.warning(f"Classifier model files not found in {classifier_model_dir}")
85
+ except Exception as e:
86
+ logging.error(f"Failed to load classifier model: {e}")
87
+
88
+ LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
89
+
90
  # --- Initialize Tree Analyzer ---
91
  analyzer = None
92
  try:
 
94
  if os.path.exists(csv_path):
95
  if analyzer.load_data(csv_path):
96
  logging.info("Tree analyzer initialized successfully")
 
97
  try:
98
  if not analyzer.train_ai_model():
99
  logging.warning("AI model training failed; proceeding with basic analysis.")
 
110
  analyzer = None
111
 
112
  # --- Enhanced Tool Detection ---
113
def check_and_fix_executable_permissions(filepath):
    """Make sure ``filepath`` is a runnable regular file, adding exec bits if needed.

    Args:
        filepath: Path of the candidate executable.

    Returns:
        True if the path is a regular file that is executable (possibly after
        its owner/group execute bits were added), False if the path is
        missing, is not a regular file, or could not be made executable.
    """
    try:
        # A directory passes os.access(..., X_OK) but can never be run as a tool.
        if not os.path.isfile(filepath):
            return False
        if os.access(filepath, os.X_OK):
            return True
        logging.info(f"File {filepath} is not executable, attempting to fix permissions...")
        current_permissions = os.stat(filepath).st_mode
        # stat.S_IEXEC is an alias of stat.S_IXUSR, so only two bits are needed.
        os.chmod(filepath, current_permissions | stat.S_IXUSR | stat.S_IXGRP)
        # chmod can succeed without the file becoming runnable (e.g. a noexec
        # mount), so report what the OS says now instead of assuming success.
        if os.access(filepath, os.X_OK):
            logging.info(f"Fixed permissions for {filepath}")
            return True
        logging.error(f"Failed to fix permissions for {filepath}: file is still not executable")
        return False
    except (OSError, TypeError, ValueError) as e:
        logging.error(f"Failed to fix permissions for {filepath}: {e}")
        return False
128
+
129
+ def enhanced_check_tool_availability():
130
+ """Enhanced check for MAFFT and IQ-TREE availability with permission fixing"""
131
  mafft_available = False
132
  mafft_cmd = None
 
 
133
  mafft_candidates = [
134
  MAFFT_PATH,
135
  'mafft',
136
  '/usr/bin/mafft',
137
  '/usr/local/bin/mafft',
138
+ '/opt/homebrew/bin/mafft',
139
+ '/usr/local/homebrew/bin/mafft',
140
+ 'mafft.bat',
141
  ]
 
142
  for candidate in mafft_candidates:
143
+ if candidate and os.path.exists(candidate):
144
+ if "/" in candidate and not candidate.startswith("/usr/") and not candidate.startswith("/opt/"):
145
+ check_and_fix_executable_permissions(candidate)
146
+ if os.access(candidate, os.X_OK) or shutil.which(candidate) is not None:
147
+ mafft_available = True
148
+ mafft_cmd = candidate
149
+ logging.info(f"Found MAFFT at: {candidate}")
150
+ break
151
+ elif candidate and shutil.which(candidate) is not None:
152
  mafft_available = True
153
  mafft_cmd = candidate
154
+ logging.info(f"Found MAFFT in PATH: {candidate}")
155
  break
156
+
 
157
  iqtree_available = False
158
  iqtree_cmd = None
 
 
159
  iqtree_candidates = [
160
  IQTREE_PATH,
161
  'iqtree2',
 
164
  '/usr/local/bin/iqtree2',
165
  '/usr/bin/iqtree',
166
  '/usr/local/bin/iqtree',
167
+ '/opt/homebrew/bin/iqtree2',
168
+ 'iqtree2.exe',
169
+ 'iqtree.exe',
170
  ]
 
171
  for candidate in iqtree_candidates:
172
+ if candidate and os.path.exists(candidate):
173
+ if "/" in candidate and not candidate.startswith("/usr/") and not candidate.startswith("/opt/"):
174
+ check_and_fix_executable_permissions(candidate)
175
+ if os.access(candidate, os.X_OK) or shutil.which(candidate) is not None:
176
+ iqtree_available = True
177
+ iqtree_cmd = candidate
178
+ logging.info(f"Found IQ-TREE at: {candidate}")
179
+ break
180
+ elif candidate and shutil.which(candidate) is not None:
181
  iqtree_available = True
182
  iqtree_cmd = candidate
183
+ logging.info(f"Found IQ-TREE in PATH: {candidate}")
184
  break
 
 
 
 
 
 
 
185
 
186
+ return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
def get_installation_instructions():
    """Return MAFFT / IQ-TREE installation instructions for the current OS.

    The operating system comes from ``platform.system()``. On Linux the
    distribution family is looked up in ``/etc/os-release``; if that file
    cannot be read, or names neither a Debian-like nor a Red Hat-like
    distribution, the generic instructions are returned.

    Returns:
        A multi-line, human-readable instruction string.
    """
    import platform
    system = platform.system().lower()
    if system == "linux":
        try:
            # errors="replace" keeps an oddly encoded release file from raising.
            with open('/etc/os-release', 'r', encoding='utf-8', errors='replace') as f:
                os_info = f.read().lower()
        except OSError:
            # No readable release file: fall through to the generic text.
            os_info = ""
        if 'ubuntu' in os_info or 'debian' in os_info:
            return (
                "\n"
                "📦 INSTALLATION INSTRUCTIONS (Ubuntu/Debian):\n"
                "1. Update package list: sudo apt-get update\n"
                "2. Install MAFFT and IQ-TREE: sudo apt-get install mafft iqtree\n"
                "3. Verify installation: mafft --version, iqtree2 --version\n"
                "Alternative using Conda: conda install -c bioconda mafft iqtree\n"
            )
        if 'centos' in os_info or 'rhel' in os_info or 'fedora' in os_info:
            return (
                "\n"
                "📦 INSTALLATION INSTRUCTIONS (CentOS/RHEL/Fedora):\n"
                "1. Install EPEL repository (CentOS/RHEL): sudo yum install epel-release\n"
                "2. Install packages: sudo yum install mafft iqtree\n"
                "3. Verify installation: mafft --version, iqtree2 --version\n"
            )
    elif system == "darwin":
        return (
            "\n"
            "📦 INSTALLATION INSTRUCTIONS (macOS):\n"
            'Using Homebrew: 1. Install Homebrew: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"\n'
            "2. Install MAFFT and IQ-TREE: brew install mafft iqtree\n"
            "3. Verify installation: mafft --version, iqtree2 --version\n"
            "Using Conda: conda install -c bioconda mafft iqtree\n"
        )
    elif system == "windows":
        return (
            "\n"
            "📦 INSTALLATION INSTRUCTIONS (Windows):\n"
            "Option 1 - Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree\n"
            "Option 2 - Manual: 1. Download MAFFT: https://mafft.cbrc.jp/alignment/software/\n"
            "2. Download IQ-TREE: http://www.iqtree.org/\n"
            "3. Add to PATH\n"
        )
    # Unknown OS, or a Linux distribution not matched above.
    return (
        "\n"
        "📦 GENERAL INSTALLATION INSTRUCTIONS:\n"
        "Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree\n"
        "Manual: 1. MAFFT: https://mafft.cbrc.jp/alignment/software/\n"
        "2. IQ-TREE: http://www.iqtree.org/\n"
    )
 
235
 
236
+ def run_mafft_alignment_improved(input_fasta, output_fasta, mafft_cmd):
237
+ """Run MAFFT alignment with improved permission and error handling"""
238
  try:
239
+ if not os.access(mafft_cmd, os.X_OK):
240
+ logging.warning(f"MAFFT executable {mafft_cmd} is not executable")
241
+ if not check_and_fix_executable_permissions(mafft_cmd):
242
+ return False, f"Cannot make {mafft_cmd} executable"
243
+ try:
244
+ test_result = subprocess.run([mafft_cmd, '--version'], capture_output=True, text=True, timeout=10)
245
+ if test_result.returncode != 0:
246
+ return False, f"MAFFT version check failed: {test_result.stderr}"
247
+ except Exception as e:
248
+ return False, f"MAFFT version check failed: {str(e)}"
249
+ cmd = [mafft_cmd, '--auto', '--quiet', '--thread', '2', input_fasta]
250
  logging.info(f"Running MAFFT: {' '.join(cmd)}")
251
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd())
 
 
 
 
 
 
 
 
 
252
  if result.returncode == 0:
 
253
  with open(output_fasta, 'w') as f:
254
  f.write(result.stdout)
255
  logging.info(f"MAFFT alignment completed: {output_fasta}")
 
 
256
  if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
257
  return True, output_fasta
258
  else:
 
261
  error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
262
  logging.error(f"MAFFT failed: {error_msg}")
263
  return False, f"MAFFT error: {error_msg}"
 
264
  except subprocess.TimeoutExpired:
265
  logging.error("MAFFT timeout")
266
  return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
267
+ except PermissionError as e:
268
+ logging.error(f"Permission error running MAFFT: {e}")
269
+ return False, f"Permission denied: {mafft_cmd}. Please check file permissions."
270
  except FileNotFoundError:
271
  return False, f"MAFFT executable not found: {mafft_cmd}"
272
  except Exception as e:
 
276
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
277
  """Run IQ-TREE with enhanced options and error handling"""
278
  try:
279
+ if not os.access(iqtree_cmd, os.X_OK):
280
+ logging.warning(f"IQ-TREE executable {iqtree_cmd} is not executable")
281
+ if not check_and_fix_executable_permissions(iqtree_cmd):
282
+ return False, f"Cannot make {iqtree_cmd} executable"
283
+ try:
284
+ test_result = subprocess.run([iqtree_cmd, '--version'], capture_output=True, text=True, timeout=10)
285
+ if test_result.returncode != 0:
286
+ return False, f"IQ-TREE version check failed: {test_result.stderr}"
287
+ except Exception as e:
288
+ return False, f"IQ-TREE version check failed: {str(e)}"
289
+ cmd = [iqtree_cmd, '-s', aligned_fasta, '-m', 'MFP', '-bb', '1000', '-alrt', '1000', '-nt', 'AUTO', '--prefix', output_prefix, '-redo', '--quiet']
 
 
290
  logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
291
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200, cwd=os.getcwd())
 
 
 
 
 
 
 
 
 
292
  if result.returncode == 0:
293
  tree_file = f"{output_prefix}.treefile"
294
  if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
 
301
  error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
302
  logging.error(f"IQ-TREE failed: {error_msg}")
303
  return False, f"IQ-TREE error: {error_msg}"
 
304
  except subprocess.TimeoutExpired:
305
  logging.error("IQ-TREE timeout")
306
  return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
307
+ except PermissionError as e:
308
+ logging.error(f"Permission error running IQ-TREE: {e}")
309
+ return False, f"Permission denied: {iqtree_cmd}. Please check file permissions."
310
  except FileNotFoundError:
311
  return False, f"IQ-TREE executable not found: {iqtree_cmd}"
312
  except Exception as e:
 
316
def _newick_label(name):
    """Return ``name`` as a Newick leaf label, single-quoting it when required."""
    text = str(name)
    # Unquoted Newick labels may not contain blanks or any of ()[]':;,
    if text and not any(ch in " \t\r\n()[]':;," for ch in text):
        return text
    return "'" + text.replace("'", "''") + "'"


def create_simple_neighbor_joining_tree(sequences_dict):
    """Write a placeholder star tree to ``simple_tree.nwk`` in the working directory.

    Despite the name, no distances are computed: every leaf is attached to a
    single root with a fixed branch length of 0.1, and only the first five
    sequence names are used. The sequences themselves are not read.

    Args:
        sequences_dict: Mapping whose keys are the sequence names.

    Returns:
        ``(tree_file, message)`` on success, or ``(None, error_message)`` when
        fewer than two sequences are given or the file cannot be written.
    """
    try:
        seq_names = list(sequences_dict.keys())
        if len(seq_names) < 2:
            return None, "Need at least 2 sequences for tree construction"
        # Names are quoted when needed so that characters such as ':' or ','
        # inside a sequence ID cannot corrupt the Newick structure.
        leaves = ",".join(f"{_newick_label(name)}:0.1" for name in seq_names[:5])
        tree_file = "simple_tree.nwk"
        with open(tree_file, 'w') as f:
            f.write(f"({leaves});")
        return tree_file, "Simple distance-based tree created"
    except Exception as e:
        return None, f"Simple tree creation failed: {str(e)}"
334
 
335
  def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
336
  """Create a multi-FASTA file with query sequence and reference sequences"""
337
  try:
 
338
  temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
 
 
339
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
 
 
340
  ref_fasta_path = "f_gene_sequences_aligned.fasta"
341
  if os.path.exists(ref_fasta_path):
342
  with open(ref_fasta_path, 'r') as ref_file:
343
  temp_fasta.write(ref_file.read())
344
  logging.info(f"Added reference sequences from {ref_fasta_path}")
345
  else:
 
346
  if analyzer and hasattr(analyzer, 'data'):
347
  count = 0
348
  for idx, row in analyzer.data.iterrows():
 
351
  sequence = str(row['sequence']).upper()
352
  temp_fasta.write(f">{seq_id}\n{sequence}\n")
353
  count += 1
354
+ if count >= 20:
355
  break
356
  logging.info(f"Added {count} reference sequences from CSV")
 
357
  temp_fasta.close()
358
  return temp_fasta.name
 
359
  except Exception as e:
360
  logging.error(f"Failed to create multi-FASTA: {e}")
361
  return None
 
363
def build_maximum_likelihood_tree(f_gene_sequence):
    """Align the query with reference sequences (MAFFT) and infer a tree (IQ-TREE).

    Steps: locate both tools, write the query plus references to a temporary
    multi-FASTA, align it into ``ml_tree_output/aligned_sequences.fasta``, run
    IQ-TREE with the prefix ``ml_tree_output/ml_tree``, then copy the alignment
    and tree to ``f_gene_sequences_aligned.fasta`` and
    ``f_gene_sequences.phy.treefile`` in the working directory.

    Args:
        f_gene_sequence: Query nucleotide sequence.

    Returns:
        ``(success, message, aligned_fasta, tree_file)``. When a tool is
        missing, ``message`` contains installation instructions and both paths
        are None. When only IQ-TREE fails, the alignment path is still returned.
    """
    try:
        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = enhanced_check_tool_availability()
        status_msg = "🔍 Checking dependencies...\n"
        status_msg += f"✅ MAFFT found: {mafft_cmd}\n" if mafft_available else "❌ MAFFT not found\n"
        status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n" if iqtree_available else "❌ IQ-TREE not found\n"
        if not (mafft_available and iqtree_available):
            instructions = get_installation_instructions()
            return False, f"{status_msg}\n{instructions}", None, None

        output_dir = "ml_tree_output"
        os.makedirs(output_dir, exist_ok=True)

        logging.info("Creating multi-FASTA file...")
        multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
        if not multi_fasta:
            return False, f"{status_msg}❌ Failed to create input FASTA", None, None

        logging.info("Running MAFFT alignment...")
        aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
        try:
            mafft_success, mafft_result = run_mafft_alignment_improved(multi_fasta, aligned_fasta, mafft_cmd)
        finally:
            # The temporary input is removed even if the alignment step raises,
            # and a failed removal must not abort an otherwise good run.
            try:
                os.unlink(multi_fasta)
            except OSError as e:
                logging.warning(f"Could not remove temporary FASTA {multi_fasta}: {e}")
        if not mafft_success:
            return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None

        logging.info("Running IQ-TREE analysis...")
        tree_prefix = os.path.join(output_dir, "ml_tree")
        iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
        if not iqtree_success:
            return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None

        tree_file = iqtree_result
        log_file = f"{tree_prefix}.log"

        # NOTE(review): create_multi_fasta_with_query reads its references from
        # this same "f_gene_sequences_aligned.fasta", so a later run will see the
        # previous query among the references (possible duplicate IDs) - verify.
        standard_aligned = "f_gene_sequences_aligned.fasta"
        standard_tree = "f_gene_sequences.phy.treefile"
        if os.path.exists(aligned_fasta):
            shutil.copy2(aligned_fasta, standard_aligned)
        if os.path.exists(tree_file):
            shutil.copy2(tree_file, standard_tree)

        success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}\n"

        # Best effort: append the first "Best-fit model:" line from the IQ-TREE log.
        if os.path.exists(log_file):
            try:
                with open(log_file, 'r') as f:
                    best_fit = next((line for line in f if "Best-fit model:" in line), None)
                if best_fit is not None:
                    success_msg += f"- {best_fit.strip()}\n"
            except (OSError, UnicodeError) as e:
                logging.warning(f"Could not read log file: {e}")

        logging.info("Maximum likelihood tree construction completed")
        return True, success_msg, aligned_fasta, tree_file
    except Exception as e:
        logging.error(f"ML tree construction failed: {e}")
        return False, f"ML tree construction failed: {str(e)}", None, None
414
 
 
415
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
416
+ """Analyze sequence and create phylogenetic tree"""
 
 
417
  try:
418
  if not analyzer:
419
  return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
 
420
  if not sequence:
421
  return "Error: Please provide a sequence."
 
422
  if not (1 <= matching_percentage <= 99):
423
  return "Error: Matching percentage must be between 1 and 99."
 
 
424
  if not analyzer.find_query_sequence(sequence):
425
  return "Error: Invalid query sequence or sequence not found in dataset."
 
 
426
  analyzer.matching_percentage = matching_percentage
 
 
427
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
 
428
  if not matched_ids:
429
  return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
 
430
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
 
 
431
  tree_structure = analyzer.build_tree_structure(matched_ids)
432
  if not tree_structure:
433
  return "Error: Failed to build tree structure."
 
 
434
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
435
  if not fig:
436
  return "Error: Failed to create tree visualization."
 
 
437
  html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
 
 
438
  output_dir = "output"
439
  os.makedirs(output_dir, exist_ok=True)
 
 
440
  safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
441
  html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
 
442
  with open(html_filename, "w", encoding='utf-8') as f:
443
  f.write(html_content)
 
444
  logging.info(f"Tree HTML saved to {html_filename}")
 
445
  return html_content
 
446
  except Exception as e:
447
  error_msg = f"Tree analysis error: {str(e)}"
448
  logging.error(error_msg)
 
450
  logging.error(f"Full traceback: {traceback.format_exc()}")
451
  return error_msg
452
 
 
453
def predict_with_keras(sequence):
    """Score ``sequence`` with the Keras validation model.

    The sequence is split into overlapping 6-mers, each 6-mer is mapped to its
    index in ``kmer_to_index`` (0 when unknown), and the resulting index list
    is passed to ``keras_model.predict`` as a batch of one.

    Args:
        sequence: Nucleotide sequence string.

    Returns:
        The model outputs rounded to three decimals and separated by ", ", or
        an explanatory message when the model is unavailable, the sequence is
        shorter than 6 nucleotides, or prediction fails.
    """
    try:
        if not keras_model or not kmer_to_index:
            return f"Keras model not available. Input sequence: {sequence[:100]}..."
        if len(sequence) < 6:
            return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."

        kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
        indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]

        input_arr = np.array([indices])
        prediction = keras_model.predict(input_arr, verbose=0)[0]

        # The values were previously concatenated with no delimiter, which made
        # adjacent numbers run together (e.g. "0.9990.001"). Flattening first
        # also copes with outputs that carry more than one axis.
        return ', '.join(str(round(float(p), 3)) for p in np.ravel(prediction))
    except Exception as e:
        logging.error(f"Keras prediction failed: {e}")
        return f"Keras prediction failed: {str(e)}"
469
 
470
def _classification_result(status, message, confidence=None, predicted_label=None):
    """Build the result dictionary returned by classify_sequence."""
    return {
        "status": status,
        "message": message,
        "confidence": confidence,
        "predicted_label": predicted_label
    }


def classify_sequence(sequence):
    """Classify ``sequence`` with the second (classifier) model.

    The sequence is tokenised into overlapping 6-mers, encoded through
    ``classifier_kmer_to_index`` (0 for unknown k-mers), padded to
    ``classifier_maxlen`` and passed to ``classifier_model``. The arg-max
    output is looked up in ``LABELS``.

    Args:
        sequence: Nucleotide sequence string; at least 1500 bases are required.

    Returns:
        A dict with the keys ``status``, ``message``, ``confidence`` and
        ``predicted_label``. ``status`` is "success" only when the predicted
        label is "F"; every other outcome, including a missing model, a short
        sequence or an exception, is reported with ``status`` "error".
    """
    try:
        if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
            return _classification_result("error", "Classifier model not available.")
        if len(sequence) < 1500:
            return _classification_result("error", "Sequence too short. Must be at least 1500 bases.")

        k = 6
        # The range previously ended one step late and produced a trailing
        # 5-mer, which can never match a 6-mer key and was encoded as 0.
        # NOTE(review): if the model was trained with that extra token, inputs
        # longer than classifier_maxlen now differ by one position - confirm.
        tokens = [sequence[i:i + k] for i in range(len(sequence) - k + 1)]
        encoded = [classifier_kmer_to_index.get(kmer, 0) for kmer in tokens]
        padded = pad_sequences([encoded], maxlen=classifier_maxlen, padding='post')
        pred = classifier_model.predict(padded, verbose=0)
        predicted_class = int(np.argmax(pred))
        label = LABELS[predicted_class]
        confidence = float(np.max(pred))

        if label == "F":
            return _classification_result("success", "F gene detected.", confidence, label)
        if label == "Random":
            message = "Unidentified sequence detected. Make sure you're entering the F gene of the NDV."
        else:
            message = "No F-gene detected. Please enter an NDV's F gene."
        return _classification_result("error", message, confidence, label)
    except Exception as e:
        logging.error(f"Classifier prediction failed: {e}")
        return _classification_result("error", f"Prediction failed: {str(e)}")
523
+
524
  def read_fasta_file(file_obj):
525
+ """Read FASTA file content"""
526
  try:
527
  if file_obj is None:
528
  return ""
 
 
529
  if hasattr(file_obj, 'name'):
530
  with open(file_obj.name, "r") as f:
531
  content = f.read()
532
  else:
533
  content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
 
534
  lines = content.strip().split("\n")
535
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
536
  return ''.join(seq_lines)
 
538
  logging.error(f"Failed to read FASTA file: {e}")
539
  return ""
540
 
 
541
def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
    """Read a FASTA upload and hand its sequence to ``run_pipeline``.

    Args:
        fasta_file_obj: Uploaded file object passed on to ``read_fasta_file``.
        similarity_score: Similarity threshold forwarded to the pipeline.
        build_ml_tree: Whether the pipeline should build the ML tree.

    Returns:
        The 13-element tuple produced by ``run_pipeline``, or a tuple of the
        same length carrying an error message in its first and last slots
        when the file yields no sequence or an exception is raised.
    """
    def _failure(headline, status):
        # Same shape as a pipeline result: eight empty text slots and three
        # empty file slots sit between the headline and the status message.
        return (headline, "", "", "", "", "", "", "", "", None, None, None, status)

    try:
        sequence = read_fasta_file(fasta_file_obj)
        if sequence:
            return run_pipeline(sequence, similarity_score, build_ml_tree)
        return _failure("Failed to read FASTA file", "No input sequence")
    except Exception as exc:
        error_msg = f"Pipeline error: {str(exc)}"
        logging.error(error_msg)
        return _failure(error_msg, error_msg)
552
 
553
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
554
+ """Run the full pipeline"""
555
  try:
 
556
  dna_input = dna_input.upper().strip()
557
  if not dna_input:
558
+ return "Empty input", "", "", "", "", "", "", "", "", None, None, None, "No input provided"
 
 
559
  if not re.match('^[ACTGN]+$', dna_input):
560
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
561
  logging.info("DNA sequence sanitized")
562
 
563
+ # Step 1: Boundary Prediction
564
+ processed_sequence = dna_input
565
  boundary_output = ""
 
566
  if boundary_model:
567
  try:
568
  predictions, probs, confidence = boundary_model.predict(dna_input)
569
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
570
  if regions:
571
+ processed_sequence = regions[0]["sequence"]
572
+ boundary_output = processed_sequence
573
  logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
574
  else:
575
  boundary_output = f"No F gene regions found in input sequence"
 
576
  logging.warning("No gene regions found, using full sequence")
 
577
  except Exception as e:
578
  logging.error(f"Boundary model failed: {e}")
579
  boundary_output = f"Boundary model error: {str(e)}"
 
580
  else:
581
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
 
582
 
583
+ # Step 2: Keras Prediction
584
  keras_output = ""
585
  if processed_sequence and len(processed_sequence) >= 6:
586
  keras_prediction = predict_with_keras(processed_sequence)
587
+ keras_output = keras_prediction if not keras_prediction.startswith(("Keras", "Sequence")) else keras_prediction
 
 
 
 
 
 
 
 
588
 
589
+ # Step 3: Classifier Prediction
590
+ classifier_result = classify_sequence(processed_sequence)
591
+ classifier_status = classifier_result["status"]
592
+ classifier_message = classifier_result["message"]
593
+ classifier_label = classifier_result["predicted_label"]
594
+ classifier_confidence = classifier_result["confidence"]
595
+
596
+ # Step 4: Maximum Likelihood Tree
597
  aligned_file = None
598
  phy_file = None
599
  ml_tree_output = ""
 
600
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
601
  try:
602
  logging.info("Starting maximum likelihood tree construction...")
603
  ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
 
604
  if ml_success:
605
  ml_tree_output = ml_message
606
  aligned_file = ml_aligned
607
  phy_file = ml_tree
608
  else:
609
+ ml_tree_output = ml_message
 
610
  except Exception as e:
611
  ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
612
  logging.error(f"ML Tree failed: {e}")
 
615
  else:
616
  ml_tree_output = "ML tree construction skipped (not requested)"
617
 
618
+ # Step 5: ML Simplified Tree
619
  html_file = None
620
  tree_html_content = "No tree generated"
621
  simplified_ml_output = ""
 
622
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
623
  try:
624
  logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
 
 
625
  tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
 
626
  if tree_result and not tree_result.startswith("Error:"):
 
627
  tree_html_content = tree_result
628
  simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
 
 
629
  output_dir = "output"
630
  if os.path.exists(output_dir):
631
  html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
632
  if html_files:
633
+ html_file = os.path.join(output_dir, html_files[-1])
634
  simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
 
 
635
  if analyzer.find_query_sequence(processed_sequence):
636
  matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
637
  simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
 
639
  else:
640
  simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
641
  tree_html_content = f"<p>Error: {tree_result}</p>"
 
642
  except Exception as e:
643
  logging.error(f"Simplified ML tree analysis failed: {e}")
644
  simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
 
651
 
652
  # Return all results
653
  return (
654
+ boundary_output,
655
+ keras_output,
656
+ classifier_status,
657
+ classifier_message,
658
+ classifier_label,
659
+ classifier_confidence,
660
+ ml_tree_output,
661
+ simplified_ml_output,
662
+ tree_html_content,
663
+ aligned_file,
664
+ phy_file,
665
+ html_file,
666
+ f"Pipeline completed. F gene length: {len(processed_sequence)} bp"
667
  )
 
668
  except Exception as e:
669
  error_msg = f"Pipeline execution failed: {str(e)}"
670
  logging.error(error_msg)
671
  import traceback
672
  logging.error(f"Full traceback: {traceback.format_exc()}")
673
  return (
674
+ error_msg, "", "", "", "", "", "", "", f"<p>Error: {error_msg}</p>",
675
  None, None, None, error_msg
676
  )
677
 
678
  # --- Gradio Interface ---
679
  def create_interface():
680
  """Create the Gradio interface with enhanced layout and features"""
 
 
681
  custom_css = """
682
+ .gradio-container { max-width: 1200px !important; }
683
+ .tab-nav button { font-size: 16px !important; }
684
+ .output-html { height: 600px !important; overflow: auto; }
 
 
 
 
 
 
 
685
  """
 
686
  with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
687
  gr.Markdown("""
688
  # 🧬 F Gene Analysis Pipeline
689
 
690
  This tool provides comprehensive analysis of F genes including:
691
+ - **Gene Boundary Detection**: Extract F gene sequences
692
+ - **Gene Validation**: Validate with machine learning
693
+ - **Gene Classification**: Classify sequence type (F gene or other)
694
+ - **Phylogenetic Analysis**: Build maximum likelihood and simplified trees
695
 
696
  **Instructions:**
697
+ 1. Enter your sequence or upload a FASTA file
698
+ 2. Adjust similarity threshold (1-99%)
699
+ 3. Choose whether to build ML tree (requires MAFFT & IQ-TREE)
700
+ 4. Click "Run Analysis" to start
701
  """)
702
 
703
  with gr.Tab("🔬 Analysis Pipeline"):
704
  with gr.Row():
705
  with gr.Column(scale=2):
 
706
  gr.Markdown("### Input Sequence")
707
+ dna_input = gr.Textbox(label="DNA Sequence", placeholder="Enter your DNA sequence here (ATCG format)...", lines=5, max_lines=10)
708
+ fasta_file = gr.File(label="Or Upload FASTA File", file_types=[".fasta", ".fa", ".fas", ".txt"])
 
 
 
 
 
 
 
 
 
 
709
  with gr.Row():
710
+ similarity_score = gr.Slider(minimum=1, maximum=99, value=95.0, step=1.0, label="Similarity Threshold (%)", info="Minimum similarity for phylogenetic analysis")
711
+ build_ml_tree = gr.Checkbox(label="Build ML Tree", value=False, info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
712
  with gr.Row():
713
  run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
714
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
 
715
  with gr.Column(scale=1):
 
716
  gr.Markdown("### Analysis Status")
717
+ status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
 
 
 
 
 
 
 
718
  gr.Markdown("### Available Models")
719
  model_status = []
720
  if boundary_model:
721
  model_status.append("✅ Boundary Detection Model")
722
  else:
723
  model_status.append("❌ Boundary Detection Model")
 
724
  if keras_model:
725
  model_status.append("✅ Gene Validation Model")
726
  else:
727
  model_status.append("❌ Gene Validation Model")
728
+ if classifier_model:
729
+ model_status.append("✅ Gene Classification Model")
730
+ else:
731
+ model_status.append("❌ Gene Classification Model")
732
  if analyzer:
733
  model_status.append("✅ Tree Analysis Module")
734
  else:
735
  model_status.append("❌ Tree Analysis Module")
 
736
  gr.Markdown("\n".join(model_status))
737
 
738
  with gr.Tab("📊 Results"):
739
  with gr.Row():
740
  with gr.Column():
741
+ boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False)
742
+ keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
743
+ classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
744
+ classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
745
+ classifier_label = gr.Textbox(label="🏷️ Predicted Label", lines=1, interactive=False)
746
+ classifier_confidence = gr.Textbox(label="📊 Confidence Score", lines=1, interactive=False)
 
 
 
 
 
 
 
747
  with gr.Column():
748
+ ml_tree_output = gr.Textbox(label="🌳 Maximum Likelihood Tree", lines=5, interactive=False)
749
+ simplified_ml_output = gr.Textbox(label="📈 Simplified Phylogenetic Analysis", lines=3, interactive=False)
 
 
 
 
 
 
 
 
 
 
 
750
  gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
751
+ tree_html = gr.HTML(label="Interactive Tree", value="<p>No tree generated yet. Run analysis to see results.</p>")
 
 
 
 
 
752
  gr.Markdown("### 📁 Download Results")
753
  with gr.Row():
754
+ aligned_file = gr.File(label="Aligned Sequences (FASTA)", interactive=False)
755
+ phy_file = gr.File(label="Phylogenetic Tree File", interactive=False)
756
+ html_file = gr.File(label="Interactive Tree (HTML)", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
757
 
758
  with gr.Tab("ℹ️ Help & Info"):
759
  gr.Markdown("""
760
  ## About This Tool
761
 
762
  ### F Gene Analysis Pipeline
763
+ - **🎯 Gene Boundary Detection**: Extracts F gene sequences using deep learning.
764
+ - **🔍 Gene Validation**: Validates with k-mer based machine learning.
765
+ - **🧬 Gene Classification**: Classifies sequences (F gene or other) with confidence scores.
766
+ - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
767
 
768
  ### Input Requirements
769
+ - DNA Sequences: ATCG format, minimum 50 bp.
770
+ - FASTA Files: Standard format.
771
+ - Similarity Threshold: 1-99%.
772
 
773
  ### Dependencies
774
+ **For ML Trees:**
 
775
  ```bash
776
+ # Ubuntu/Debian: sudo apt-get install mafft iqtree
777
+ # macOS: brew install mafft iqtree
778
+ # Conda: conda install -c bioconda mafft iqtree
 
 
 
 
 
779
  ```
780
 
 
 
 
 
 
781
  ### Troubleshooting
782
+ - *"No similar sequences"*: Lower similarity threshold.
783
+ - *"Sequence too short"*: Provide >50 bp.
784
+ - *"MAFFT/IQ-TREE not found"*: Install dependencies.
785
+ - *"Model not available"*: Check model files.
 
 
 
 
 
 
 
 
 
 
 
786
  """)
787
 
 
 
 
 
 
 
 
788
  def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
 
789
  if file_obj is not None:
790
  return run_pipeline_from_file(file_obj, sim_score, build_tree)
791
  else:
 
794
  def clear_inputs():
795
  return "", None, 95.0, False, "Ready to analyze"
796
 
 
797
  run_btn.click(
798
  fn=run_analysis_combined,
799
  inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
800
  outputs=[
801
+ boundary_output, keras_output, classifier_status, classifier_message,
802
+ classifier_label, classifier_confidence, ml_tree_output, simplified_ml_output,
803
+ tree_html, aligned_file, phy_file, html_file, status_display
804
  ]
805
  )
 
806
  clear_btn.click(
807
  fn=clear_inputs,
808
  outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
809
  )
810
 
 
 
811
  example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
 
812
  def load_example():
813
  example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
814
  return example_seq, "Example F gene sequence loaded"
815
+ example_btn.click(fn=load_example, outputs=[dna_input, status_display])
 
 
 
 
816
 
817
  return iface
818
 
819
  # --- Main Execution ---
820
  if __name__ == "__main__":
 
821
  interface = create_interface()
 
 
822
  interface.launch(
823
+ server_name="0.0.0.0",
824
+ server_port=7860,
825
+ share=False,
826
+ debug=True,
827
+ show_error=True,
828
+ max_threads=4,
829
+ auth=None,
830
+ ssl_verify=False,
831
+ quiet=False
832
  )