re-type committed on
Commit
d6bbc7a
·
verified ·
1 Parent(s): 22db390

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +362 -711
app.py CHANGED
@@ -1,1139 +1,790 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  import ml_simplified_tree
2
  import tempfile
3
  import shutil
4
- import sys
5
  from pathlib import Path
6
-
7
-
8
 
9
  # --- Global Variables ---
10
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
 
 
 
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12
 
13
  # --- Paths ---
14
- from huggingface_hub import hf_hub_download
15
-
16
- # Model repository and file paths
17
  model_repo = "GGproject10/best_boundary_aware_model"
18
  csv_path = "f cleaned.csv"
19
-
20
 
21
  # Get HF token from environment (if available)
22
  hf_token = os.getenv("HF_TOKEN")
 
 
23
  boundary_model = None
24
  keras_model = None
25
  kmer_to_index = None
26
-
27
-
28
-
 
29
 
30
  # Try to load boundary model from Hugging Face Hub
31
  try:
32
- boundary_path = hf_hub_download(
33
- repo_id=model_repo,
34
- filename="best_boundary_aware_model.pth",
35
- token=hf_token
36
- )
37
  if os.path.exists(boundary_path):
38
  boundary_model = GenePredictor(boundary_path)
39
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
 
 
 
 
40
 
41
  # Try to load Keras model from Hugging Face Hub
42
  try:
43
- keras_path = hf_hub_download(
44
- repo_id=model_repo,
45
- filename="best_model.keras",
46
- token=hf_token
47
- )
48
- kmer_path = hf_hub_download(
49
- repo_id=model_repo,
50
- filename="kmer_to_index.pkl",
51
- token=hf_token
52
- )
53
-
54
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
55
  keras_model = load_model(keras_path)
56
  with open(kmer_path, "rb") as f:
 
 
 
 
57
  except Exception as e:
58
  logging.error(f"Failed to load Keras model from HF Hub: {e}")
59
 
60
-
61
-
62
-
63
-
64
-
65
-
66
-
67
-
68
-
69
-
70
-
71
-
72
-
73
-
74
-
75
-
76
-
77
-
78
-
 
 
 
 
 
 
79
 
80
  # --- Initialize Tree Analyzer ---
81
  analyzer = None
82
  try:
 
83
  if os.path.exists(csv_path):
84
  if analyzer.load_data(csv_path):
85
  logging.info("Tree analyzer initialized successfully")
86
- # Try to train AI model (optional)
87
  try:
88
  if not analyzer.train_ai_model():
89
  logging.warning("AI model training failed; proceeding with basic analysis.")
 
 
 
 
 
 
 
 
 
 
90
  analyzer = None
91
 
92
  # --- Enhanced Tool Detection ---
93
- def check_tool_availability():
94
- """Enhanced check for MAFFT and IQ-TREE availability with multiple fallback options"""
95
-
96
- # Check MAFFT
97
-
98
-
99
-
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
-
108
-
109
 
 
 
 
 
 
 
110
 
111
  mafft_available = False
112
  mafft_cmd = None
113
-
114
- # Try multiple MAFFT locations
115
  mafft_candidates = [
116
  MAFFT_PATH,
117
  'mafft',
118
  '/usr/bin/mafft',
119
  '/usr/local/bin/mafft',
120
- 'mafft.bat', # Windows
121
-
122
-
123
  ]
124
-
125
  for candidate in mafft_candidates:
126
- if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
127
-
128
-
129
-
130
-
131
-
132
-
133
-
134
-
135
  mafft_available = True
136
  mafft_cmd = candidate
137
- logging.info(f"Found MAFFT at: {candidate}")
138
  break
139
-
140
- # Check IQ-TREE
141
  iqtree_available = False
142
  iqtree_cmd = None
143
-
144
- # Try multiple IQ-TREE locations and names
145
  iqtree_candidates = [
146
  IQTREE_PATH,
147
- 'iqtree2',
148
- '/usr/local/bin/iqtree2',
 
 
149
  '/usr/bin/iqtree',
150
  '/usr/local/bin/iqtree',
151
- 'iqtree2.exe', # Windows
152
- 'iqtree.exe', # Windows
153
-
154
  ]
155
-
156
  for candidate in iqtree_candidates:
157
- if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
158
-
159
-
160
-
161
-
162
-
163
-
164
-
165
-
166
  iqtree_available = True
167
  iqtree_cmd = candidate
168
- logging.info(f"Found IQ-TREE at: {candidate}")
169
  break
170
-
171
- return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
172
-
173
- def install_dependencies_guide():
174
- """Provide installation guidance for missing dependencies"""
175
- guide = """
176
- 🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
177
-
178
- For MAFFT:
179
- - Ubuntu/Debian: sudo apt-get install mafft
180
- - CentOS/RHEL: sudo yum install mafft
181
- - macOS: brew install mafft
182
- - Windows: Download from https://mafft.cbrc.jp/alignment/software/
183
-
184
- For IQ-TREE:
185
- - Ubuntu/Debian: sudo apt-get install iqtree
186
- - CentOS/RHEL: sudo yum install iqtree
187
- - macOS: brew install iqtree
188
- - Windows: Download from http://www.iqtree.org/
189
-
190
- Alternative: Use conda/mamba:
191
- - conda install -c bioconda mafft iqtree
192
-
193
- Docker option:
194
- - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
195
- - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
196
-
197
-
198
-
199
-
200
-
201
-
202
-
203
-
204
-
205
-
206
-
207
-
208
-
209
-
210
-
211
-
212
-
213
-
214
-
215
-
216
-
217
-
218
-
219
-
220
-
221
-
222
-
223
-
224
-
225
-
226
-
227
-
228
-
229
-
230
-
231
-
232
-
233
-
234
-
235
-
236
-
237
 
 
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  """
240
- return guide
241
 
242
- def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
243
- """Run MAFFT alignment with enhanced error handling"""
244
  try:
245
- # MAFFT command with more robust options
246
- cmd = [
247
- mafft_cmd,
248
- '--auto', # Automatic strategy selection
249
- '--quiet', # Reduce output verbosity
250
- input_fasta
251
- ]
252
-
253
-
254
-
255
-
256
  logging.info(f"Running MAFFT: {' '.join(cmd)}")
257
-
258
- # Run MAFFT with enhanced error handling
259
- result = subprocess.run(
260
- cmd,
261
- capture_output=True,
262
- text=True,
263
- timeout=600, # Increased timeout to 10 minutes
264
- cwd=os.getcwd() # Ensure working directory is set
265
- )
266
-
267
  if result.returncode == 0:
268
- # Write aligned sequences to output file
269
  with open(output_fasta, 'w') as f:
270
  f.write(result.stdout)
271
  logging.info(f"MAFFT alignment completed: {output_fasta}")
272
-
273
- # Verify output file
274
  if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
275
  return True, output_fasta
276
  else:
 
 
277
  error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
278
  logging.error(f"MAFFT failed: {error_msg}")
279
  return False, f"MAFFT error: {error_msg}"
280
-
281
  except subprocess.TimeoutExpired:
282
  logging.error("MAFFT timeout")
283
  return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
284
-
285
-
286
-
287
  except FileNotFoundError:
288
  return False, f"MAFFT executable not found: {mafft_cmd}"
289
  except Exception as e:
 
 
 
290
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
291
  """Run IQ-TREE with enhanced options and error handling"""
292
  try:
293
- # Enhanced IQ-TREE command
294
- cmd = [
295
- iqtree_cmd,
296
- '-s', aligned_fasta,
297
- '-m', 'MFP', # ModelFinder Plus for automatic model selection
298
- '-bb', '1000', # Bootstrap replicates
299
- '-alrt', '1000', # SH-aLRT test
300
- '-nt', 'AUTO', # Auto detect threads
301
- '--prefix', output_prefix,
302
- '-redo', # Overwrite existing files
303
- '--quiet' # Reduce verbosity
304
- ]
305
-
306
  logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
307
-
308
- # Run IQ-TREE with enhanced error handling
309
- result = subprocess.run(
310
- cmd,
311
- capture_output=True,
312
- text=True,
313
- timeout=1200, # 20 minute timeout for larger datasets
314
- cwd=os.getcwd()
315
- )
316
-
317
  if result.returncode == 0:
318
  tree_file = f"{output_prefix}.treefile"
319
  if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
 
 
 
 
 
 
320
  error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
321
  logging.error(f"IQ-TREE failed: {error_msg}")
322
  return False, f"IQ-TREE error: {error_msg}"
323
-
324
  except subprocess.TimeoutExpired:
325
  logging.error("IQ-TREE timeout")
326
  return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
327
-
328
-
329
-
330
  except FileNotFoundError:
331
  return False, f"IQ-TREE executable not found: {iqtree_cmd}"
332
  except Exception as e:
 
 
 
333
  def create_simple_neighbor_joining_tree(sequences_dict):
334
  """Create a simple distance-based tree when ML tools are not available"""
335
  try:
336
- # This is a simplified implementation
337
- # In a real scenario, you'd want to use a proper NJ implementation
338
  import random
339
-
340
  seq_names = list(sequences_dict.keys())
341
  n_seqs = len(seq_names)
342
-
343
  if n_seqs < 2:
344
  return None, "Need at least 2 sequences for tree construction"
345
-
346
- # Create a simple Newick tree structure
347
  if n_seqs == 2:
348
  tree_str = f"({seq_names[0]}:0.1,{seq_names[1]}:0.1);"
349
  else:
350
- # Simple clustering approach
351
  tree_str = "(" + ",".join([f"{name}:0.1" for name in seq_names[:5]]) + ");"
352
-
353
- # Save to temporary file
354
  tree_file = "simple_tree.nwk"
355
  with open(tree_file, 'w') as f:
356
  f.write(tree_str)
357
-
358
  return tree_file, "Simple distance-based tree created"
359
-
360
  except Exception as e:
361
  return None, f"Simple tree creation failed: {str(e)}"
362
 
363
  def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
364
  """Create a multi-FASTA file with query sequence and reference sequences"""
365
  try:
366
- # Create temporary FASTA file
367
  temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
368
-
369
- # Add query sequence
370
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
371
-
372
- # Add reference sequences from existing aligned FASTA if available
373
  ref_fasta_path = "f_gene_sequences_aligned.fasta"
374
  if os.path.exists(ref_fasta_path):
375
  with open(ref_fasta_path, 'r') as ref_file:
376
  temp_fasta.write(ref_file.read())
377
  logging.info(f"Added reference sequences from {ref_fasta_path}")
378
  else:
379
- # If no reference file, try to create from CSV data
380
  if analyzer and hasattr(analyzer, 'data'):
381
  count = 0
382
  for idx, row in analyzer.data.iterrows():
 
 
383
  sequence = str(row['sequence']).upper()
384
  temp_fasta.write(f">{seq_id}\n{sequence}\n")
385
  count += 1
386
- if count >= 20: # Limit to prevent too large datasets
387
  break
388
  logging.info(f"Added {count} reference sequences from CSV")
389
-
390
  temp_fasta.close()
391
  return temp_fasta.name
392
-
393
  except Exception as e:
394
  logging.error(f"Failed to create multi-FASTA: {e}")
395
  return None
 
396
  def build_maximum_likelihood_tree(f_gene_sequence):
397
  """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
398
  try:
399
- # Check tool availability with enhanced detection
400
- mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
401
-
402
- # Prepare status message
403
  status_msg = "🔍 Checking dependencies...\n"
404
-
405
- if not mafft_available:
406
- status_msg += "❌ MAFFT not found\n"
407
- else:
408
- status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
409
-
410
- if not iqtree_available:
411
- status_msg += "❌ IQ-TREE not found\n"
412
- else:
413
- status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
414
-
415
- # If neither tool is available, provide installation guide
416
- if not mafft_available and not iqtree_available:
417
- guide = install_dependencies_guide()
418
- return False, f"{status_msg}\n{guide}", None, None
419
-
420
- # If only one tool is missing, provide specific guidance
421
- if not mafft_available:
422
- return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None
423
-
424
- if not iqtree_available:
425
- status_msg += "\n⚠️ IQ-TREE not available. Attempting simple tree construction...\n"
426
-
427
- # Try to create a simple tree as fallback
428
- multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
429
- if multi_fasta:
430
- # Read sequences
431
- sequences = {}
432
- current_seq = ""
433
- current_name = ""
434
-
435
- with open(multi_fasta, 'r') as f:
436
- for line in f:
437
- line = line.strip()
438
- if line.startswith('>'):
439
- if current_name and current_seq:
440
- sequences[current_name] = current_seq
441
- current_name = line[1:]
442
- current_seq = ""
443
- else:
444
- current_seq += line
445
- if current_name and current_seq:
446
- sequences[current_name] = current_seq
447
-
448
- simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
449
- os.unlink(multi_fasta)
450
-
451
- if simple_tree:
452
- return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
453
- else:
454
- return False, f"{status_msg}❌ {simple_msg}", None, None
455
- else:
456
- return False, f"{status_msg}❌ Failed to create input sequences", None, None
457
-
458
- # Both tools available - proceed with full ML analysis
459
- # Create output directory
460
  output_dir = "ml_tree_output"
461
  os.makedirs(output_dir, exist_ok=True)
462
-
463
- # Step 1: Create multi-FASTA file with query and reference sequences
464
  logging.info("Creating multi-FASTA file...")
465
  multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
466
  if not multi_fasta:
467
  return False, f"{status_msg}❌ Failed to create input FASTA", None, None
468
-
469
- # Step 2: Run MAFFT alignment
470
  logging.info("Running MAFFT alignment...")
471
  aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
472
- mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
473
-
474
- # Clean up temporary file
475
  os.unlink(multi_fasta)
476
-
477
  if not mafft_success:
478
  return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
479
-
480
- # Step 3: Run IQ-TREE analysis
481
  logging.info("Running IQ-TREE analysis...")
482
  tree_prefix = os.path.join(output_dir, "ml_tree")
483
  iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
484
-
485
  if not iqtree_success:
486
  return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
487
-
488
- # Step 4: Prepare output files
489
  tree_file = iqtree_result
490
  log_file = f"{tree_prefix}.log"
491
-
492
- # Copy to standard names for compatibility
493
  standard_aligned = "f_gene_sequences_aligned.fasta"
494
  standard_tree = "f_gene_sequences.phy.treefile"
495
-
496
  if os.path.exists(aligned_fasta):
497
  shutil.copy2(aligned_fasta, standard_aligned)
498
  if os.path.exists(tree_file):
499
  shutil.copy2(tree_file, standard_tree)
500
-
501
- success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
502
- success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
503
- success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
504
-
505
  if os.path.exists(log_file):
506
  try:
507
  with open(log_file, 'r') as f:
508
  log_content = f.read()
509
- # Extract model information
510
  if "Best-fit model:" in log_content:
511
  model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
512
  if model_lines:
513
  success_msg += f"- {model_lines[0].strip()}\n"
514
  except Exception as e:
515
  logging.warning(f"Could not read log file: {e}")
516
-
517
  logging.info("Maximum likelihood tree construction completed")
518
  return True, success_msg, aligned_fasta, tree_file
519
-
520
  except Exception as e:
521
  logging.error(f"ML tree construction failed: {e}")
522
  return False, f"ML tree construction failed: {str(e)}", None, None
523
 
524
- # --- Tree Analysis Function (Based on old Gradio API) ---
525
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
526
- """
527
- Analyze sequence and create phylogenetic tree using the working Gradio API pattern
528
- """
529
  try:
530
  if not analyzer:
531
  return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
532
-
533
  if not sequence:
534
  return "Error: Please provide a sequence."
535
-
536
  if not (1 <= matching_percentage <= 99):
537
  return "Error: Matching percentage must be between 1 and 99."
538
-
539
- # Find query sequence
540
  if not analyzer.find_query_sequence(sequence):
541
  return "Error: Invalid query sequence or sequence not found in dataset."
542
-
543
- # Set matching percentage
544
  analyzer.matching_percentage = matching_percentage
545
-
546
- # Find similar sequences
547
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
548
-
549
  if not matched_ids:
550
  return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
551
-
552
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
553
-
554
- # Build tree structure
555
  tree_structure = analyzer.build_tree_structure(matched_ids)
556
  if not tree_structure:
557
  return "Error: Failed to build tree structure."
558
-
559
- # Create interactive tree
560
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
561
  if not fig:
562
  return "Error: Failed to create tree visualization."
563
-
564
- # Generate HTML content
565
  html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
566
-
567
- # Save to output folder
568
  output_dir = "output"
569
  os.makedirs(output_dir, exist_ok=True)
570
-
571
- # Create a safe filename
572
  safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
573
  html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
574
-
575
  with open(html_filename, "w", encoding='utf-8') as f:
576
  f.write(html_content)
577
-
578
  logging.info(f"Tree HTML saved to {html_filename}")
579
-
580
  return html_content
581
-
582
  except Exception as e:
583
  error_msg = f"Tree analysis error: {str(e)}"
584
  logging.error(error_msg)
 
585
  logging.error(f"Full traceback: {traceback.format_exc()}")
586
  return error_msg
587
-
588
- # --- Keras Prediction ---
589
- def predict_with_keras(sequence):
590
-
591
- try:
592
- if not keras_model or not kmer_to_index:
593
- return f"Keras model not available. Input sequence: {sequence[:100]}..."
594
-
595
- if len(sequence) < 6:
596
- return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
597
-
598
- # Generate k-mers
599
- kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
600
- indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
601
-
602
- # Prepare input
603
- input_arr = np.array([indices])
604
- prediction = keras_model.predict(input_arr, verbose=0)[0]
605
-
606
- # Format prediction as probabilities/scores (not a sequence)
607
- result = ''.join([str(round(p, 3)) for p in prediction])
608
- return result
609
- except Exception as e:
610
- logging.error(f"Keras prediction failed: {e}")
611
- return f"Keras prediction failed: {str(e)}"
612
-
613
- # --- FASTA Reader ---
614
-
615
-
616
-
617
-
618
-
619
-
620
-
621
-
622
-
623
-
624
-
625
-
626
-
627
-
628
-
629
-
630
-
631
-
632
-
633
-
634
-
635
-
636
-
637
-
638
-
639
-
640
-
641
-
642
-
643
-
644
-
645
-
646
-
647
-
648
-
649
-
650
-
651
-
652
-
653
-
654
-
655
-
656
-
657
-
658
-
659
-
660
-
661
-
662
-
663
-
664
-
665
-
666
 
667
  def read_fasta_file(file_obj):
668
-
669
  try:
670
  if file_obj is None:
671
  return ""
672
-
673
- # Handle file object
674
  if hasattr(file_obj, 'name'):
675
  with open(file_obj.name, "r") as f:
676
  content = f.read()
677
  else:
678
  content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
679
-
680
  lines = content.strip().split("\n")
681
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
682
  return ''.join(seq_lines)
 
683
  logging.error(f"Failed to read FASTA file: {e}")
684
  return ""
685
 
686
- # --- Full Pipeline ---
687
  def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
688
-
689
  try:
690
  dna_input = read_fasta_file(fasta_file_obj)
691
  if not dna_input:
692
- return "Failed to read FASTA file", "", "", "", "", None, None, None, "No input sequence"
693
- return run_pipeline(dna_input, similarity_score, build_ml_tree)
 
 
 
 
694
  except Exception as e:
695
  error_msg = f"Pipeline error: {str(e)}"
696
  logging.error(error_msg)
697
- return error_msg, "", "", "", "", None, None, None, error_msg
698
-
699
- def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
700
 
 
 
 
 
701
  try:
702
- # Clean input
703
  dna_input = dna_input.upper().strip()
704
  if not dna_input:
705
- return "Empty input", "", "", "", "", None, None, None, "No input provided"
706
 
707
- # Sanitize DNA sequence
708
  if not re.match('^[ACTGN]+$', dna_input):
709
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
710
  logging.info("DNA sequence sanitized")
711
-
712
- # Step 1: Boundary Prediction - Extract F gene sequence
713
- processed_sequence = dna_input # This will be the sequence used for downstream analysis
714
- boundary_output = ""
715
 
 
 
 
716
  if boundary_model:
717
  try:
718
  predictions, probs, confidence = boundary_model.predict(dna_input)
719
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
720
  if regions:
721
- processed_sequence = regions[0]["sequence"] # Use the extracted gene region
722
- boundary_output = processed_sequence # Output the actual F gene sequence
723
  logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
724
  else:
725
  boundary_output = f"No F gene regions found in input sequence"
726
- processed_sequence = dna_input
727
  logging.warning("No gene regions found, using full sequence")
728
- logging.info("Boundary model prediction completed")
729
  except Exception as e:
730
  logging.error(f"Boundary model failed: {e}")
731
  boundary_output = f"Boundary model error: {str(e)}"
732
- processed_sequence = dna_input # Fall back to original sequence
733
  else:
734
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
735
- processed_sequence = dna_input
736
 
737
- # Step 2: Keras Prediction (F gene validation)
738
- keras_output = ""
739
- if processed_sequence and len(processed_sequence) >= 6:
740
- keras_prediction = predict_with_keras(processed_sequence)
741
- # Interpret keras prediction as F gene validation
742
- if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
743
- # You might want to add logic here to interpret the prediction scores
744
- # For now, just show the prediction
745
- keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
746
- else:
747
- keras_output = keras_prediction
748
  else:
749
- keras_output = "Skipped: sequence too short for F gene validation"
750
-
751
- # Step 3: Maximum Likelihood Tree (MAFFT + IQ-TREE)
752
-
753
-
754
-
755
-
756
-
757
 
 
 
 
 
 
 
 
 
758
 
 
759
  aligned_file = None
760
  phy_file = None
761
  ml_tree_output = ""
762
-
763
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
764
  try:
765
  logging.info("Starting maximum likelihood tree construction...")
766
  ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
767
-
768
  if ml_success:
769
  ml_tree_output = ml_message
770
  aligned_file = ml_aligned
771
  phy_file = ml_tree
772
  else:
773
- ml_tree_output = ml_message # This now includes detailed error information
774
-
775
  except Exception as e:
776
  ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
777
  logging.error(f"ML Tree failed: {e}")
 
 
778
  else:
779
  ml_tree_output = "ML tree construction skipped (not requested)"
780
 
781
- # Step 4: ML Simplified Tree (using the existing approach)
782
  html_file = None
783
  tree_html_content = "No tree generated"
784
  simplified_ml_output = ""
785
-
786
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
787
  try:
788
  logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
789
-
790
- # Use the existing tree analysis function with user-specified similarity
791
  tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
792
-
793
  if tree_result and not tree_result.startswith("Error:"):
794
- # Success - we have HTML content
795
  tree_html_content = tree_result
796
  simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
797
-
798
- # Check if HTML file was created
799
  output_dir = "output"
800
  if os.path.exists(output_dir):
801
  html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
802
  if html_files:
803
- html_file = os.path.join(output_dir, html_files[-1]) # Get the latest
804
  simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
805
-
806
- # Count sequences analyzed
807
  if analyzer.find_query_sequence(processed_sequence):
808
  matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
809
  simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
 
810
  else:
811
  simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
812
  tree_html_content = f"<p>Error: {tree_result}</p>"
813
-
814
  except Exception as e:
815
  logging.error(f"Simplified ML tree analysis failed: {e}")
816
  simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
 
 
 
 
 
 
817
 
818
  # Return all results
819
  return (
820
- boundary_output, # F gene extraction result
821
- keras_output, # F gene validation result
822
- ml_tree_output, # ML tree construction status
823
- simplified_ml_output, # Simplified tree analysis status
824
- tree_html_content, # HTML content for tree display
825
- aligned_file, # Path to aligned FASTA file
826
- phy_file, # Path to phylogenetic tree file
827
- html_file, # Path to HTML tree file
828
- f"Pipeline completed. F gene length: {len(processed_sequence)} bp" # Summary
829
-
830
-
831
-
832
-
833
  )
834
-
835
  except Exception as e:
836
  error_msg = f"Pipeline execution failed: {str(e)}"
837
  logging.error(error_msg)
838
  import traceback
839
  logging.error(f"Full traceback: {traceback.format_exc()}")
840
  return (
841
- error_msg, "", "", "", f"<p>Error: {error_msg}</p>",
842
  None, None, None, error_msg
843
  )
844
 
845
  # --- Gradio Interface ---
846
  def create_interface():
847
  """Create the Gradio interface with enhanced layout and features"""
848
-
849
- # Custom CSS for better styling
850
  custom_css = """
851
- .gradio-container {
852
- max-width: 1200px !important;
853
- }
854
- .tab-nav button {
855
- font-size: 16px !important;
856
- }
857
- .output-html {
858
- height: 600px !important;
859
- overflow: auto;
860
- }
861
  """
862
-
863
  with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
864
  gr.Markdown("""
865
  # 🧬 F Gene Analysis Pipeline
866
 
867
- This tool provides comprehensive analysis of F genes including:
868
- - **Gene Boundary Detection**: Extract F gene sequences from larger genomic sequences
869
- - **Gene Validation**: Validate extracted sequences using machine learning
870
- - **Phylogenetic Analysis**: Build maximum likelihood trees and simplified phylogenetic trees
871
-
872
 
873
  **Instructions:**
874
- 1. Enter your sequence directly or upload a FASTA file
875
- 2. Adjust similarity threshold for phylogenetic analysis (1-99%)
876
- 3. Choose whether to build maximum likelihood trees (requires MAFFT & IQ-TREE)
877
- 4. Click "Run Analysis" to start the pipeline
878
  """)
879
 
880
  with gr.Tab("🔬 Analysis Pipeline"):
881
  with gr.Row():
882
  with gr.Column(scale=2):
883
- # Input section
884
  gr.Markdown("### Input Sequence")
885
- dna_input = gr.Textbox(
886
- label="DNA Sequence",
887
- placeholder="Enter your DNA sequence here (ATCG format)...",
888
- lines=5,
889
- max_lines=10
890
- )
891
-
892
- fasta_file = gr.File(
893
- label="Or Upload FASTA File",
894
- file_types=[".fasta", ".fa", ".fas", ".txt"]
895
- )
896
-
897
  with gr.Row():
898
- similarity_score = gr.Slider(
899
- minimum=1,
900
- maximum=99,
901
- value=95.0,
902
- step=1.0,
903
- label="Similarity Threshold (%)",
904
- info="Minimum similarity for phylogenetic analysis"
905
- )
906
-
907
- build_ml_tree = gr.Checkbox(
908
- label="Build ML Tree",
909
- value=False,
910
- info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)"
911
- )
912
-
913
- # Action buttons
914
  with gr.Row():
915
  run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
916
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
917
-
918
  with gr.Column(scale=1):
919
- # Status and info
920
  gr.Markdown("### Analysis Status")
921
- status_display = gr.Textbox(
922
- label="Status",
923
- value="Ready to analyze",
924
- interactive=False,
925
- lines=3
926
- )
927
-
928
- # Model status
929
  gr.Markdown("### Available Models")
930
  model_status = []
931
  if boundary_model:
932
  model_status.append("✅ Boundary Detection Model")
933
  else:
934
  model_status.append("❌ Boundary Detection Model")
935
-
936
  if keras_model:
937
  model_status.append("✅ Gene Validation Model")
938
  else:
939
  model_status.append("❌ Gene Validation Model")
940
-
941
-
942
-
943
-
944
  if analyzer:
945
  model_status.append("✅ Tree Analysis Module")
946
  else:
947
  model_status.append("❌ Tree Analysis Module")
948
-
949
  gr.Markdown("\n".join(model_status))
950
 
951
  with gr.Tab("📊 Results"):
952
  with gr.Row():
953
  with gr.Column():
954
- # Text outputs
955
- boundary_output = gr.Textbox(
956
- label="🎯 F Gene Extraction",
957
- lines=5,
958
- interactive=False
959
- )
960
-
961
- keras_output = gr.Textbox(
962
- label="🔍 Gene Validation",
963
- lines=3,
964
- interactive=False
965
- )
966
-
967
  with gr.Column():
968
- ml_tree_output = gr.Textbox(
969
- label="🌳 Maximum Likelihood Tree",
970
- lines=5,
971
- interactive=False
972
- )
973
-
974
- simplified_ml_output = gr.Textbox(
975
- label="📈 Simplified Phylogenetic Analysis",
976
- lines=3,
977
- interactive=False
978
- )
979
-
980
- # Tree visualization
981
  gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
982
- tree_html = gr.HTML(
983
- label="Interactive Tree",
984
- value="<p>No tree generated yet. Run analysis to see results.</p>"
985
- )
986
-
987
- # File downloads
988
  gr.Markdown("### 📁 Download Results")
989
  with gr.Row():
990
- aligned_file = gr.File(
991
- label="Aligned Sequences (FASTA)",
992
- interactive=False
993
- )
994
-
995
- phy_file = gr.File(
996
- label="Phylogenetic Tree File",
997
- interactive=False
998
- )
999
-
1000
- html_file = gr.File(
1001
- label="Interactive Tree (HTML)",
1002
- interactive=False
1003
- )
1004
 
1005
  with gr.Tab("ℹ️ Help & Info"):
1006
  gr.Markdown("""
1007
  ## About This Tool
1008
 
1009
  ### F Gene Analysis Pipeline
1010
- This comprehensive pipeline analyzes F genes through multiple computational approaches:
1011
-
1012
- #### 🎯 Gene Boundary Detection
1013
- - Uses deep learning to identify and extract F gene sequences from larger genomic sequences
1014
- - Provides confidence scores for detected boundaries
1015
- - Automatically trims sequences to focus on the F gene region
1016
-
1017
- #### 🔍 Gene Validation
1018
- - Employs k-mer based machine learning models to validate extracted sequences
1019
- - Provides probability scores indicating likelihood of being a genuine F gene
1020
- - Uses 6-mer frequency patterns for classification
1021
-
1022
- #### 🌳 Phylogenetic Analysis
1023
-
1024
- **Maximum Likelihood Trees:**
1025
- - Requires MAFFT (sequence alignment) and IQ-TREE (phylogenetic reconstruction)
1026
- - Performs model selection and bootstrap analysis
1027
- - Generates publication-quality phylogenetic trees
1028
- - Provides detailed evolutionary analysis
1029
-
1030
- **Simplified Trees:**
1031
- - Uses built-in algorithms for quick phylogenetic analysis
1032
- - Interactive visualization with similarity-based clustering
1033
- - Faster alternative when external tools are not available
1034
 
1035
  ### Input Requirements
1036
- - **DNA Sequences**: ATCG format, minimum 50 bp for meaningful analysis
1037
- - **FASTA Files**: Standard FASTA format with single or multiple sequences
1038
- - **Similarity Threshold**: 1-99% for controlling phylogenetic analysis sensitivity
1039
 
1040
  ### Dependencies
1041
-
1042
- **Required for ML Trees:**
1043
  ```bash
1044
- # Ubuntu/Debian
1045
- sudo apt-get install mafft iqtree
1046
-
1047
- # macOS
1048
- brew install mafft iqtree
1049
-
1050
- # Conda
1051
- conda install -c bioconda mafft iqtree
1052
  ```
1053
 
1054
- ### Output Files
1055
- - **Aligned FASTA**: Multiple sequence alignment in FASTA format
1056
- - **Tree File**: Newick format phylogenetic tree
1057
- - **HTML Tree**: Interactive visualization for web browsers
1058
-
1059
  ### Troubleshooting
1060
-
1061
- **Common Issues:**
1062
- - *"No similar sequences found"*: Lower the similarity threshold
1063
- - *"Sequence too short"*: Provide sequences longer than 50 bp
1064
- - *"MAFFT/IQ-TREE not found"*: Install required dependencies
1065
- - *"Model not available"*: Check model files are properly downloaded
1066
-
1067
- **Performance Tips:**
1068
- - Use sequences between 100-2000 bp for optimal performance
1069
- - Limit to <50 sequences for faster tree construction
1070
- - Lower similarity thresholds find more distant relatives
1071
- - Higher thresholds focus on closely related sequences
1072
-
1073
- ### Citation
1074
- If you use this tool in your research, please cite the appropriate methods and tools used.
1075
  """)
1076
 
1077
- # Event handlers
1078
- def run_analysis_text(dna_seq, sim_score, build_tree):
1079
- return run_pipeline(dna_seq, sim_score, build_tree)
1080
-
1081
- def run_analysis_file(file_obj, sim_score, build_tree):
1082
- return run_pipeline_from_file(file_obj, sim_score, build_tree)
1083
-
1084
  def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
1085
- # Priority: file upload over text input
1086
  if file_obj is not None:
1087
  return run_pipeline_from_file(file_obj, sim_score, build_tree)
1088
  else:
 
 
 
 
 
 
1089
  def clear_inputs():
1090
  return "", None, 95.0, False, "Ready to analyze"
1091
 
1092
- # Connect events
1093
  run_btn.click(
1094
  fn=run_analysis_combined,
1095
  inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
1096
  outputs=[
1097
- boundary_output, keras_output, ml_tree_output,
1098
- simplified_ml_output, tree_html, aligned_file,
1099
- phy_file, html_file, status_display
1100
  ]
1101
  )
1102
-
1103
  clear_btn.click(
1104
  fn=clear_inputs,
1105
  outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
1106
  )
1107
 
1108
- # Example data loading
1109
- gr.Markdown("### 🧪 Example Data")
1110
  example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
1111
-
1112
  def load_example():
1113
  example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
1114
  return example_seq, "Example F gene sequence loaded"
1115
-
1116
- example_btn.click(
1117
- fn=load_example,
1118
- outputs=[dna_input, status_display]
1119
- )
1120
 
1121
  return iface
1122
 
1123
  # --- Main Execution ---
1124
  if __name__ == "__main__":
1125
- # Initialize and launch interface
1126
  interface = create_interface()
1127
-
1128
- # Launch with enhanced configuration
1129
  interface.launch(
1130
- server_name="0.0.0.0", # Allow external connections
1131
- server_port=7860, # Default Gradio port
1132
- share=False, # Set to True for public sharing
1133
- debug=True, # Enable debug mode
1134
- show_error=True, # Show detailed errors
1135
- max_threads=4, # Limit concurrent threads
1136
- auth=None, # Add authentication if needed: ("username", "password")
1137
- ssl_verify=False, # For development environments
1138
- quiet=False # Show startup messages
1139
  )
 
1
+ # app.py
2
+ import gradio as gr
3
+ import torch
4
+ import pickle
5
+ import subprocess
6
+ import pandas as pd
7
+ import os
8
+ import re
9
+ import logging
10
+ import numpy as np
11
+ from predictor import GenePredictor, preprocess_sequence_for_ndv_f_gene, enhanced_keras_prediction, enhanced_classify_sequence, validate_ndv_f_gene_sequence
12
+ from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
14
  import tempfile
15
  import shutil
16
+ import stat
17
  from pathlib import Path
18
+ from huggingface_hub import hf_hub_download
19
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
20
 
21
  # --- Global Variables ---
22
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
23
+ IQTREE_PATH = "iqtree/bin/iqtree3" # Updated to match uploaded iqtree3 files
24
+
25
+ # --- Logging ---
26
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
27
 
28
  # --- Paths ---
 
 
 
29
  model_repo = "GGproject10/best_boundary_aware_model"
30
  csv_path = "f cleaned.csv"
31
+ classifier_model_dir = "model" # Directory for second model files
32
 
33
  # Get HF token from environment (if available)
34
  hf_token = os.getenv("HF_TOKEN")
35
+
36
+ # --- Load Models ---
37
  boundary_model = None
38
  keras_model = None
39
  kmer_to_index = None
40
+ classifier_model = None
41
+ classifier_kmer_to_index = None
42
+ classifier_maxlen = None
43
+ LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
44
 
45
  # Try to load boundary model from Hugging Face Hub
46
  try:
47
+ boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth", token=hf_token)
 
 
 
 
48
  if os.path.exists(boundary_path):
49
  boundary_model = GenePredictor(boundary_path)
50
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
51
+ else:
52
+ logging.warning(f"Boundary model file not found after download")
53
+ except Exception as e:
54
+ logging.error(f"Failed to load boundary model from HF Hub: {e}")
55
 
56
  # Try to load Keras model from Hugging Face Hub
57
  try:
58
+ keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras", token=hf_token)
59
+ kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl", token=hf_token)
 
 
 
 
 
 
 
 
 
60
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
61
  keras_model = load_model(keras_path)
62
  with open(kmer_path, "rb") as f:
63
+ kmer_to_index = pickle.load(f)
64
+ logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
65
+ else:
66
+ logging.warning(f"Keras model or kmer files not found after download")
67
  except Exception as e:
68
  logging.error(f"Failed to load Keras model from HF Hub: {e}")
69
 
70
+ # Try to load classifier model (second model)
71
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
72
+ try:
73
+ classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
74
+ classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
75
+ classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
76
+ missing_files = []
77
+ if not os.path.exists(classifier_path):
78
+ missing_files.append("best_model.keras")
79
+ if not os.path.exists(classifier_kmer_path):
80
+ missing_files.append("kmer_to_index.pkl")
81
+ if not os.path.exists(classifier_maxlen_path):
82
+ missing_files.append("maxlen.txt")
83
+ if missing_files:
84
+ logging.warning(f"Classifier model files not found: {', '.join(missing_files)}")
85
+ else:
86
+ classifier_model = load_model(classifier_path)
87
+ with open(classifier_kmer_path, "rb") as f:
88
+ classifier_kmer_to_index = pickle.load(f)
89
+ with open(classifier_maxlen_path, "r") as f:
90
+ classifier_maxlen = int(f.read().strip())
91
+ logging.info("Classifier model loaded successfully.")
92
+ except Exception as e:
93
+ logging.error(f"Failed to load classifier model: {e}")
94
+ logging.warning("Falling back to existing Keras model for validation.")
95
 
96
  # --- Initialize Tree Analyzer ---
97
  analyzer = None
98
  try:
99
+ analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
100
  if os.path.exists(csv_path):
101
  if analyzer.load_data(csv_path):
102
  logging.info("Tree analyzer initialized successfully")
 
103
  try:
104
  if not analyzer.train_ai_model():
105
  logging.warning("AI model training failed; proceeding with basic analysis.")
106
+ except Exception as e:
107
+ logging.warning(f"AI model training failed: {e}")
108
+ else:
109
+ logging.error("Failed to load CSV data for tree analyzer")
110
+ analyzer = None
111
+ else:
112
+ logging.error(f"CSV file not found: {csv_path}")
113
+ analyzer = None
114
+ except Exception as e:
115
+ logging.error(f"Failed to initialize tree analyzer: {e}")
116
  analyzer = None
117
 
118
  # --- Enhanced Tool Detection ---
119
def check_and_fix_executable_permissions(filepath):
    """Make sure ``filepath`` exists and is executable, adding execute bits if needed.

    Args:
        filepath: Path of the file to check.

    Returns:
        True when the file exists and is (or has just been made) executable;
        False when it does not exist or its permissions could not be changed.
    """
    try:
        if not os.path.exists(filepath):
            return False
        if not os.access(filepath, os.X_OK):
            logging.info(f"File {filepath} is not executable, attempting to fix permissions...")
            current_permissions = os.stat(filepath).st_mode
            # stat.S_IEXEC is a synonym of S_IXUSR, so user + group bits cover
            # everything the previous three-flag mask set.
            os.chmod(filepath, current_permissions | stat.S_IXUSR | stat.S_IXGRP)
            logging.info(f"Fixed permissions for {filepath}")
        return True
    except Exception as e:
        # Best effort: callers treat any failure as "tool not usable".
        logging.error(f"Failed to fix permissions for {filepath}: {e}")
        return False
 
134
 
135
def _find_executable(candidates, tool_label):
    """Return the first candidate that is a usable executable, or None."""
    for candidate in candidates:
        if not candidate:
            continue
        # isfile() rather than exists(): the bundled tools live under
        # directories named "mafft" and "iqtree", and a directory passes both
        # exists() and os.access(..., X_OK), so it used to be reported as the
        # executable itself.
        if os.path.isfile(candidate):
            # Only try to chmod files shipped with the app, never system paths.
            if "/" in candidate and not candidate.startswith(("/usr/", "/opt/")):
                check_and_fix_executable_permissions(candidate)
            if os.access(candidate, os.X_OK) or shutil.which(candidate) is not None:
                logging.info(f"Found {tool_label} at: {candidate}")
                return candidate
        elif shutil.which(candidate) is not None:
            logging.info(f"Found {tool_label} in PATH: {candidate}")
            return candidate
    return None


def enhanced_check_tool_availability():
    """Locate the MAFFT and IQ-TREE executables.

    Candidates are tried in order: the configured bundled path first, then
    bare command names resolved through PATH, then common install locations.
    As a side effect the ``MAFFT_BINARIES`` environment variable is removed
    from this process (the original author's note says this resolves a MAFFT
    version-check issue).

    Returns:
        Tuple ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)``;
        each ``*_cmd`` is the matching candidate string, or None if not found.
    """
    if os.environ.pop('MAFFT_BINARIES', None) is not None:
        logging.info("Unset MAFFT_BINARIES environment variable to resolve version check issue.")

    mafft_cmd = _find_executable(
        [
            MAFFT_PATH,
            'mafft',
            '/usr/bin/mafft',
            '/usr/local/bin/mafft',
            '/opt/homebrew/bin/mafft',
            '/usr/local/homebrew/bin/mafft',
            'mafft.bat',
        ],
        "MAFFT",
    )
    iqtree_cmd = _find_executable(
        [
            IQTREE_PATH,
            'iqtree3',
            'iqtree',
            '/usr/bin/iqtree3',
            '/usr/local/bin/iqtree3',
            '/usr/bin/iqtree',
            '/usr/local/bin/iqtree',
            '/opt/homebrew/bin/iqtree3',
            'iqtree3.exe',
        ],
        "IQ-TREE",
    )
    return mafft_cmd is not None, iqtree_cmd is not None, mafft_cmd, iqtree_cmd
197
 
198
def get_installation_instructions():
    """Return MAFFT / IQ-TREE installation instructions for the current OS.

    On Linux the distribution is guessed from ``/etc/os-release``; when that
    file cannot be read, or names no known distribution, the generic
    instructions are returned.

    Returns:
        A multi-line, human-readable instruction string.
    """
    import platform

    generic = (
        "\n"
        "📦 GENERAL INSTALLATION INSTRUCTIONS:\n"
        "Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree\n"
        "Manual: 1. MAFFT: https://mafft.cbrc.jp/alignment/software/\n"
        "2. IQ-TREE: http://www.iqtree.org/\n"
    )

    system = platform.system().lower()
    if system == "linux":
        try:
            # errors="replace" keeps an oddly encoded file from aborting detection.
            with open('/etc/os-release', 'r', encoding='utf-8', errors='replace') as f:
                os_info = f.read().lower()
        except OSError:
            # Missing or unreadable os-release: the distribution is unknown.
            return generic
        if 'ubuntu' in os_info or 'debian' in os_info:
            return (
                "\n"
                "📦 INSTALLATION INSTRUCTIONS (Ubuntu/Debian):\n"
                "1. Update package list: sudo apt-get update\n"
                "2. Install MAFFT and IQ-TREE: sudo apt-get install mafft iqtree\n"
                "3. Verify installation: mafft --version, iqtree3 --version\n"
                "Alternative using Conda: conda install -c bioconda mafft iqtree\n"
            )
        if 'centos' in os_info or 'rhel' in os_info or 'fedora' in os_info:
            return (
                "\n"
                "📦 INSTALLATION INSTRUCTIONS (CentOS/RHEL/Fedora):\n"
                "1. Install EPEL repository (CentOS/RHEL): sudo yum install epel-release\n"
                "2. Install packages: sudo yum install mafft iqtree\n"
                "3. Verify installation: mafft --version, iqtree3 --version\n"
            )
        return generic
    if system == "darwin":
        return (
            "\n"
            "📦 INSTALLATION INSTRUCTIONS (macOS):\n"
            'Using Homebrew: 1. Install Homebrew: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"\n'
            "2. Install MAFFT and IQ-TREE: brew install mafft iqtree\n"
            "3. Verify installation: mafft --version, iqtree3 --version\n"
            "Using Conda: conda install -c bioconda mafft iqtree\n"
        )
    if system == "windows":
        return (
            "\n"
            "📦 INSTALLATION INSTRUCTIONS (Windows):\n"
            "Option 1 - Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree\n"
            "Option 2 - Manual: 1. Download MAFFT: https://mafft.cbrc.jp/alignment/software/\n"
            "2. Download IQ-TREE: http://www.iqtree.org/\n"
            "3. Add to PATH\n"
        )
    return generic
 
245
 
246
def run_mafft_alignment_improved(input_fasta, output_fasta, mafft_cmd):
    """Align ``input_fasta`` with MAFFT and write the alignment to ``output_fasta``.

    Args:
        input_fasta: Path of the unaligned multi-FASTA file.
        output_fasta: Path that receives MAFFT's standard output.
        mafft_cmd: MAFFT executable, given either as a path or as a bare
            command name that is looked up on PATH.

    Returns:
        ``(True, output_fasta)`` on success, otherwise ``(False, message)``.
        The function does not raise; every failure is reported in the tuple.
    """
    try:
        # Resolve through PATH first. os.access() on a bare name such as
        # "mafft" checks the current directory and would reject a tool that
        # tool discovery legitimately found on PATH.
        executable = shutil.which(mafft_cmd)
        if executable is None:
            logging.warning(f"MAFFT executable {mafft_cmd} is not executable")
            if not check_and_fix_executable_permissions(mafft_cmd):
                return False, f"Cannot make {mafft_cmd} executable"
            executable = mafft_cmd

        # MAFFT is run without MAFFT_BINARIES in its environment.
        env = {k: v for k, v in os.environ.items() if k != 'MAFFT_BINARIES'}

        try:
            test_result = subprocess.run(
                [executable, '--version'],
                capture_output=True, text=True, timeout=10, env=env,
            )
            if test_result.returncode != 0:
                return False, f"MAFFT version check failed: {test_result.stderr}"
        except Exception as e:
            return False, f"MAFFT version check failed: {str(e)}"

        cmd = [executable, '--auto', '--quiet', '--thread', '2', input_fasta]
        logging.info(f"Running MAFFT: {' '.join(cmd)}")
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd(), env=env,
        )
        if result.returncode != 0:
            error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
            logging.error(f"MAFFT failed: {error_msg}")
            return False, f"MAFFT error: {error_msg}"

        # MAFFT prints the alignment on stdout; persist it ourselves.
        with open(output_fasta, 'w') as f:
            f.write(result.stdout)
        logging.info(f"MAFFT alignment completed: {output_fasta}")
        if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
            return True, output_fasta
        return False, "MAFFT completed but output file is empty"
    except subprocess.TimeoutExpired:
        logging.error("MAFFT timeout")
        return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
    except PermissionError as e:
        logging.error(f"Permission error running MAFFT: {e}")
        return False, f"Permission denied: {mafft_cmd}. Please check file permissions."
    except FileNotFoundError:
        return False, f"MAFFT executable not found: {mafft_cmd}"
    except Exception as e:
        logging.error(f"MAFFT execution failed: {e}")
        return False, f"MAFFT execution failed: {str(e)}"
285
+
286
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
287
  """Run IQ-TREE with enhanced options and error handling"""
288
  try:
289
+ if not os.access(iqtree_cmd, os.X_OK):
290
+ logging.warning(f"IQ-TREE executable {iqtree_cmd} is not executable")
291
+ if not check_and_fix_executable_permissions(iqtree_cmd):
292
+ return False, f"Cannot make {iqtree_cmd} executable"
293
+ try:
294
+ test_result = subprocess.run([iqtree_cmd, '--version'], capture_output=True, text=True, timeout=10)
295
+ if test_result.returncode != 0:
296
+ return False, f"IQ-TREE version check failed: {test_result.stderr}"
297
+ except Exception as e:
298
+ return False, f"IQ-TREE version check failed: {str(e)}"
299
+ cmd = [iqtree_cmd, '-s', aligned_fasta, '-m', 'MFP', '-bb', '1000', '-alrt', '1000', '-nt', 'AUTO', '--prefix', output_prefix, '-redo', '--quiet']
 
 
300
  logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
301
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200, cwd=os.getcwd())
 
 
 
 
 
 
 
 
 
302
  if result.returncode == 0:
303
  tree_file = f"{output_prefix}.treefile"
304
  if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
305
+ logging.info(f"IQ-TREE analysis completed: {tree_file}")
306
+ return True, tree_file
307
+ else:
308
+ logging.error("IQ-TREE completed but tree file not found or empty")
309
+ return False, "Tree file not generated or empty"
310
+ else:
311
  error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
312
  logging.error(f"IQ-TREE failed: {error_msg}")
313
  return False, f"IQ-TREE error: {error_msg}"
 
314
  except subprocess.TimeoutExpired:
315
  logging.error("IQ-TREE timeout")
316
  return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
317
+ except PermissionError as e:
318
+ logging.error(f"Permission error running IQ-TREE: {e}")
319
+ return False, f"Permission denied: {iqtree_cmd}. Please check file permissions."
320
  except FileNotFoundError:
321
  return False, f"IQ-TREE executable not found: {iqtree_cmd}"
322
  except Exception as e:
323
+ logging.error(f"IQ-TREE execution failed: {e}")
324
+ return False, f"IQ-TREE execution failed: {str(e)}"
325
+
326
  def create_simple_neighbor_joining_tree(sequences_dict):
327
  """Create a simple distance-based tree when ML tools are not available"""
328
  try:
 
 
329
  import random
 
330
  seq_names = list(sequences_dict.keys())
331
  n_seqs = len(seq_names)
 
332
  if n_seqs < 2:
333
  return None, "Need at least 2 sequences for tree construction"
 
 
334
  if n_seqs == 2:
335
  tree_str = f"({seq_names[0]}:0.1,{seq_names[1]}:0.1);"
336
  else:
 
337
  tree_str = "(" + ",".join([f"{name}:0.1" for name in seq_names[:5]]) + ");"
 
 
338
  tree_file = "simple_tree.nwk"
339
  with open(tree_file, 'w') as f:
340
  f.write(tree_str)
 
341
  return tree_file, "Simple distance-based tree created"
 
342
  except Exception as e:
343
  return None, f"Simple tree creation failed: {str(e)}"
344
 
345
  def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
346
  """Create a multi-FASTA file with query sequence and reference sequences"""
347
  try:
 
348
  temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
 
 
349
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
 
 
350
  ref_fasta_path = "f_gene_sequences_aligned.fasta"
351
  if os.path.exists(ref_fasta_path):
352
  with open(ref_fasta_path, 'r') as ref_file:
353
  temp_fasta.write(ref_file.read())
354
  logging.info(f"Added reference sequences from {ref_fasta_path}")
355
  else:
 
356
  if analyzer and hasattr(analyzer, 'data'):
357
  count = 0
358
  for idx, row in analyzer.data.iterrows():
359
+ if 'sequence' in row and len(str(row['sequence'])) > 50:
360
+ seq_id = row.get('id', f"Ref_{count}")
361
  sequence = str(row['sequence']).upper()
362
  temp_fasta.write(f">{seq_id}\n{sequence}\n")
363
  count += 1
364
+ if count >= 20:
365
  break
366
  logging.info(f"Added {count} reference sequences from CSV")
 
367
  temp_fasta.close()
368
  return temp_fasta.name
 
369
  except Exception as e:
370
  logging.error(f"Failed to create multi-FASTA: {e}")
371
  return None
372
+
373
  def build_maximum_likelihood_tree(f_gene_sequence):
374
  """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
375
  try:
376
+ mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = enhanced_check_tool_availability()
 
 
 
377
  status_msg = "🔍 Checking dependencies...\n"
378
+ status_msg += f"✅ MAFFT found: {mafft_cmd}\n" if mafft_available else "❌ MAFFT not found\n"
379
+ status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n" if iqtree_available else "❌ IQ-TREE not found\n"
380
+ if not mafft_available or not iqtree_available:
381
+ instructions = get_installation_instructions()
382
+ return False, f"{status_msg}\n{instructions}", None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  output_dir = "ml_tree_output"
384
  os.makedirs(output_dir, exist_ok=True)
 
 
385
  logging.info("Creating multi-FASTA file...")
386
  multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
387
  if not multi_fasta:
388
  return False, f"{status_msg}❌ Failed to create input FASTA", None, None
 
 
389
  logging.info("Running MAFFT alignment...")
390
  aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
391
+ mafft_success, mafft_result = run_mafft_alignment_improved(multi_fasta, aligned_fasta, mafft_cmd)
 
 
392
  os.unlink(multi_fasta)
 
393
  if not mafft_success:
394
  return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
 
 
395
  logging.info("Running IQ-TREE analysis...")
396
  tree_prefix = os.path.join(output_dir, "ml_tree")
397
  iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
 
398
  if not iqtree_success:
399
  return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
 
 
400
  tree_file = iqtree_result
401
  log_file = f"{tree_prefix}.log"
 
 
402
  standard_aligned = "f_gene_sequences_aligned.fasta"
403
  standard_tree = "f_gene_sequences.phy.treefile"
 
404
  if os.path.exists(aligned_fasta):
405
  shutil.copy2(aligned_fasta, standard_aligned)
406
  if os.path.exists(tree_file):
407
  shutil.copy2(tree_file, standard_tree)
408
+ success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}\n"
 
 
 
 
409
  if os.path.exists(log_file):
410
  try:
411
  with open(log_file, 'r') as f:
412
  log_content = f.read()
 
413
  if "Best-fit model:" in log_content:
414
  model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
415
  if model_lines:
416
  success_msg += f"- {model_lines[0].strip()}\n"
417
  except Exception as e:
418
  logging.warning(f"Could not read log file: {e}")
 
419
  logging.info("Maximum likelihood tree construction completed")
420
  return True, success_msg, aligned_fasta, tree_file
 
421
  except Exception as e:
422
  logging.error(f"ML tree construction failed: {e}")
423
  return False, f"ML tree construction failed: {str(e)}", None, None
424
 
 
425
  def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
426
+ """Analyze sequence and create phylogenetic tree"""
 
 
427
  try:
428
  if not analyzer:
429
  return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
 
430
  if not sequence:
431
  return "Error: Please provide a sequence."
 
432
  if not (1 <= matching_percentage <= 99):
433
  return "Error: Matching percentage must be between 1 and 99."
 
 
434
  if not analyzer.find_query_sequence(sequence):
435
  return "Error: Invalid query sequence or sequence not found in dataset."
 
 
436
  analyzer.matching_percentage = matching_percentage
 
 
437
  matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
 
438
  if not matched_ids:
439
  return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
 
440
  logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
 
 
441
  tree_structure = analyzer.build_tree_structure(matched_ids)
442
  if not tree_structure:
443
  return "Error: Failed to build tree structure."
 
 
444
  fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
445
  if not fig:
446
  return "Error: Failed to create tree visualization."
 
 
447
  html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
 
 
448
  output_dir = "output"
449
  os.makedirs(output_dir, exist_ok=True)
 
 
450
  safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
451
  html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
 
452
  with open(html_filename, "w", encoding='utf-8') as f:
453
  f.write(html_content)
 
454
  logging.info(f"Tree HTML saved to {html_filename}")
 
455
  return html_content
 
456
  except Exception as e:
457
  error_msg = f"Tree analysis error: {str(e)}"
458
  logging.error(error_msg)
459
+ import traceback
460
  logging.error(f"Full traceback: {traceback.format_exc()}")
461
  return error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
  def read_fasta_file(file_obj):
464
+ """Read FASTA file content"""
465
  try:
466
  if file_obj is None:
467
  return ""
 
 
468
  if hasattr(file_obj, 'name'):
469
  with open(file_obj.name, "r") as f:
470
  content = f.read()
471
  else:
472
  content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
 
473
  lines = content.strip().split("\n")
474
  seq_lines = [line.strip() for line in lines if not line.startswith(">")]
475
  return ''.join(seq_lines)
476
+ except Exception as e:
477
  logging.error(f"Failed to read FASTA file: {e}")
478
  return ""
479
 
 
480
  def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
481
+ """Run pipeline from FASTA file"""
482
  try:
483
  dna_input = read_fasta_file(fasta_file_obj)
484
  if not dna_input:
485
+ return "Failed to read FASTA file", "", "", "", "", "", "", "", "", None, None, None, "No input sequence"
486
+ return enhanced_run_pipeline(
487
+ dna_input, keras_model, kmer_to_index, classifier_model,
488
+ classifier_kmer_to_index, classifier_maxlen, LABELS,
489
+ similarity_score, build_ml_tree
490
+ )
491
  except Exception as e:
492
  error_msg = f"Pipeline error: {str(e)}"
493
  logging.error(error_msg)
494
+ return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
 
 
495
 
496
+ def enhanced_run_pipeline(dna_input, keras_model, kmer_to_index, classifier_model,
497
+ classifier_kmer_to_index, classifier_maxlen, labels,
498
+ similarity_score=95.0, build_ml_tree=False):
499
+ """Enhanced pipeline with improved F gene prediction"""
500
  try:
501
+ # Input validation and preprocessing
502
  dna_input = dna_input.upper().strip()
503
  if not dna_input:
504
+ return "Empty input", "", "", "", "", "", "", "", "", None, None, None, "No input provided"
505
 
 
506
  if not re.match('^[ACTGN]+$', dna_input):
507
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
508
  logging.info("DNA sequence sanitized")
 
 
 
 
509
 
510
+ # Step 1: Boundary Prediction
511
+ processed_sequence = dna_input
512
+ boundary_output = ""
513
  if boundary_model:
514
  try:
515
  predictions, probs, confidence = boundary_model.predict(dna_input)
516
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
517
  if regions:
518
+ processed_sequence = regions[0]["sequence"]
519
+ boundary_output = f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})"
520
  logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
521
  else:
522
  boundary_output = f"No F gene regions found in input sequence"
 
523
  logging.warning("No gene regions found, using full sequence")
 
524
  except Exception as e:
525
  logging.error(f"Boundary model failed: {e}")
526
  boundary_output = f"Boundary model error: {str(e)}"
 
527
  else:
528
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
 
529
 
530
+ # Step 2: Enhanced Keras Prediction
531
+ keras_result = enhanced_keras_prediction(processed_sequence, keras_model, kmer_to_index)
532
+ if isinstance(keras_result, dict):
533
+ keras_output = f"Prediction confidence: {keras_result['confidence_score']:.3f}\n"
534
+ keras_output += f"K-mer coverage: {keras_result['kmer_coverage']:.1%}\n"
535
+ keras_output += f"Sequence length: {keras_result['sequence_length']} nt"
536
+ if keras_result['kmer_coverage'] < 0.8:
537
+ keras_output += "\n⚠️ Low k-mer coverage - may affect accuracy"
 
 
 
538
  else:
539
+ keras_output = str(keras_result)
 
 
 
 
 
 
 
540
 
541
+ # Step 3: Enhanced Classification
542
+ classifier_result = enhanced_classify_sequence(
543
+ processed_sequence, classifier_model, classifier_kmer_to_index, classifier_maxlen, labels
544
+ )
545
+ classifier_status = classifier_result["status"]
546
+ classifier_message = classifier_result["message"]
547
+ classifier_label = classifier_result["predicted_label"] or "Unknown"
548
+ classifier_confidence = f"{classifier_result['confidence']:.3f}" if classifier_result['confidence'] is not None else "N/A"
549
 
550
+ # Step 4: Maximum Likelihood Tree
551
  aligned_file = None
552
  phy_file = None
553
  ml_tree_output = ""
 
554
  if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
555
  try:
556
  logging.info("Starting maximum likelihood tree construction...")
557
  ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
 
558
  if ml_success:
559
  ml_tree_output = ml_message
560
  aligned_file = ml_aligned
561
  phy_file = ml_tree
562
  else:
563
+ ml_tree_output = ml_message
 
564
  except Exception as e:
565
  ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
566
  logging.error(f"ML Tree failed: {e}")
567
+ elif build_ml_tree:
568
+ ml_tree_output = "❌ F gene sequence too short for ML tree construction (minimum 50 bp)"
569
  else:
570
  ml_tree_output = "ML tree construction skipped (not requested)"
571
 
572
+ # Step 5: ML Simplified Tree
573
  html_file = None
574
  tree_html_content = "No tree generated"
575
  simplified_ml_output = ""
 
576
  if analyzer and processed_sequence and len(processed_sequence) >= 10:
577
  try:
578
  logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
 
 
579
  tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
 
580
  if tree_result and not tree_result.startswith("Error:"):
 
581
  tree_html_content = tree_result
582
  simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
 
 
583
  output_dir = "output"
584
  if os.path.exists(output_dir):
585
  html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
586
  if html_files:
587
+ html_file = os.path.join(output_dir, html_files[-1])
588
  simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
 
 
589
  if analyzer.find_query_sequence(processed_sequence):
590
  matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
591
  simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
592
+ simplified_ml_output += f"\n- Similarity threshold: {perc:.1f}%"
593
  else:
594
  simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
595
  tree_html_content = f"<p>Error: {tree_result}</p>"
 
596
  except Exception as e:
597
  logging.error(f"Simplified ML tree analysis failed: {e}")
598
  simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
599
+ tree_html_content = f"<p>Error: {str(e)}</p>"
600
+ else:
601
+ if not analyzer:
602
+ simplified_ml_output = "❌ Tree analyzer not available"
603
+ else:
604
+ simplified_ml_output = "❌ F gene sequence too short for tree analysis (minimum 10 bp)"
605
 
606
  # Return all results
607
  return (
608
+ boundary_output,
609
+ keras_output,
610
+ classifier_status,
611
+ classifier_message,
612
+ classifier_label,
613
+ classifier_confidence,
614
+ ml_tree_output,
615
+ simplified_ml_output,
616
+ tree_html_content,
617
+ aligned_file,
618
+ phy_file,
619
+ html_file,
620
+ f"Pipeline completed. F gene length: {len(processed_sequence)} bp"
621
  )
 
622
  except Exception as e:
623
  error_msg = f"Pipeline execution failed: {str(e)}"
624
  logging.error(error_msg)
625
  import traceback
626
  logging.error(f"Full traceback: {traceback.format_exc()}")
627
  return (
628
+ error_msg, "", "error", error_msg, "Error", "0.000", "", "", f"<p>Error: {error_msg}</p>",
629
  None, None, None, error_msg
630
  )
631
 
632
  # --- Gradio Interface ---
633
  def create_interface():
634
  """Create the Gradio interface with enhanced layout and features"""
 
 
635
  custom_css = """
636
+ .gradio-container { max-width: 1200px !important; }
637
+ .tab-nav button { font-size: 16px !important; }
638
+ .output-html { height: 600px !important; overflow: auto; }
 
 
 
 
 
 
 
639
  """
 
640
  with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
641
  gr.Markdown("""
642
  # 🧬 F Gene Analysis Pipeline
643
 
644
+ This tool provides comprehensive analysis of F genes:
645
+ - **🎯 F Gene Extraction**: Extracts F gene sequences using deep learning.
646
+ - **🔍 Gene Validation**: Validates with machine learning.
647
+ - **🧬 Gene Classification**: Classifies sequence type (F gene or other).
648
+ - **🌳 Phylogenetic Analysis**: Builds maximum likelihood and simplified trees.
649
 
650
  **Instructions:**
651
+ 1. Enter your sequence or upload a FASTA file
652
+ 2. Adjust similarity threshold (1-99%)
653
+ 3. Choose whether to build ML tree (requires MAFFT & IQ-TREE)
654
+ 4. Click "Run Analysis" to start
655
  """)
656
 
657
  with gr.Tab("🔬 Analysis Pipeline"):
658
  with gr.Row():
659
  with gr.Column(scale=2):
 
660
  gr.Markdown("### Input Sequence")
661
+ dna_input = gr.Textbox(label="DNA Sequence", placeholder="Enter your DNA sequence here (ATCG format)...", lines=5, max_lines=10)
662
+ fasta_file = gr.File(label="Or Upload FASTA File", file_types=[".fasta", ".fa", ".fas", ".txt"])
 
 
 
 
 
 
 
 
 
 
663
  with gr.Row():
664
+ similarity_score = gr.Slider(minimum=1, maximum=99, value=95.0, step=1.0, label="Similarity Threshold (%)", info="Minimum similarity for phylogenetic analysis")
665
+ build_ml_tree = gr.Checkbox(label="Build ML Tree", value=False, info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  with gr.Row():
667
  run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
668
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
 
669
  with gr.Column(scale=1):
 
670
  gr.Markdown("### Analysis Status")
671
+ status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
 
 
 
 
 
 
 
672
  gr.Markdown("### Available Models")
673
  model_status = []
674
  if boundary_model:
675
  model_status.append("✅ Boundary Detection Model")
676
  else:
677
  model_status.append("❌ Boundary Detection Model")
 
678
  if keras_model:
679
  model_status.append("✅ Gene Validation Model")
680
  else:
681
  model_status.append("❌ Gene Validation Model")
682
+ if classifier_model:
683
+ model_status.append("✅ Gene Classification Model")
684
+ else:
685
+ model_status.append("❌ Gene Classification Model")
686
  if analyzer:
687
  model_status.append("✅ Tree Analysis Module")
688
  else:
689
  model_status.append("❌ Tree Analysis Module")
 
690
  gr.Markdown("\n".join(model_status))
691
 
692
  with gr.Tab("📊 Results"):
693
  with gr.Row():
694
  with gr.Column():
695
+ boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False)
696
+ keras_output = gr.Textbox(label="🔍 Gene Validation", lines=5, interactive=False)
697
+ classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
698
+ classifier_message = gr.Textbox(label="📝 Classification Message", lines=6, interactive=False)
699
+ classifier_label = gr.Textbox(label="🏷️ Predicted Label", lines=1, interactive=False)
700
+ classifier_confidence = gr.Textbox(label="📊 Confidence Score", lines=1, interactive=False)
 
 
 
 
 
 
 
701
  with gr.Column():
702
+ ml_tree_output = gr.Textbox(label="🌳 Maximum Likelihood Tree", lines=5, interactive=False)
703
+ simplified_ml_output = gr.Textbox(label="📈 Simplified Phylogenetic Analysis", lines=3, interactive=False)
 
 
 
 
 
 
 
 
 
 
 
704
  gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
705
+ tree_html = gr.HTML(label="Interactive Tree", value="<p>No tree generated yet. Run analysis to see results.</p>")
 
 
 
 
 
706
  gr.Markdown("### 📁 Download Results")
707
  with gr.Row():
708
+ aligned_file = gr.File(label="Aligned Sequences (FASTA)", interactive=False)
709
+ phy_file = gr.File(label="Phylogenetic Tree File", interactive=False)
710
+ html_file = gr.File(label="Interactive Tree (HTML)", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
711
 
712
  with gr.Tab("ℹ️ Help & Info"):
713
  gr.Markdown("""
714
  ## About This Tool
715
 
716
  ### F Gene Analysis Pipeline
717
+ - **🎯 F Gene Extraction**: Extracts F gene sequences using deep learning.
718
+ - **🔍 Gene Validation**: Validates with k-mer based machine learning.
719
+ - **🧬 Gene Classification**: Classifies sequences (F gene or other) with confidence scores.
720
+ - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
 
722
  ### Input Requirements
723
+ - DNA Sequences: ATCG format, minimum 50 bp (preferably 1500-2000 bp for F gene).
724
+ - FASTA Files: Standard format.
725
+ - Similarity Threshold: 1-99%.
726
 
727
  ### Dependencies
728
+ **For ML Trees:**
 
729
  ```bash
730
+ # Ubuntu/Debian: sudo apt-get update && sudo apt-get install mafft iqtree
731
+ # macOS: brew install mafft iqtree
732
+ # Conda: conda install -c bioconda mafft iqtree
 
 
 
 
 
733
  ```
734
 
 
 
 
 
 
735
  ### Troubleshooting
736
+ - *"No similar sequences"*: Lower similarity threshold.
737
+ - *"Sequence too short"*: Provide >50 bp (ideally >1500 bp for F gene).
738
+ - *"MAFFT/IQ-TREE not found"*: Install dependencies.
739
+ - *"Model not available"*: Check model files.
 
 
 
 
 
 
 
 
 
 
 
740
  """)
741
 
 
 
 
 
 
 
 
742
  def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
 
743
  if file_obj is not None:
744
  return run_pipeline_from_file(file_obj, sim_score, build_tree)
745
  else:
746
+ return enhanced_run_pipeline(
747
+ dna_seq, keras_model, kmer_to_index, classifier_model,
748
+ classifier_kmer_to_index, classifier_maxlen, LABELS,
749
+ sim_score, build_tree
750
+ )
751
+
752
  def clear_inputs():
753
  return "", None, 95.0, False, "Ready to analyze"
754
 
 
755
  run_btn.click(
756
  fn=run_analysis_combined,
757
  inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
758
  outputs=[
759
+ boundary_output, keras_output, classifier_status, classifier_message,
760
+ classifier_label, classifier_confidence, ml_tree_output, simplified_ml_output,
761
+ tree_html, aligned_file, phy_file, html_file, status_display
762
  ]
763
  )
 
764
  clear_btn.click(
765
  fn=clear_inputs,
766
  outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
767
  )
768
 
 
 
769
  example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
 
770
  def load_example():
771
  example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
772
  return example_seq, "Example F gene sequence loaded"
773
+ example_btn.click(fn=load_example, outputs=[dna_input, status_display])
 
 
 
 
774
 
775
  return iface
776
 
777
  # --- Main Execution ---
778
  if __name__ == "__main__":
 
779
  interface = create_interface()
 
 
780
  interface.launch(
781
+ server_name="0.0.0.0",
782
+ server_port=7860,
783
+ share=False,
784
+ debug=True,
785
+ show_error=True,
786
+ max_threads=4,
787
+ auth=None,
788
+ ssl_verify=False,
789
+ quiet=False
790
  )