re-type commited on
Commit
20b71b9
·
verified ·
1 Parent(s): b5a86a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +305 -1042
app.py CHANGED
@@ -8,201 +8,132 @@ import os
8
  import re
9
  import logging
10
  import numpy as np
11
- from predictor import GenePredictor
12
- from tensorflow.keras.models import load_model
13
- import ml_simplified_tree
14
  import tempfile
15
  import shutil
16
  import sys
17
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # --- Global Variables ---
20
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
21
  IQTREE_PATH = "iqtree/bin/iqtree2" # Update this path as needed
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- # --- Logging ---
24
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
25
-
26
- # --- Paths ---
27
- from huggingface_hub import hf_hub_download
28
-
29
- # Model repository and file paths
30
- model_repo = "GGproject10/best_boundary_aware_model"
31
- csv_path = "f cleaned.csv"
32
-
33
- # Get HF token from environment (if available)
34
- hf_token = os.getenv("HF_TOKEN")
35
-
36
- # --- Load Models ---
37
  boundary_model = None
38
  keras_model = None
39
  kmer_to_index = None
 
40
 
41
- # Try to load boundary model from Hugging Face Hub
42
- try:
43
- boundary_path = hf_hub_download(
44
- repo_id=model_repo,
45
- filename="best_boundary_aware_model.pth",
46
- token=hf_token
47
- )
48
- if os.path.exists(boundary_path):
49
- boundary_model = GenePredictor(boundary_path)
50
- logging.info("Boundary model loaded successfully from Hugging Face Hub.")
51
- else:
52
- logging.warning(f"Boundary model file not found after download")
53
- except Exception as e:
54
- logging.error(f"Failed to load boundary model from HF Hub: {e}")
55
-
56
- # Try to load Keras model from Hugging Face Hub
57
- try:
58
- keras_path = hf_hub_download(
59
- repo_id=model_repo,
60
- filename="best_model.keras",
61
- token=hf_token
62
- )
63
- kmer_path = hf_hub_download(
64
- repo_id=model_repo,
65
- filename="kmer_to_index.pkl",
66
- token=hf_token
67
- )
68
 
69
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
70
- keras_model = load_model(keras_path)
71
- with open(kmer_path, "rb") as f:
72
- kmer_to_index = pickle.load(f)
73
- logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
 
 
 
 
 
 
 
 
74
  else:
75
- logging.warning(f"Keras model or kmer files not found after download")
76
- except Exception as e:
77
- logging.error(f"Failed to load Keras model from HF Hub: {e}")
78
-
79
- # --- Load Verification Models from models directory ---
80
- verification_models = {}
81
-
82
- def load_verification_models():
83
- """Load all verification models from the models directory"""
84
- global verification_models
85
- models_dir = "models"
86
 
87
- if not os.path.exists(models_dir):
88
- logging.warning(f"Models directory not found: {models_dir}")
89
- return
90
-
91
- # Load different types of verification models
92
- model_files = {
93
- "boundary_model": "best_boundary_aware_model.pth",
94
- "keras_model": "best_model.keras",
95
- "kmer_index": "kmer_to_index.pkl",
96
- "additional_model_1": "verification_model_1.pth", # Add your model names here
97
- "additional_model_2": "verification_model_2.keras",
98
- # Add more models as needed
99
- }
100
-
101
- for model_name, filename in model_files.items():
102
- model_path = os.path.join(models_dir, filename)
103
-
104
  try:
105
- if os.path.exists(model_path):
106
- if filename.endswith('.pth'):
107
- # PyTorch model
108
- if model_name == "boundary_model":
109
- verification_models[model_name] = GenePredictor(model_path)
110
- else:
111
- verification_models[model_name] = torch.load(model_path, map_location='cpu')
112
-
113
- elif filename.endswith('.keras'):
114
- # Keras model
115
- verification_models[model_name] = load_model(model_path)
116
-
117
- elif filename.endswith('.pkl'):
118
- # Pickle file
119
- with open(model_path, 'rb') as f:
120
- verification_models[model_name] = pickle.load(f)
121
-
122
- logging.info(f"Loaded verification model: {model_name}")
123
-
124
  except Exception as e:
125
- logging.error(f"Failed to load {model_name} from {model_path}: {e}")
126
-
127
- # Load verification models at startup
128
- load_verification_models()
 
129
 
130
  # --- Initialize Tree Analyzer ---
131
- analyzer = None
132
- try:
133
- analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
134
- if os.path.exists(csv_path):
135
- if analyzer.load_data(csv_path):
136
- logging.info("Tree analyzer initialized successfully")
137
- # Try to train AI model (optional)
138
- try:
139
- if not analyzer.train_ai_model():
140
- logging.warning("AI model training failed; proceeding with basic analysis.")
141
- except Exception as e:
142
- logging.warning(f"AI model training failed: {e}")
143
- else:
144
- logging.error("Failed to load CSV data for tree analyzer")
 
 
 
145
  analyzer = None
146
  else:
147
- logging.error(f"CSV file not found: {csv_path}")
148
  analyzer = None
149
- except Exception as e:
150
- logging.error(f"Failed to initialize tree analyzer: {e}")
151
- analyzer = None
152
 
153
- # --- Enhanced Tool Detection ---
154
  def check_tool_availability():
155
- """Enhanced check for MAFFT and IQ-TREE availability with multiple fallback options"""
156
-
157
- # Check MAFFT
158
- mafft_available = False
159
- mafft_cmd = None
160
-
161
- # Try multiple MAFFT locations
162
  mafft_candidates = [
163
- MAFFT_PATH,
164
- 'mafft',
165
- '/usr/bin/mafft',
166
- '/usr/local/bin/mafft',
167
- 'mafft.bat', # Windows
168
  ]
169
-
170
- for candidate in mafft_candidates:
171
- if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
172
- mafft_available = True
173
- mafft_cmd = candidate
174
- logging.info(f"Found MAFFT at: {candidate}")
175
- break
176
-
177
- # Check IQ-TREE
178
- iqtree_available = False
179
- iqtree_cmd = None
180
-
181
- # Try multiple IQ-TREE locations and names
182
  iqtree_candidates = [
183
- IQTREE_PATH,
184
- 'iqtree2',
185
- 'iqtree',
186
- '/usr/bin/iqtree2',
187
- '/usr/local/bin/iqtree2',
188
- '/usr/bin/iqtree',
189
- '/usr/local/bin/iqtree',
190
- 'iqtree2.exe', # Windows
191
- 'iqtree.exe', # Windows
192
  ]
193
 
194
- for candidate in iqtree_candidates:
195
- if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
196
- iqtree_available = True
197
- iqtree_cmd = candidate
198
- logging.info(f"Found IQ-TREE at: {candidate}")
199
- break
200
 
201
- return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
202
 
 
203
  def install_dependencies_guide():
204
- """Provide installation guidance for missing dependencies"""
205
- guide = """
206
  🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
207
 
208
  For MAFFT:
@@ -217,999 +148,331 @@ For IQ-TREE:
217
  - macOS: brew install iqtree
218
  - Windows: Download from http://www.iqtree.org/
219
 
220
- Alternative: Use conda/mamba:
221
- - conda install -c bioconda mafft iqtree
222
-
223
- Docker option:
224
- - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
225
- - docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
226
  """
227
- return guide
228
 
 
229
  def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
230
- """Run MAFFT alignment with enhanced error handling"""
231
  try:
232
- # MAFFT command with more robust options
233
- cmd = [
234
- mafft_cmd,
235
- '--auto', # Automatic strategy selection
236
- '--quiet', # Reduce output verbosity
237
- input_fasta
238
- ]
239
-
240
- logging.info(f"Running MAFFT: {' '.join(cmd)}")
241
-
242
- # Run MAFFT with enhanced error handling
243
- result = subprocess.run(
244
- cmd,
245
- capture_output=True,
246
- text=True,
247
- timeout=600, # Increased timeout to 10 minutes
248
- cwd=os.getcwd() # Ensure working directory is set
249
- )
250
-
251
  if result.returncode == 0:
252
- # Write aligned sequences to output file
253
  with open(output_fasta, 'w') as f:
254
  f.write(result.stdout)
255
- logging.info(f"MAFFT alignment completed: {output_fasta}")
256
-
257
- # Verify output file
258
- if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
259
  return True, output_fasta
260
- else:
261
- return False, "MAFFT completed but output file is empty"
262
- else:
263
- error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
264
- logging.error(f"MAFFT failed: {error_msg}")
265
- return False, f"MAFFT error: {error_msg}"
266
-
267
- except subprocess.TimeoutExpired:
268
- logging.error("MAFFT timeout")
269
- return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
270
- except FileNotFoundError:
271
- return False, f"MAFFT executable not found: {mafft_cmd}"
272
  except Exception as e:
273
- logging.error(f"MAFFT execution failed: {e}")
274
- return False, f"MAFFT execution failed: {str(e)}"
275
 
276
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
277
- """Run IQ-TREE with enhanced options and error handling"""
278
  try:
279
- # Enhanced IQ-TREE command
280
  cmd = [
281
- iqtree_cmd,
282
- '-s', aligned_fasta,
283
- '-m', 'MFP', # ModelFinder Plus for automatic model selection
284
- '-bb', '1000', # Bootstrap replicates
285
- '-alrt', '1000', # SH-aLRT test
286
- '-nt', 'AUTO', # Auto detect threads
287
- '--prefix', output_prefix,
288
- '-redo', # Overwrite existing files
289
- '--quiet' # Reduce verbosity
290
  ]
291
-
292
- logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
293
-
294
- # Run IQ-TREE with enhanced error handling
295
- result = subprocess.run(
296
- cmd,
297
- capture_output=True,
298
- text=True,
299
- timeout=1200, # 20 minute timeout for larger datasets
300
- cwd=os.getcwd()
301
- )
302
-
303
- if result.returncode == 0:
304
- tree_file = f"{output_prefix}.treefile"
305
- if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
306
- logging.info(f"IQ-TREE analysis completed: {tree_file}")
307
- return True, tree_file
308
- else:
309
- logging.error("IQ-TREE completed but tree file not found or empty")
310
- return False, "Tree file not generated or empty"
311
- else:
312
- error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
313
- logging.error(f"IQ-TREE failed: {error_msg}")
314
- return False, f"IQ-TREE error: {error_msg}"
315
-
316
- except subprocess.TimeoutExpired:
317
- logging.error("IQ-TREE timeout")
318
- return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
319
- except FileNotFoundError:
320
- return False, f"IQ-TREE executable not found: {iqtree_cmd}"
321
  except Exception as e:
322
- logging.error(f"IQ-TREE execution failed: {e}")
323
- return False, f"IQ-TREE execution failed: {str(e)}"
324
 
325
- def create_simple_neighbor_joining_tree(sequences_dict):
326
- """Create a simple distance-based tree when ML tools are not available"""
327
  try:
328
- # This is a simplified implementation
329
- # In a real scenario, you'd want to use a proper NJ implementation
330
- import random
331
-
332
  seq_names = list(sequences_dict.keys())
333
- n_seqs = len(seq_names)
334
-
335
- if n_seqs < 2:
336
- return None, "Need at least 2 sequences for tree construction"
337
-
338
- # Create a simple Newick tree structure
339
- if n_seqs == 2:
340
- tree_str = f"({seq_names[0]}:0.1,{seq_names[1]}:0.1);"
341
- else:
342
- # Simple clustering approach
343
- tree_str = "(" + ",".join([f"{name}:0.1" for name in seq_names[:5]]) + ");"
344
-
345
- # Save to temporary file
346
  tree_file = "simple_tree.nwk"
347
  with open(tree_file, 'w') as f:
348
  f.write(tree_str)
349
-
350
- return tree_file, "Simple distance-based tree created"
351
-
352
  except Exception as e:
353
  return None, f"Simple tree creation failed: {str(e)}"
354
 
355
- def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
356
- """Create a multi-FASTA file with query sequence and reference sequences"""
357
  try:
358
- # Create temporary FASTA file
359
  temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
360
-
361
- # Add query sequence
362
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
363
-
364
- # Add reference sequences from existing aligned FASTA if available
365
  ref_fasta_path = "f_gene_sequences_aligned.fasta"
366
  if os.path.exists(ref_fasta_path):
367
  with open(ref_fasta_path, 'r') as ref_file:
368
  temp_fasta.write(ref_file.read())
369
- logging.info(f"Added reference sequences from {ref_fasta_path}")
370
- else:
371
- # If no reference file, try to create from CSV data
372
- if analyzer and hasattr(analyzer, 'data'):
373
- count = 0
374
- for idx, row in analyzer.data.iterrows():
375
- if 'sequence' in row and len(str(row['sequence'])) > 50:
376
- seq_id = row.get('id', f"Ref_{count}")
377
- sequence = str(row['sequence']).upper()
378
- temp_fasta.write(f">{seq_id}\n{sequence}\n")
379
- count += 1
380
- if count >= 20: # Limit to prevent too large datasets
381
- break
382
- logging.info(f"Added {count} reference sequences from CSV")
383
-
384
  temp_fasta.close()
385
  return temp_fasta.name
386
-
387
  except Exception as e:
388
- logging.error(f"Failed to create multi-FASTA: {e}")
389
  return None
390
 
391
- def build_maximum_likelihood_tree(f_gene_sequence):
392
- """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
393
  try:
394
- # Check tool availability with enhanced detection
395
- mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
396
-
397
- # Prepare status message
398
- status_msg = "🔍 Checking dependencies...\n"
399
 
400
- if not mafft_available:
401
- status_msg += " MAFFT not found\n"
402
- else:
403
- status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
404
-
405
- if not iqtree_available:
406
- status_msg += "❌ IQ-TREE not found\n"
407
- else:
408
- status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
409
 
410
- # If neither tool is available, provide installation guide
411
- if not mafft_available and not iqtree_available:
412
  guide = install_dependencies_guide()
413
- return False, f"{status_msg}\n{guide}", None, None
414
-
415
- # If only one tool is missing, provide specific guidance
416
- if not mafft_available:
417
- return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None
418
-
419
- if not iqtree_available:
420
- status_msg += "\n⚠️ IQ-TREE not available. Attempting simple tree construction...\n"
421
-
422
- # Try to create a simple tree as fallback
423
- multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
424
- if multi_fasta:
425
- # Read sequences
426
- sequences = {}
427
- current_seq = ""
428
- current_name = ""
429
-
430
- with open(multi_fasta, 'r') as f:
431
- for line in f:
432
- line = line.strip()
433
- if line.startswith('>'):
434
- if current_name and current_seq:
435
- sequences[current_name] = current_seq
436
- current_name = line[1:]
437
- current_seq = ""
438
- else:
439
- current_seq += line
440
- if current_name and current_seq:
441
- sequences[current_name] = current_seq
442
-
443
- simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
444
- os.unlink(multi_fasta)
445
-
446
- if simple_tree:
447
- return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
448
- else:
449
- return False, f"{status_msg}❌ {simple_msg}", None, None
450
- else:
451
- return False, f"{status_msg}❌ Failed to create input sequences", None, None
452
-
453
- # Both tools available - proceed with full ML analysis
454
- # Create output directory
455
- output_dir = "ml_tree_output"
456
- os.makedirs(output_dir, exist_ok=True)
457
 
458
- # Step 1: Create multi-FASTA file with query and reference sequences
459
- logging.info("Creating multi-FASTA file...")
460
- multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
461
  if not multi_fasta:
462
- return False, f"{status_msg}❌ Failed to create input FASTA", None, None
463
 
464
- # Step 2: Run MAFFT alignment
465
- logging.info("Running MAFFT alignment...")
466
- aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
467
  mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
468
-
469
- # Clean up temporary file
470
  os.unlink(multi_fasta)
471
 
472
  if not mafft_success:
473
- return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
474
 
475
- # Step 3: Run IQ-TREE analysis
476
- logging.info("Running IQ-TREE analysis...")
477
- tree_prefix = os.path.join(output_dir, "ml_tree")
478
  iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
479
-
480
  if not iqtree_success:
481
- return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
482
 
483
- # Step 4: Prepare output files
484
  tree_file = iqtree_result
485
- log_file = f"{tree_prefix}.log"
486
-
487
- # Copy to standard names for compatibility
488
- standard_aligned = "f_gene_sequences_aligned.fasta"
489
- standard_tree = "f_gene_sequences.phy.treefile"
490
-
491
- if os.path.exists(aligned_fasta):
492
- shutil.copy2(aligned_fasta, standard_aligned)
493
- if os.path.exists(tree_file):
494
- shutil.copy2(tree_file, standard_tree)
495
-
496
- success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
497
- success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
498
- success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
499
 
500
- if os.path.exists(log_file):
501
- try:
502
- with open(log_file, 'r') as f:
503
- log_content = f.read()
504
- # Extract model information
505
- if "Best-fit model:" in log_content:
506
- model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
507
- if model_lines:
508
- success_msg += f"- {model_lines[0].strip()}\n"
509
- except Exception as e:
510
- logging.warning(f"Could not read log file: {e}")
511
-
512
- logging.info("Maximum likelihood tree construction completed")
513
  return True, success_msg, aligned_fasta, tree_file
514
-
515
  except Exception as e:
516
  logging.error(f"ML tree construction failed: {e}")
517
  return False, f"ML tree construction failed: {str(e)}", None, None
518
 
519
- # --- Tree Analysis Function (Fixed for display) ---
520
- def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tuple:
521
- """
522
- Analyze sequence and create phylogenetic tree - FIXED to return HTML content properly
523
- Returns: (html_content, html_file_path, success_message)
524
- """
525
- try:
526
- if not analyzer:
527
- return None, None, "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
528
-
529
- if not sequence:
530
- return None, None, "Error: Please provide a sequence."
531
-
532
- if not (1 <= matching_percentage <= 99):
533
- return None, None, "Error: Matching percentage must be between 1 and 99."
534
-
535
- # Find query sequence
536
- if not analyzer.find_query_sequence(sequence):
537
- return None, None, "Error: Invalid query sequence or sequence not found in dataset."
538
-
539
- # Set matching percentage
540
- analyzer.matching_percentage = matching_percentage
541
-
542
- # Find similar sequences
543
- matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
544
-
545
- if not matched_ids:
546
- return None, None, f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
547
-
548
- logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
549
-
550
- # Build tree structure
551
- tree_structure = analyzer.build_tree_structure(matched_ids)
552
- if not tree_structure:
553
- return None, None, "Error: Failed to build tree structure."
554
-
555
- # Create interactive tree
556
- fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
557
- if not fig:
558
- return None, None, "Error: Failed to create tree visualization."
559
-
560
- # Generate HTML content
561
- html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
562
-
563
- # Save to output folder
564
- output_dir = "output"
565
- os.makedirs(output_dir, exist_ok=True)
566
-
567
- # Create a safe filename
568
- safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
569
- html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
570
-
571
- with open(html_filename, "w", encoding='utf-8') as f:
572
- f.write(html_content)
573
-
574
- logging.info(f"Tree HTML saved to {html_filename}")
575
-
576
- success_msg = f"✅ Simplified phylogenetic tree generated successfully!\n"
577
- success_msg += f"- {len(matched_ids)} sequences analyzed\n"
578
- success_msg += f"- Similarity threshold: {actual_percentage:.1f}%\n"
579
- success_msg += f"- Tree file: {os.path.basename(html_filename)}"
580
-
581
- return html_content, html_filename, success_msg
582
-
583
- except Exception as e:
584
- error_msg = f"Tree analysis error: {str(e)}"
585
- logging.error(error_msg)
586
- import traceback
587
- logging.error(f"Full traceback: {traceback.format_exc()}")
588
- return None, None, error_msg
589
-
590
- # --- Verification Functions for Hugging Face Models ---
591
- def run_verification_pipeline(sequence, model_names=None):
592
- """
593
- Run verification using models from the models directory
594
- Args:
595
- sequence: DNA sequence to verify
596
- model_names: List of model names to use (None = use all available)
597
- Returns:
598
- Dictionary with verification results from each model
599
- """
600
  results = {}
 
 
 
 
601
 
602
- if not verification_models:
603
- return {"error": "No verification models loaded from models directory"}
604
-
605
- # Use all models if none specified
606
- if model_names is None:
607
- model_names = list(verification_models.keys())
 
 
 
 
 
 
 
608
 
609
- for model_name in model_names:
610
- if model_name not in verification_models:
611
- results[model_name] = f"Model {model_name} not found"
612
- continue
613
-
614
  try:
615
- model = verification_models[model_name]
616
-
617
- if model_name == "boundary_model" and hasattr(model, 'predict'):
618
- # Boundary prediction model
619
- predictions, probs, confidence = model.predict(sequence)
620
- regions = model.extract_gene_regions(predictions, sequence)
621
- results[model_name] = {
622
- "type": "boundary_detection",
623
- "confidence": confidence,
624
- "regions_found": len(regions) if regions else 0,
625
- "extracted_sequence": regions[0]["sequence"] if regions else None
626
- }
627
-
628
- elif model_name == "keras_model":
629
- # Keras model for gene validation
630
- if len(sequence) < 6:
631
- results[model_name] = {"error": "Sequence too short for k-mer analysis"}
632
- continue
633
-
634
- # Generate k-mers
635
- kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
636
- kmer_index = verification_models.get("kmer_index", {})
637
- indices = [kmer_index.get(kmer, 0) for kmer in kmers]
638
-
639
- # Prepare input
640
- input_arr = np.array([indices])
641
- prediction = model.predict(input_arr, verbose=0)[0]
642
-
643
- results[model_name] = {
644
- "type": "gene_validation",
645
- "prediction_scores": prediction.tolist(),
646
- "mean_score": float(np.mean(prediction)),
647
- "max_score": float(np.max(prediction))
648
- }
649
-
650
- else:
651
- # Generic model handling
652
- results[model_name] = {
653
- "type": "generic",
654
- "status": "Model loaded but no specific handler implemented",
655
- "model_type": type(model).__name__
656
- }
657
-
658
  except Exception as e:
659
- results[model_name] = {"error": str(e)}
660
- logging.error(f"Verification failed for {model_name}: {e}")
661
 
662
  return results
663
 
664
- def format_verification_results(verification_results):
665
- """Format verification results for display"""
666
- if not verification_results:
667
- return "No verification results available"
668
 
669
- if "error" in verification_results:
670
- return f"Verification Error: {verification_results['error']}"
 
671
 
672
- formatted = "🔍 VERIFICATION RESULTS:\n\n"
673
-
674
- for model_name, result in verification_results.items():
675
- formatted += f"📊 {model_name.upper()}:\n"
676
-
677
- if isinstance(result, dict):
678
- if "error" in result:
679
- formatted += f" ❌ Error: {result['error']}\n"
680
- elif result.get("type") == "boundary_detection":
681
- formatted += f" ✅ Confidence: {result.get('confidence', 'N/A'):.3f}\n"
682
- formatted += f" 🎯 Regions Found: {result.get('regions_found', 0)}\n"
683
- if result.get('extracted_sequence'):
684
- seq_len = len(result['extracted_sequence'])
685
- formatted += f" 📏 Extracted Length: {seq_len} bp\n"
686
- elif result.get("type") == "gene_validation":
687
- formatted += f" 📈 Mean Score: {result.get('mean_score', 0):.3f}\n"
688
- formatted += f" 🔝 Max Score: {result.get('max_score', 0):.3f}\n"
689
  else:
690
- formatted += f" ℹ️ Status: {result.get('status', 'Processed')}\n"
691
- else:
692
- formatted += f" 📝 Result: {str(result)}\n"
693
-
694
- formatted += "\n"
 
 
 
 
 
 
 
 
 
 
695
 
696
- return formatted
697
 
698
- # --- Keras Prediction ---
699
- def predict_with_keras(sequence):
700
- try:
701
- if not keras_model or not kmer_to_index:
702
- return f"Keras model not available. Input sequence: {sequence[:100]}..."
703
-
704
- if len(sequence) < 6:
705
- return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
706
-
707
- # Generate k-mers
708
- kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
709
- indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
710
-
711
- # Prepare input
712
- input_arr = np.array([indices])
713
- prediction = keras_model.predict(input_arr, verbose=0)[0]
714
-
715
- # Format prediction as probabilities/scores
716
- mean_score = np.mean(prediction)
717
- max_score = np.max(prediction)
718
- min_score = np.min(prediction)
719
-
720
- result = f"Keras Model Prediction Results:\n"
721
- result += f"- Mean Score: {mean_score:.4f}\n"
722
- result += f"- Max Score: {max_score:.4f}\n"
723
- result += f"- Min Score: {min_score:.4f}\n"
724
- result += f"- Total K-mers: {len(kmers)}\n"
725
- result += f"- Sequence Length: {len(sequence)} bp"
726
-
727
- return result
728
- except Exception as e:
729
- logging.error(f"Keras prediction error: {e}")
730
- return f"Keras prediction failed: {str(e)}"
731
 
732
- # --- Boundary Prediction ---
733
- def predict_with_boundary(sequence):
734
- try:
735
- if not boundary_model:
736
- return f"Boundary model not available. Input sequence: {sequence[:100]}..."
737
-
738
- # Get predictions from boundary model
739
- predictions, probabilities, confidence = boundary_model.predict(sequence)
740
-
741
- # Extract gene regions
742
- regions = boundary_model.extract_gene_regions(predictions, sequence)
743
-
744
- result = f"Boundary Model Prediction Results:\n"
745
- result += f"- Overall Confidence: {confidence:.4f}\n"
746
- result += f"- Regions Detected: {len(regions) if regions else 0}\n"
747
-
748
- if regions:
749
- for i, region in enumerate(regions[:3]): # Show first 3 regions
750
- result += f"\nRegion {i+1}:\n"
751
- result += f" - Start: {region['start']}\n"
752
- result += f" - End: {region['end']}\n"
753
- result += f" - Length: {len(region['sequence'])} bp\n"
754
- result += f" - Confidence: {region.get('confidence', 'N/A'):.4f}\n"
755
-
756
- return result
757
- except Exception as e:
758
- logging.error(f"Boundary prediction error: {e}")
759
- return f"Boundary prediction failed: {str(e)}"
760
 
761
- # --- Combined Prediction Function ---
762
- def predict_gene_sequence(sequence):
763
- """Combined prediction using both models"""
764
- try:
765
- if not sequence or len(sequence.strip()) == 0:
766
- return "Please provide a DNA sequence."
767
-
768
- # Clean and validate sequence
769
- sequence = re.sub(r'[^ATCG]', '', sequence.upper())
770
-
771
- if len(sequence) < 10:
772
- return "Sequence too short. Please provide at least 10 nucleotides."
773
-
774
- results = []
775
- results.append(f"🧬 GENE SEQUENCE ANALYSIS\n")
776
- results.append(f"Input sequence length: {len(sequence)} bp\n")
777
- results.append("=" * 50)
778
-
779
- # Boundary model prediction
780
- if boundary_model:
781
- results.append("\n🎯 BOUNDARY DETECTION:")
782
- boundary_result = predict_with_boundary(sequence)
783
- results.append(boundary_result)
784
- else:
785
- results.append("\n❌ Boundary model not available")
786
-
787
- # Keras model prediction
788
- if keras_model:
789
- results.append("\n🔍 KERAS MODEL ANALYSIS:")
790
- keras_result = predict_with_keras(sequence)
791
- results.append(keras_result)
792
- else:
793
- results.append("\n❌ Keras model not available")
794
-
795
- # Verification models
796
- if verification_models:
797
- results.append("\n🔬 VERIFICATION ANALYSIS:")
798
- verification_result = run_verification_pipeline(sequence)
799
- formatted_verification = format_verification_results(verification_result)
800
- results.append(formatted_verification)
801
 
802
- return "\n".join(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
 
804
- except Exception as e:
805
- logging.error(f"Gene prediction error: {e}")
806
- return f"Gene prediction failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
 
808
- # --- File Processing Functions ---
809
  def process_fasta_file(file):
810
- """Process uploaded FASTA file"""
811
  try:
812
- if file is None:
813
  return "Please upload a FASTA file."
814
 
815
- # Read file content
816
- with open(file.name, 'r') as f:
817
- content = f.read()
818
-
819
- # Parse FASTA
820
  sequences = {}
821
  current_seq = ""
822
  current_name = ""
823
-
824
- lines = content.strip().split('\n')
825
- for line in lines:
826
- line = line.strip()
827
- if line.startswith('>'):
828
- if current_name and current_seq:
829
- sequences[current_name] = current_seq
830
- current_name = line[1:] # Remove '>'
831
- current_seq = ""
832
- else:
833
- current_seq += line.upper()
834
-
835
- # Add last sequence
836
  if current_name and current_seq:
837
  sequences[current_name] = current_seq
838
 
839
  if not sequences:
840
- return "No valid sequences found in FASTA file."
841
-
842
- # Process each sequence
843
- results = []
844
- results.append(f"📁 FASTA FILE ANALYSIS")
845
- results.append(f"Found {len(sequences)} sequences\n")
846
- results.append("=" * 60)
847
 
 
848
  for i, (name, seq) in enumerate(sequences.items()):
849
- if i >= 5: # Limit to first 5 sequences
850
  results.append(f"\n... and {len(sequences) - 5} more sequences")
851
  break
852
-
853
- results.append(f"\n🧬 Sequence: {name}")
854
- results.append(f"Length: {len(seq)} bp")
855
-
856
- # Clean sequence
857
  clean_seq = re.sub(r'[^ATCG]', '', seq)
858
  if len(clean_seq) >= 10:
859
- # Run prediction on cleaned sequence
860
- prediction = predict_gene_sequence(clean_seq)
861
- results.append(prediction)
862
  else:
863
  results.append("❌ Sequence too short or invalid")
864
-
865
  results.append("-" * 40)
866
 
867
  return "\n".join(results)
868
-
869
  except Exception as e:
870
- logging.error(f"FASTA processing error: {e}")
871
  return f"FASTA processing failed: {str(e)}"
872
 
873
- # --- Tree Building Interface Functions ---
874
def build_tree_interface(sequence):
    """Build a maximum-likelihood tree for one sequence and return a text report.

    Args:
        sequence: Raw DNA text from the UI; anything other than A/T/C/G is
            stripped after upper-casing.

    Returns:
        A human-readable report string. Validation problems and unexpected
        errors are reported as strings as well; nothing is raised.
    """
    try:
        if not sequence or not sequence.strip():
            return "Please provide a DNA sequence for tree construction."

        bases = re.sub(r'[^ATCG]', '', sequence.upper())
        if len(bases) < 50:
            return "Sequence too short for phylogenetic analysis (minimum 50 bp required)."

        # Delegate the actual MAFFT + IQ-TREE work to the pipeline function.
        success, message, aligned_file, tree_file = build_maximum_likelihood_tree(bases)

        report = [
            "🌳 PHYLOGENETIC TREE CONSTRUCTION\n",
            f"Input sequence length: {len(bases)} bp\n",
            "=" * 50 + "\n\n",
            message,
        ]

        if success and tree_file:
            # Reading the tree back is best-effort; a failure only adds a warning.
            try:
                with open(tree_file, 'r') as handle:
                    newick = handle.read().strip()
            except Exception as read_error:
                report.append(f"\n⚠️ Could not read tree file: {read_error}")
            else:
                report.append("\n\n📄 Tree file content:\n")
                report.append(f"File: {os.path.basename(tree_file)}\n")
                report.append(f"Size: {len(newick)} characters\n")
                # Long trees are truncated to a 500-character preview.
                if len(newick) > 500:
                    report.append(f"Preview: {newick[:500]}...\n")
                else:
                    report.append(f"Content: {newick}\n")

        return "".join(report)

    except Exception as e:
        logging.error(f"Tree building interface error: {e}")
        return f"Tree construction failed: {str(e)}"
918
-
919
def analyze_tree_interface(sequence, similarity_threshold):
    """Validate the inputs, run the similarity tree analysis and format the result.

    Args:
        sequence: Raw DNA text; non-A/T/C/G characters are removed after
            upper-casing.
        similarity_threshold: Percentage, accepted only within 1..99 inclusive.

    Returns:
        A ``(message, html_file)`` tuple. ``html_file`` is ``None`` whenever
        validation fails, the analysis yields no HTML, or an error occurs.
    """
    try:
        if not sequence or not sequence.strip():
            return "Please provide a DNA sequence.", None

        bases = re.sub(r'[^ATCG]', '', sequence.upper())
        if len(bases) < 20:
            return "Sequence too short for analysis (minimum 20 bp required).", None

        if not (1 <= similarity_threshold <= 99):
            return "Similarity threshold must be between 1 and 99%.", None

        html_content, html_file, success_msg = analyze_sequence_for_tree(
            bases, similarity_threshold
        )

        # No HTML means the analysis did not produce a tree; pass its message on.
        if not html_content:
            return success_msg or "Tree analysis failed.", None

        report = "".join([
            "🌳 PHYLOGENETIC TREE ANALYSIS\n",
            f"Input sequence length: {len(bases)} bp\n",
            f"Similarity threshold: {similarity_threshold}%\n",
            "=" * 50 + "\n\n",
            success_msg,
        ])
        return report, html_file

    except Exception as e:
        logging.error(f"Tree analysis interface error: {e}")
        return f"Tree analysis failed: {str(e)}", None
954
-
955
- # --- Gradio Interface ---
956
def _model_status_markdown():
    """Return the model and external-tool availability report as a Markdown list.

    Each entry is emitted as a list item so that it renders on its own line;
    plain newline-joined text would be folded into a single paragraph.
    """
    lines = []

    if boundary_model:
        lines.append("- ✅ Boundary Detection Model: Loaded")
    else:
        lines.append("- ❌ Boundary Detection Model: Not Available")

    if keras_model:
        lines.append("- ✅ Keras Validation Model: Loaded")
    else:
        lines.append("- ❌ Keras Validation Model: Not Available")

    if verification_models:
        lines.append(f"- ✅ Verification Models: {len(verification_models)} loaded")
        # Nested list items, one per loaded verification model.
        lines.extend(f"  - {model_name}" for model_name in verification_models)
    else:
        lines.append("- ❌ Verification Models: None loaded")

    if analyzer:
        lines.append("- ✅ Tree Analyzer: Initialized")
    else:
        lines.append("- ❌ Tree Analyzer: Not Available")

    # External command-line tools used by the maximum-likelihood pipeline.
    mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
    if mafft_available:
        lines.append(f"- ✅ MAFFT: Available ({mafft_cmd})")
    else:
        lines.append("- ❌ MAFFT: Not Available")

    if iqtree_available:
        lines.append(f"- ✅ IQ-TREE: Available ({iqtree_cmd})")
    else:
        lines.append("- ❌ IQ-TREE: Not Available")

    return "\n".join(lines)


def create_gradio_interface():
    """Create the Gradio interface.

    Builds a tabbed ``gr.Blocks`` app with four tabs: gene prediction, FASTA
    file processing, phylogenetic trees (maximum-likelihood and interactive
    analysis sub-tabs) and a model information page.

    Returns:
        The constructed ``gr.Blocks`` object; the caller is responsible for
        launching it.
    """
    # Custom CSS for better styling
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        font-size: 12px;
        line-height: 1.4;
    }
    .tab-nav {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    }
    """

    with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
        gr.Markdown("""
        # 🧬 Advanced Gene Analysis Tool

        This tool provides comprehensive gene sequence analysis including:
        - **Gene Prediction**: Boundary detection and validation
        - **Phylogenetic Analysis**: Tree construction and similarity analysis
        - **File Processing**: Batch analysis of FASTA files
        - **Model Verification**: Multi-model validation pipeline
        """)

        with gr.Tabs():
            # Tab 1: Gene Prediction
            with gr.Tab("🔬 Gene Prediction"):
                gr.Markdown("### Predict gene sequences using trained models")

                with gr.Row():
                    with gr.Column(scale=2):
                        seq_input = gr.Textbox(
                            label="DNA Sequence",
                            placeholder="Enter DNA sequence (A, T, C, G only)...",
                            lines=5,
                            max_lines=10
                        )
                        predict_btn = gr.Button("🚀 Analyze Sequence", variant="primary")

                    with gr.Column(scale=3):
                        prediction_output = gr.Textbox(
                            label="Analysis Results",
                            lines=20,
                            max_lines=30,
                            elem_classes=["output-text"]
                        )

                predict_btn.click(
                    fn=predict_gene_sequence,
                    inputs=[seq_input],
                    outputs=[prediction_output]
                )

            # Tab 2: File Processing
            with gr.Tab("📁 File Processing"):
                gr.Markdown("### Upload and analyze FASTA files")

                with gr.Row():
                    with gr.Column(scale=1):
                        file_input = gr.File(
                            label="Upload FASTA File",
                            file_types=[".fasta", ".fa", ".fas", ".txt"]
                        )
                        process_btn = gr.Button("📊 Process File", variant="primary")

                    with gr.Column(scale=2):
                        file_output = gr.Textbox(
                            label="Processing Results",
                            lines=25,
                            max_lines=35,
                            elem_classes=["output-text"]
                        )

                process_btn.click(
                    fn=process_fasta_file,
                    inputs=[file_input],
                    outputs=[file_output]
                )

            # Tab 3: Phylogenetic Trees
            with gr.Tab("🌳 Phylogenetic Trees"):
                gr.Markdown("### Build and analyze phylogenetic trees")

                with gr.Tabs():
                    # Subtab: ML Tree Construction
                    with gr.Tab("Maximum Likelihood Tree"):
                        gr.Markdown("**Build ML tree using MAFFT + IQ-TREE**")

                        with gr.Row():
                            with gr.Column(scale=1):
                                ml_seq_input = gr.Textbox(
                                    label="DNA Sequence",
                                    placeholder="Enter sequence for ML tree construction...",
                                    lines=4
                                )
                                ml_tree_btn = gr.Button("🌳 Build ML Tree", variant="primary")

                            with gr.Column(scale=2):
                                ml_tree_output = gr.Textbox(
                                    label="ML Tree Results",
                                    lines=20,
                                    elem_classes=["output-text"]
                                )

                        ml_tree_btn.click(
                            fn=build_tree_interface,
                            inputs=[ml_seq_input],
                            outputs=[ml_tree_output]
                        )

                    # Subtab: Interactive Tree Analysis
                    with gr.Tab("Interactive Analysis"):
                        gr.Markdown("**Analyze sequence similarity with interactive tree**")

                        with gr.Row():
                            with gr.Column(scale=1):
                                tree_seq_input = gr.Textbox(
                                    label="Query Sequence",
                                    placeholder="Enter sequence for tree analysis...",
                                    lines=4
                                )
                                similarity_slider = gr.Slider(
                                    minimum=1,
                                    maximum=99,
                                    value=80,
                                    step=1,
                                    label="Similarity Threshold (%)"
                                )
                                tree_analyze_btn = gr.Button("🔍 Analyze Tree", variant="primary")

                            with gr.Column(scale=2):
                                tree_analysis_output = gr.Textbox(
                                    label="Tree Analysis Results",
                                    lines=15,
                                    elem_classes=["output-text"]
                                )
                                tree_file_output = gr.File(
                                    label="Interactive Tree File (HTML)"
                                )

                        tree_analyze_btn.click(
                            fn=analyze_tree_interface,
                            inputs=[tree_seq_input, similarity_slider],
                            outputs=[tree_analysis_output, tree_file_output]
                        )

            # Tab 4: Model Information
            with gr.Tab("ℹ️ Model Information"):
                gr.Markdown("""
                ### Model Status and Information

                **Available Models:**
                """)

                # Snapshot of what was loaded when the interface was built.
                gr.Markdown(_model_status_markdown())

                gr.Markdown("""
                ### Usage Guidelines:

                1. **Gene Prediction**: Input DNA sequences containing only A, T, C, G characters
                2. **File Processing**: Upload FASTA files with multiple sequences
                3. **ML Trees**: Requires MAFFT and IQ-TREE installation
                4. **Interactive Trees**: Uses simplified clustering for quick analysis

                ### System Requirements:
                - Python 3.8+
                - TensorFlow/Keras for neural network models
                - PyTorch for boundary detection
                - MAFFT and IQ-TREE for phylogenetic analysis (optional)
                """)

    return interface
1169
-
1170
# --- Main Application ---
if __name__ == "__main__":
    # Initialize logging.
    # force=True is required: the root logger is already configured by the
    # module-level logging.basicConfig() call, so without it this call would
    # be ignored and the file handler below would never be attached.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('gene_analysis.log'),
            logging.StreamHandler(sys.stdout)
        ],
        force=True
    )

    # Create output directories
    os.makedirs("output", exist_ok=True)
    os.makedirs("ml_tree_output", exist_ok=True)

    # Log startup information
    logging.info("Starting Gene Analysis Tool")
    logging.info(f"Boundary model loaded: {boundary_model is not None}")
    logging.info(f"Keras model loaded: {keras_model is not None}")
    logging.info(f"Verification models loaded: {len(verification_models) if verification_models else 0}")
    logging.info(f"Tree analyzer initialized: {analyzer is not None}")

    # Check external tools
    mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
    logging.info(f"MAFFT available: {mafft_available}")
    logging.info(f"IQ-TREE available: {iqtree_available}")

    # Create and launch interface
    try:
        interface = create_gradio_interface()

        # Launch with appropriate settings
        interface.launch(
            share=False,  # Set to True if you want a public link
            server_name="0.0.0.0",  # Allow external connections
            server_port=7860,
            show_error=True,
            debug=True
        )

    except Exception as e:
        logging.error(f"Failed to launch interface: {e}")
        import traceback
        logging.error(f"Full traceback: {traceback.format_exc()}")
        sys.exit(1)
 
8
  import re
9
  import logging
10
  import numpy as np
 
 
 
11
  import tempfile
12
  import shutil
13
  import sys
14
  from pathlib import Path
15
+ try:
16
+ from predictor import GenePredictor
17
+ except ImportError:
18
+ GenePredictor = None
19
+ try:
20
+ from tensorflow.keras.models import load_model
21
+ except ImportError:
22
+ load_model = None
23
+ try:
24
+ import ml_simplified_tree
25
+ except ImportError:
26
+ ml_simplified_tree = None
27
+ from huggingface_hub import hf_hub_download
28
 
29
  # --- Global Variables ---
30
  MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
31
  IQTREE_PATH = "iqtree/bin/iqtree2" # Update this path as needed
32
CSV_PATH = "f_cleaned.csv"  # Updated to match your naming

# --- Logging Setup ---
# Every record goes both to a log file in the working directory and to stdout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler('gene_analysis.log'), logging.StreamHandler(sys.stdout)],
)

# --- Model Paths and Variables ---
# Hugging Face Hub repository the model artifacts are downloaded from.
MODEL_REPO = "GGproject10/best_boundary_aware_model"

# Module-level handles; they stay None until the corresponding loader
# (load_models / init_tree_analyzer) succeeds.
boundary_model = None
keras_model = None
kmer_to_index = None
analyzer = None
50
 
51
+ # --- Load Models ---
52
def load_models():
    """Download and load the boundary model, the Keras model and its k-mer index.

    Results are stored in the module globals ``boundary_model``,
    ``keras_model`` and ``kmer_to_index``. Every failure is logged as a
    warning and leaves the affected globals set to ``None``; nothing is raised.
    The Hub token is read from the ``HF_TOKEN`` environment variable.
    """
    global boundary_model, keras_model, kmer_to_index
    token = os.getenv("HF_TOKEN")

    # --- Boundary model ---
    if not GenePredictor:
        logging.warning("GenePredictor not available.")
    else:
        try:
            weights_file = hf_hub_download(
                repo_id=MODEL_REPO,
                filename="best_boundary_aware_model.pth",
                token=token,
            )
            boundary_model = GenePredictor(weights_file)
            logging.info("Boundary model loaded successfully.")
        except Exception as exc:
            logging.warning(f"Failed to load boundary model: {exc}")
            boundary_model = None

    # --- Keras model + k-mer vocabulary ---
    if not load_model:
        logging.warning("Keras/TensorFlow not available.")
        return

    try:
        model_file = hf_hub_download(
            repo_id=MODEL_REPO,
            filename="best_model.keras",
            token=token,
        )
        vocab_file = hf_hub_download(
            repo_id=MODEL_REPO,
            filename="kmer_to_index.pkl",
            token=token,
        )
        keras_model = load_model(model_file)
        # NOTE(review): pickle.load executes arbitrary code from the file; this
        # is only safe as long as the Hub repository is trusted.
        with open(vocab_file, "rb") as handle:
            kmer_to_index = pickle.load(handle)
        logging.info("Keras model and k-mer index loaded successfully.")
    except Exception as exc:
        # Model and vocabulary are only usable together, so reset both.
        logging.warning(f"Failed to load Keras model or k-mer index: {exc}")
        keras_model = None
        kmer_to_index = None
95
 
96
  # --- Initialize Tree Analyzer ---
97
def init_tree_analyzer():
    """Create the phylogenetic tree analyzer and store it in the ``analyzer`` global.

    The analyzer is built only when the ``ml_simplified_tree`` module was
    imported and ``CSV_PATH`` exists. If loading the CSV fails, or any
    exception occurs, ``analyzer`` is reset to ``None``. A failure while
    training the AI model is logged but keeps the analyzer.
    """
    global analyzer

    if not (ml_simplified_tree and os.path.exists(CSV_PATH)):
        logging.warning("Tree analyzer or CSV file not available.")
        analyzer = None
        return

    try:
        analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()

        if not analyzer.load_data(CSV_PATH):
            logging.error("Failed to load CSV data.")
            analyzer = None
            return

        logging.info("Tree analyzer initialized successfully.")

        # Training is optional: a failure here must not discard the analyzer.
        try:
            trained = analyzer.train_ai_model()
            if not trained:
                logging.warning("AI model training failed.")
        except Exception as train_error:
            logging.warning(f"AI model training failed: {train_error}")
    except Exception as init_error:
        logging.error(f"Failed to initialize tree analyzer: {init_error}")
        analyzer = None
 
 
 
118
 
119
# --- Tool Detection ---
def _first_executable(candidates):
    """Return the first candidate that resolves to an executable, else ``None``.

    ``shutil.which`` is used on its own (instead of ``os.path.exists``) because
    a bare existence check also accepts directories and non-executable files,
    e.g. a local ``mafft/`` or ``iqtree/`` directory matching the bare command
    name. ``shutil.which`` handles both explicit paths and PATH lookups.
    """
    for cmd in candidates:
        if cmd and shutil.which(cmd):
            return cmd
    return None


def check_tool_availability():
    """Locate the MAFFT and IQ-TREE executables.

    Candidates are tried in order: the configured ``MAFFT_PATH`` /
    ``IQTREE_PATH`` first, then common command names and install locations.

    Returns:
        ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)`` where
        the ``*_cmd`` values are the matching candidate strings or ``None``.
    """
    mafft_candidates = [
        MAFFT_PATH, 'mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', 'mafft.bat'
    ]
    iqtree_candidates = [
        IQTREE_PATH, 'iqtree2', 'iqtree', '/usr/bin/iqtree2', '/usr/local/bin/iqtree2',
        '/usr/bin/iqtree', '/usr/local/bin/iqtree', 'iqtree2.exe', 'iqtree.exe'
    ]

    mafft_cmd = _first_executable(mafft_candidates)
    iqtree_cmd = _first_executable(iqtree_candidates)

    return bool(mafft_cmd), bool(iqtree_cmd), mafft_cmd, iqtree_cmd
133
 
134
+ # --- Installation Guide ---
135
  def install_dependencies_guide():
136
+ return """
 
137
  🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
138
 
139
  For MAFFT:
 
148
  - macOS: brew install iqtree
149
  - Windows: Download from http://www.iqtree.org/
150
 
151
+ Conda: conda install -c bioconda mafft iqtree
 
 
 
 
 
152
  """
 
153
 
154
# --- MAFFT and IQ-TREE Functions ---
def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
    """Align ``input_fasta`` with MAFFT and write the alignment to ``output_fasta``.

    Args:
        input_fasta: Path of the unaligned multi-FASTA file.
        output_fasta: Path the alignment (MAFFT's stdout) is written to.
        mafft_cmd: MAFFT executable name or path.

    Returns:
        ``(True, output_fasta)`` on success, otherwise ``(False, message)``.
        Exceptions (missing executable, 600 s timeout, ...) are logged and
        reported through the message.
    """
    try:
        cmd = [mafft_cmd, '--auto', '--quiet', input_fasta]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
        if result.returncode == 0:
            with open(output_fasta, 'w') as f:
                f.write(result.stdout)
            if os.path.getsize(output_fasta) > 0:
                logging.info(f"MAFFT alignment completed: {output_fasta}")
                return True, output_fasta
            return False, "MAFFT output empty."
        return False, f"MAFFT error: {result.stderr.strip() or 'Unknown error'}"
    except Exception as e:
        logging.error(f"MAFFT failed: {e}")
        return False, f"MAFFT failed: {str(e)}"


def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
    """Infer a maximum-likelihood tree from an alignment with IQ-TREE.

    Args:
        aligned_fasta: Path of the aligned FASTA file.
        output_prefix: Prefix for all IQ-TREE output files; the tree is
            expected at ``<output_prefix>.treefile``.
        iqtree_cmd: IQ-TREE executable name or path.

    Returns:
        ``(True, tree_file)`` on success, otherwise ``(False, message)``.
        Exceptions (missing executable, 1200 s timeout, ...) are logged and
        reported through the message.
    """
    try:
        cmd = [
            iqtree_cmd, '-s', aligned_fasta, '-m', 'MFP', '-bb', '1000',
            '-alrt', '1000', '-nt', 'AUTO', '--prefix', output_prefix, '--quiet',
            # Callers reuse the same prefix for every run. Without -redo,
            # IQ-TREE finds the checkpoint of the previous run and does not
            # recompute, so a stale tree (or an error) would be reported.
            '-redo'
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200)
        tree_file = f"{output_prefix}.treefile"
        if result.returncode == 0 and os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
            logging.info(f"IQ-TREE completed: {tree_file}")
            return True, tree_file
        return False, f"IQ-TREE error: {result.stderr.strip() or 'Tree file not generated'}"
    except Exception as e:
        logging.error(f"IQ-TREE failed: {e}")
        return False, f"IQ-TREE failed: {str(e)}"
186
 
187
+ # --- Fallback Tree Construction ---
188
def create_simple_tree(sequences_dict):
    """Write a flat star tree (Newick) over at most the first five sequences.

    Every leaf gets the fixed branch length ``0.1``. The tree is written to
    ``simple_tree.nwk`` in the current working directory.

    Args:
        sequences_dict: Mapping of sequence name to sequence; only the names
            are used.

    Returns:
        ``(tree_file, message)`` on success, or ``(None, message)`` when fewer
        than two sequences are given or writing fails.
    """
    try:
        seq_names = list(sequences_dict.keys())
        if len(seq_names) < 2:
            return None, "Need at least 2 sequences."

        # Whitespace, brackets, quotes, ':', ';' and ',' are structural in
        # Newick; replace them so arbitrary FASTA headers cannot corrupt the tree.
        labels = [re.sub(r"[\s()\[\]':;,]", "_", str(name)) for name in seq_names[:5]]
        tree_str = f"({','.join(f'{label}:0.1' for label in labels)});"

        tree_file = "simple_tree.nwk"
        with open(tree_file, 'w') as f:
            f.write(tree_str)
        return tree_file, "Simple tree created."
    except Exception as e:
        return None, f"Simple tree creation failed: {str(e)}"
200
 
201
+ # --- Create Multi-FASTA ---
202
def create_multi_fasta(query_sequence, query_id="Query_F_Gene"):
    """Write the query plus reference sequences to a temporary multi-FASTA file.

    References come from ``f_gene_sequences_aligned.fasta`` when that file
    exists; otherwise up to 20 rows of ``analyzer.data`` whose ``sequence``
    is longer than 50 characters are used.

    Args:
        query_sequence: Sequence text written as the first record.
        query_id: FASTA identifier of the query record.

    Returns:
        Path of the temporary file (the caller must delete it), or ``None``
        on failure. A partially written file is removed on failure.
    """
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as temp_fasta:
            temp_path = temp_fasta.name
            temp_fasta.write(f">{query_id}\n{query_sequence}\n")

            ref_fasta_path = "f_gene_sequences_aligned.fasta"
            if os.path.exists(ref_fasta_path):
                # Skip reference records that carry the query's own identifier
                # (e.g. left over from a previous run) so the combined file
                # never holds two records with the same name.
                skipping = False
                with open(ref_fasta_path, 'r') as ref_file:
                    for line in ref_file:
                        if line.startswith('>'):
                            header = line[1:].split()
                            skipping = bool(header) and header[0] == query_id
                        if not skipping:
                            temp_fasta.write(line if line.endswith('\n') else line + '\n')
            elif analyzer and hasattr(analyzer, 'data'):
                # presumably analyzer.data is a pandas DataFrame (iterrows is used) - TODO confirm
                count = 0
                for idx, row in analyzer.data.iterrows():
                    if 'sequence' in row and len(str(row['sequence'])) > 50:
                        temp_fasta.write(f">{row.get('id', f'Ref_{count}')}\n{str(row['sequence']).upper()}\n")
                        count += 1
                        if count >= 20:
                            break
        return temp_path
    except Exception as e:
        logging.error(f"Multi-FASTA creation failed: {e}")
        # Do not leave a half-written temporary file behind.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
        return None
223
 
224
+ # --- Pipeline: Maximum Likelihood Tree ---
225
def build_maximum_likelihood_tree(sequence):
    """Run the MAFFT + IQ-TREE pipeline for one query sequence.

    The sequence is upper-cased and stripped to A/T/C/G, combined with the
    reference sequences, aligned with MAFFT and passed to IQ-TREE. On success
    the alignment and the tree are also copied to
    ``f_gene_sequences_aligned.fasta`` and ``f_gene_sequences.phy.treefile``.

    Args:
        sequence: Raw DNA sequence text.

    Returns:
        ``(success, message, aligned_fasta, tree_file)``. The two paths are
        ``None`` when the corresponding step was not reached or failed. Errors
        are logged and reported through ``message``; nothing is raised.
    """
    try:
        sequence = re.sub(r'[^ATCG]', '', sequence.upper())
        if len(sequence) < 50:
            return False, "Sequence too short (<50 bp).", None, None

        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
        # The marker reflects the real state; a missing tool must not show "✅".
        mafft_icon = "✅" if mafft_available else "❌"
        iqtree_icon = "✅" if iqtree_available else "❌"
        status_msg = "🔍 Dependencies:\n"
        status_msg += f"{mafft_icon} MAFFT: {mafft_cmd or 'Not found'}\n"
        status_msg += f"{iqtree_icon} IQ-TREE: {iqtree_cmd or 'Not found'}\n"

        if not mafft_available or not iqtree_available:
            guide = install_dependencies_guide()
            return False, f"{status_msg}\n❌ Missing tools:\n{guide}", None, None

        os.makedirs("ml_tree_output", exist_ok=True)
        multi_fasta = create_multi_fasta(sequence)
        if not multi_fasta:
            return False, f"{status_msg}\n❌ Failed to create input FASTA.", None, None

        aligned_fasta = "ml_tree_output/aligned_sequences.fasta"
        try:
            mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
        finally:
            # The temporary input is only needed for the alignment step.
            os.unlink(multi_fasta)

        if not mafft_success:
            return False, f"{status_msg}\n❌ MAFFT failed: {mafft_result}", None, None

        tree_prefix = "ml_tree_output/ml_tree"
        iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
        if not iqtree_success:
            return False, f"{status_msg}\n❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None

        tree_file = iqtree_result
        shutil.copy2(aligned_fasta, "f_gene_sequences_aligned.fasta")
        shutil.copy2(tree_file, "f_gene_sequences.phy.treefile")

        success_msg = f"{status_msg}\n✅ ML tree built:\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}"
        return True, success_msg, aligned_fasta, tree_file
    except Exception as e:
        logging.error(f"ML tree construction failed: {e}")
        return False, f"ML tree construction failed: {str(e)}", None, None
266
 
267
+ # --- Pipeline: Verification ---
268
def run_verification_pipeline(sequence):
    """Run every loaded model on ``sequence`` and collect the raw results.

    Args:
        sequence: DNA text; upper-cased and stripped to A/T/C/G before use.

    Returns:
        A dict. ``{"error": ...}`` when fewer than 10 valid bases remain;
        otherwise it may hold a ``"boundary_model"`` and/or ``"keras_model"``
        entry, each either a result dict or ``{"error": ...}`` if that model's
        prediction raised. Models that are not loaded produce no entry.
    """
    bases = re.sub(r'[^ATCG]', '', sequence.upper())
    if len(bases) < 10:
        return {"error": "Sequence too short (<10 bp)."}

    results = {}

    # Boundary model verification
    if boundary_model:
        try:
            predictions, probs, confidence = boundary_model.predict(bases)
            regions = boundary_model.extract_gene_regions(predictions, bases)
            summary = {"type": "boundary_detection", "confidence": float(confidence)}
            summary["regions_found"] = len(regions) if regions else 0
            # Only the first extracted region is reported.
            summary["extracted_sequence"] = regions[0]["sequence"] if regions else None
            results["boundary_model"] = summary
        except Exception as exc:
            results["boundary_model"] = {"error": f"Boundary prediction failed: {str(exc)}"}

    # Keras model verification
    if keras_model and kmer_to_index:
        try:
            # Overlapping 6-mers; k-mers missing from the vocabulary map to index 0.
            indices = [
                kmer_to_index.get(bases[start:start + 6], 0)
                for start in range(len(bases) - 5)
            ]
            scores = keras_model.predict(np.array([indices]), verbose=0)[0]
            results["keras_model"] = {
                "type": "gene_validation",
                "mean_score": float(np.mean(scores)),
                "max_score": float(np.max(scores)),
            }
        except Exception as exc:
            results["keras_model"] = {"error": f"Keras prediction failed: {str(exc)}"}

    return results
305
 
306
+ # --- Format Results ---
307
def format_results(results, sequence, pipeline_type):
    """Render a pipeline result dict as a plain-text report.

    Args:
        results: Output of the verification pipeline, or a dict with
            ``message`` / ``tree_file`` keys for the tree pipeline.
        sequence: The analysed sequence; only its length is reported.
        pipeline_type: ``"prediction"`` or ``"tree"``; any other value yields
            just the header.

    Returns:
        The report as a single newline-joined string.
    """
    header = f"🧬 {pipeline_type.upper()} ANALYSIS\nSequence length: {len(sequence)} bp\n{'=' * 50}"
    lines = [header]

    # A top-level error short-circuits all other sections.
    if "error" in results:
        lines.append(f" Error: {results['error']}")
        return "\n".join(lines)

    if pipeline_type == "tree":
        lines.append(results.get("message", "No tree results available."))
        if results.get("tree_file"):
            lines.append(f"\nTree File: {os.path.basename(results['tree_file'])}")
        return "\n".join(lines)

    if pipeline_type != "prediction":
        return "\n".join(lines)

    if boundary_model and "boundary_model" in results:
        boundary = results["boundary_model"]
        if "error" in boundary:
            lines.append(f"\n❌ Boundary Detection: {boundary['error']}")
        else:
            lines.append("\n🎯 Boundary Detection:")
            lines.append(f"- Confidence: {boundary['confidence']:.3f}")
            lines.append(f"- Regions Found: {boundary['regions_found']}")
            if boundary['extracted_sequence']:
                lines.append(f"- Extracted Length: {len(boundary['extracted_sequence'])} bp")

    if keras_model and "keras_model" in results:
        validation = results["keras_model"]
        if "error" in validation:
            lines.append(f"\n❌ Keras Validation: {validation['error']}")
        else:
            lines.append("\n🔍 Keras Validation:")
            lines.append(f"- Mean Score: {validation['mean_score']:.3f}")
            lines.append(f"- Max Score: {validation['max_score']:.3f}")

    return "\n".join(lines)
341
 
342
+ # --- Interface Functions ---
343
def analyze_sequence(sequence):
    """Clean a DNA sequence, run the verification pipeline and format the report.

    Args:
        sequence: Raw DNA text, or ``None`` / empty when nothing was entered.
            Non-A/T/C/G characters are removed after upper-casing.

    Returns:
        The formatted prediction report, or an error message when fewer than
        10 valid bases remain.
    """
    # An empty input may arrive as None; treat it like an empty string
    # instead of failing on .upper().
    sequence = re.sub(r'[^ATCG]', '', (sequence or "").upper())
    if not sequence or len(sequence) < 10:
        return "Invalid or too short sequence (<10 bp)."

    results = run_verification_pipeline(sequence)
    return format_results(results, sequence, "prediction")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
 
351
def build_tree(sequence):
    """Build a maximum-likelihood tree for ``sequence`` and format the outcome.

    Args:
        sequence: Raw DNA text, or ``None`` / empty when nothing was entered.

    Returns:
        The formatted tree report (which carries the pipeline's message,
        including failure messages).
    """
    # Clean once up front so the reported length counts only the bases that
    # are actually analysed, and so a None input does not crash the formatter.
    cleaned = re.sub(r'[^ATCG]', '', (sequence or "").upper())
    success, message, aligned_fasta, tree_file = build_maximum_likelihood_tree(cleaned)
    return format_results({"message": message, "tree_file": tree_file}, cleaned, "tree")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
+ # --- Gradio Interface ---
356
def create_gradio_interface():
    """Build the single-page Gradio UI.

    The page has a sequence textbox, a FASTA upload, two action buttons, a
    shared results textbox and a tool status list reflecting what was loaded
    at the time the interface is built.

    Returns:
        The constructed ``gr.Blocks`` object; the caller launches it.
    """
    css = """
    .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
    .output-text { font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4; }
    .input-section { margin-bottom: 20px; }
    """

    with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
        gr.Markdown("""
        # 🧬 Gene Analysis Tool
        Analyze DNA sequences, predict gene boundaries, and build phylogenetic trees.
        """)

        # Input Section
        with gr.Row():
            with gr.Column(scale=1):
                seq_input = gr.Textbox(
                    label="DNA Sequence",
                    placeholder="Enter DNA sequence (A, T, C, G only)...",
                    lines=5,
                    max_lines=10
                )
                file_input = gr.File(
                    label="Upload FASTA File",
                    file_types=[".fasta", ".fa", ".fas", ".txt"]
                )
                analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
                tree_btn = gr.Button("🌳 Build Tree", variant="primary")

            with gr.Column(scale=2):
                output = gr.Textbox(
                    label="Results",
                    lines=20,
                    max_lines=30,
                    elem_classes=["output-text"]
                )

        # Status Information
        gr.Markdown("### Tool Status")
        mafft_available, iqtree_available, _, _ = check_tool_availability()
        checks = [
            ("Boundary Model", bool(boundary_model), "Loaded"),
            ("Keras Model", bool(keras_model), "Loaded"),
            ("Tree Analyzer", bool(analyzer), "Initialized"),
            ("MAFFT", mafft_available, "Available"),
            ("IQ-TREE", iqtree_available, "Available"),
        ]
        # The icon follows the real state, and each entry is a list item so
        # the entries render on separate lines instead of one paragraph.
        status = [
            f"- {'✅' if ok else '❌'} {label}: {ready if ok else 'Not Available'}"
            for label, ok, ready in checks
        ]
        gr.Markdown("\n".join(status))

        # Event Handlers
        analyze_btn.click(fn=analyze_sequence, inputs=seq_input, outputs=output)
        tree_btn.click(fn=build_tree, inputs=seq_input, outputs=output)
        file_input.change(fn=process_fasta_file, inputs=file_input, outputs=output)

    return interface
410
 
411
# --- File Processing ---
def process_fasta_file(file):
    """Parse an uploaded FASTA file and analyse its first five sequences.

    Args:
        file: The uploaded file, either as a path string or as an object
            exposing the path through ``.name``; falsy when nothing is
            uploaded.

    Returns:
        A text report. Records with a header but no sequence lines are
        ignored; if a header repeats, the later record replaces the earlier
        one. Errors are logged and returned as a message, not raised.
    """
    try:
        if not file:
            return "Please upload a FASTA file."

        # NOTE(review): the upload component may pass a plain path string or a
        # file-like wrapper depending on the Gradio version/config - accept both.
        path = file if isinstance(file, (str, os.PathLike)) else file.name

        sequences = {}
        current_name = ""
        chunks = []
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith('>'):
                    if current_name and chunks:
                        sequences[current_name] = "".join(chunks)
                    current_name = line[1:]
                    chunks = []
                elif line:
                    chunks.append(line.upper())
        # Flush the final record, which has no following header to trigger it.
        if current_name and chunks:
            sequences[current_name] = "".join(chunks)

        if not sequences:
            return "No valid sequences in FASTA file."

        results = [f"📁 FASTA FILE ANALYSIS\nFound {len(sequences)} sequences\n{'=' * 50}"]
        for i, (name, seq) in enumerate(sequences.items()):
            # Only the first five sequences are analysed; the rest are summarised.
            if i >= 5:
                results.append(f"\n... and {len(sequences) - 5} more sequences")
                break
            results.append(f"\n🧬 Sequence: {name}\nLength: {len(seq)} bp")
            clean_seq = re.sub(r'[^ATCG]', '', seq)
            if len(clean_seq) >= 10:
                results.append(analyze_sequence(clean_seq))
            else:
                results.append("❌ Sequence too short or invalid")
            results.append("-" * 40)

        return "\n".join(results)
    except Exception as e:
        logging.error(f"FASTA processing failed: {e}")
        return f"FASTA processing failed: {str(e)}"
453
 
454
# --- Main ---
if __name__ == "__main__":
    # Working directories used by the tree pipeline.
    os.makedirs("output", exist_ok=True)
    os.makedirs("ml_tree_output", exist_ok=True)

    # Populate the module-level model/analyzer handles before building the UI,
    # because the interface reads them to render its status list.
    load_models()
    init_tree_analyzer()

    logging.info("Starting Gene Analysis Tool")
    logging.info(f"Boundary model: {boundary_model is not None}")
    logging.info(f"Keras model: {keras_model is not None}")
    logging.info(f"Tree analyzer: {analyzer is not None}")

    try:
        interface = create_gradio_interface()
        interface.launch(
            share=False,
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True,
            debug=True
        )
    except Exception as e:
        # logging.exception keeps the traceback, which is otherwise lost
        # because the process exits right away.
        logging.exception(f"Interface launch failed: {e}")
        sys.exit(1)