re-type committed on
Commit
e856e28
·
verified ·
1 Parent(s): b32e104

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -51
app.py CHANGED
@@ -8,7 +8,7 @@ import os
8
  import re
9
  import logging
10
  import numpy as np
11
- from predictor import GenePredictor
12
  from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
14
  import tempfile
@@ -26,7 +26,6 @@ IQTREE_PATH = "iqtree/bin/iqtree2" # Update this path as needed
26
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
27
 
28
  # --- Paths ---
29
- # Model repository and file paths
30
  model_repo = "GGproject10/best_boundary_aware_model"
31
  csv_path = "f cleaned.csv"
32
  classifier_model_dir = "model" # Directory for second model files
@@ -35,23 +34,15 @@ classifier_model_dir = "model" # Directory for second model files
35
  hf_token = os.getenv("HF_TOKEN")
36
 
37
  # --- Load Models ---
38
- boundary_model = None
39
  keras_model = None
40
  kmer_to_index = None
41
  classifier_model = None
42
  classifier_kmer_to_index = None
43
  classifier_maxlen = None
44
 
45
- # Try to load boundary model from Hugging Face Hub
46
- try:
47
- boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth", token=hf_token)
48
- if os.path.exists(boundary_path):
49
- boundary_model = GenePredictor(boundary_path)
50
- logging.info("Boundary model loaded successfully from Hugging Face Hub.")
51
- else:
52
- logging.warning(f"Boundary model file not found after download")
53
- except Exception as e:
54
- logging.error(f"Failed to load boundary model from HF Hub: {e}")
55
 
56
  # Try to load Keras model from Hugging Face Hub
57
  try:
@@ -73,17 +64,25 @@ try:
73
  classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
74
  classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
75
  classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
76
- if os.path.exists(classifier_path) and os.path.exists(classifier_kmer_path) and os.path.exists(classifier_maxlen_path):
 
 
 
 
 
 
 
 
 
77
  classifier_model = load_model(classifier_path)
78
  with open(classifier_kmer_path, "rb") as f:
79
  classifier_kmer_to_index = pickle.load(f)
80
  with open(classifier_maxlen_path, "r") as f:
81
  classifier_maxlen = int(f.read().strip())
82
  logging.info("Classifier model loaded successfully.")
83
- else:
84
- logging.warning(f"Classifier model files not found in {classifier_model_dir}")
85
  except Exception as e:
86
  logging.error(f"Failed to load classifier model: {e}")
 
87
 
88
  LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
89
 
@@ -468,12 +467,33 @@ def predict_with_keras(sequence):
468
  return f"Keras prediction failed: {str(e)}"
469
 
470
  def classify_sequence(sequence):
471
- """Classify sequence using the second model"""
472
  try:
473
  if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  return {
475
  "status": "error",
476
- "message": "Classifier model not available.",
477
  "confidence": None,
478
  "predicted_label": None
479
  }
@@ -551,7 +571,7 @@ def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
551
  return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
552
 
553
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
554
- """Run the full pipeline"""
555
  try:
556
  dna_input = dna_input.upper().strip()
557
  if not dna_input:
@@ -560,27 +580,12 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
560
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
561
  logging.info("DNA sequence sanitized")
562
 
563
- # Step 1: Boundary Prediction
564
  processed_sequence = dna_input
565
- boundary_output = ""
566
- if boundary_model:
567
- try:
568
- predictions, probs, confidence = boundary_model.predict(dna_input)
569
- regions = boundary_model.extract_gene_regions(predictions, dna_input)
570
- if regions:
571
- processed_sequence = regions[0]["sequence"]
572
- boundary_output = processed_sequence
573
- logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
574
- else:
575
- boundary_output = f"No F gene regions found in input sequence"
576
- logging.warning("No gene regions found, using full sequence")
577
- except Exception as e:
578
- logging.error(f"Boundary model failed: {e}")
579
- boundary_output = f"Boundary model error: {str(e)}"
580
- else:
581
- boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
582
 
583
- # Step 2: Keras Prediction
584
  keras_output = ""
585
  if processed_sequence and len(processed_sequence) >= 6:
586
  keras_prediction = predict_with_keras(processed_sequence)
@@ -663,7 +668,7 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
663
  aligned_file,
664
  phy_file,
665
  html_file,
666
- f"Pipeline completed. F gene length: {len(processed_sequence)} bp"
667
  )
668
  except Exception as e:
669
  error_msg = f"Pipeline execution failed: {str(e)}"
@@ -687,11 +692,10 @@ def create_interface():
687
  gr.Markdown("""
688
  # 🧬 F Gene Analysis Pipeline
689
 
690
- This tool provides comprehensive analysis of F genes including:
691
- - **Gene Boundary Detection**: Extract F gene sequences
692
- - **Gene Validation**: Validate with machine learning
693
- - **Gene Classification**: Classify sequence type (F gene or other)
694
- - **Phylogenetic Analysis**: Build maximum likelihood and simplified trees
695
 
696
  **Instructions:**
697
  1. Enter your sequence or upload a FASTA file
@@ -717,10 +721,7 @@ def create_interface():
717
  status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
718
  gr.Markdown("### Available Models")
719
  model_status = []
720
- if boundary_model:
721
- model_status.append("✅ Boundary Detection Model")
722
- else:
723
- model_status.append("❌ Boundary Detection Model")
724
  if keras_model:
725
  model_status.append("✅ Gene Validation Model")
726
  else:
@@ -738,7 +739,7 @@ def create_interface():
738
  with gr.Tab("📊 Results"):
739
  with gr.Row():
740
  with gr.Column():
741
- boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False)
742
  keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
743
  classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
744
  classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
@@ -760,9 +761,9 @@ def create_interface():
760
  ## About This Tool
761
 
762
  ### F Gene Analysis Pipeline
763
- - **🎯 Gene Boundary Detection**: Extracts F gene sequences using deep learning.
764
  - **🔍 Gene Validation**: Validates with k-mer based machine learning.
765
- - **🧬 Gene Classification**: Classifies sequences (F gene or other) with confidence scores.
766
  - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
767
 
768
  ### Input Requirements
 
8
  import re
9
  import logging
10
  import numpy as np
11
+ from predictor import GenePredictor # Kept for potential future use, but not loaded
12
  from tensorflow.keras.models import load_model
13
  import ml_simplified_tree
14
  import tempfile
 
26
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
27
 
28
  # --- Paths ---
 
29
  model_repo = "GGproject10/best_boundary_aware_model"
30
  csv_path = "f cleaned.csv"
31
  classifier_model_dir = "model" # Directory for second model files
 
34
  hf_token = os.getenv("HF_TOKEN")
35
 
36
  # --- Load Models ---
37
+ boundary_model = None # Disabled as per request
38
  keras_model = None
39
  kmer_to_index = None
40
  classifier_model = None
41
  classifier_kmer_to_index = None
42
  classifier_maxlen = None
43
 
44
+ # Note: Boundary Model is disabled as per user request
45
+ logging.info("Boundary Model is currently disabled. Input will be used directly for verification and tree analysis.")
 
 
 
 
 
 
 
 
46
 
47
  # Try to load Keras model from Hugging Face Hub
48
  try:
 
64
  classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
65
  classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
66
  classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
67
+ missing_files = []
68
+ if not os.path.exists(classifier_path):
69
+ missing_files.append("best_model.keras")
70
+ if not os.path.exists(classifier_kmer_path):
71
+ missing_files.append("kmer_to_index.pkl")
72
+ if not os.path.exists(classifier_maxlen_path):
73
+ missing_files.append("maxlen.txt")
74
+ if missing_files:
75
+ logging.warning(f"Classifier model files not found: {', '.join(missing_files)}")
76
+ else:
77
  classifier_model = load_model(classifier_path)
78
  with open(classifier_kmer_path, "rb") as f:
79
  classifier_kmer_to_index = pickle.load(f)
80
  with open(classifier_maxlen_path, "r") as f:
81
  classifier_maxlen = int(f.read().strip())
82
  logging.info("Classifier model loaded successfully.")
 
 
83
  except Exception as e:
84
  logging.error(f"Failed to load classifier model: {e}")
85
+ logging.warning("Falling back to existing Keras model for validation.")
86
 
87
  LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
88
 
 
467
  return f"Keras prediction failed: {str(e)}"
468
 
469
  def classify_sequence(sequence):
470
+ """Classify sequence using the second model or fallback"""
471
  try:
472
  if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
473
+ if keras_model and kmer_to_index: # Fallback to Keras model
474
+ logging.warning("Using Keras model as fallback for classification.")
475
+ if len(sequence) < 6:
476
+ return {
477
+ "status": "error",
478
+ "message": "Sequence too short for k-mer prediction (minimum 6 nucleotides).",
479
+ "confidence": None,
480
+ "predicted_label": None
481
+ }
482
+ kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
483
+ indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
484
+ input_arr = np.array([indices])
485
+ pred = keras_model.predict(input_arr, verbose=0)[0]
486
+ confidence = float(np.max(pred))
487
+ label = "F" if confidence > 0.5 else "Unknown" # Simple threshold-based fallback
488
+ return {
489
+ "status": "success" if label == "F" else "warning",
490
+ "message": f"F gene detected (fallback)" if label == "F" else "Uncertain classification (fallback)",
491
+ "confidence": confidence,
492
+ "predicted_label": label
493
+ }
494
  return {
495
  "status": "error",
496
+ "message": "No classification model available.",
497
  "confidence": None,
498
  "predicted_label": None
499
  }
 
571
  return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
572
 
573
  def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
574
+ """Run the full pipeline with direct input to verification and ML tree"""
575
  try:
576
  dna_input = dna_input.upper().strip()
577
  if not dna_input:
 
580
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
581
  logging.info("DNA sequence sanitized")
582
 
583
+ # Step 1: Direct input (Boundary Model disabled)
584
  processed_sequence = dna_input
585
+ boundary_output = "Boundary Model disabled. Using raw input: " + str(len(dna_input)) + " bp"
586
+ logging.info("Using raw input directly for verification and tree analysis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
 
588
+ # Step 2: Keras Prediction (Verification)
589
  keras_output = ""
590
  if processed_sequence and len(processed_sequence) >= 6:
591
  keras_prediction = predict_with_keras(processed_sequence)
 
668
  aligned_file,
669
  phy_file,
670
  html_file,
671
+ f"Pipeline completed. Input length: {len(processed_sequence)} bp"
672
  )
673
  except Exception as e:
674
  error_msg = f"Pipeline execution failed: {str(e)}"
 
692
  gr.Markdown("""
693
  # 🧬 F Gene Analysis Pipeline
694
 
695
+ This tool analyzes input sequences directly (Boundary Model disabled):
696
+ - **Gene Validation**: Validates with machine learning.
697
+ - **Gene Classification**: Classifies sequence type (F gene or other).
698
+ - **Phylogenetic Analysis**: Builds maximum likelihood and simplified trees.
 
699
 
700
  **Instructions:**
701
  1. Enter your sequence or upload a FASTA file
 
721
  status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
722
  gr.Markdown("### Available Models")
723
  model_status = []
724
+ model_status.append("❌ Boundary Detection Model (Disabled)") # Reflect disabled state
 
 
 
725
  if keras_model:
726
  model_status.append("✅ Gene Validation Model")
727
  else:
 
739
  with gr.Tab("📊 Results"):
740
  with gr.Row():
741
  with gr.Column():
742
+ boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False, value="Boundary Model disabled. Using raw input.")
743
  keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
744
  classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
745
  classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
 
761
  ## About This Tool
762
 
763
  ### F Gene Analysis Pipeline
764
+ - **🎯 F Gene Extraction**: Disabled; uses raw input directly.
765
  - **🔍 Gene Validation**: Validates with k-mer based machine learning.
766
+ - **🧬 Gene Classification**: Classifies sequences (F gene or other).
767
  - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
768
 
769
  ### Input Requirements