Spaces:
No application file
No application file
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import os
|
|
| 8 |
import re
|
| 9 |
import logging
|
| 10 |
import numpy as np
|
| 11 |
-
from predictor import GenePredictor
|
| 12 |
from tensorflow.keras.models import load_model
|
| 13 |
import ml_simplified_tree
|
| 14 |
import tempfile
|
|
@@ -26,7 +26,6 @@ IQTREE_PATH = "iqtree/bin/iqtree2" # Update this path as needed
|
|
| 26 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 27 |
|
| 28 |
# --- Paths ---
|
| 29 |
-
# Model repository and file paths
|
| 30 |
model_repo = "GGproject10/best_boundary_aware_model"
|
| 31 |
csv_path = "f cleaned.csv"
|
| 32 |
classifier_model_dir = "model" # Directory for second model files
|
|
@@ -35,23 +34,15 @@ classifier_model_dir = "model" # Directory for second model files
|
|
| 35 |
hf_token = os.getenv("HF_TOKEN")
|
| 36 |
|
| 37 |
# --- Load Models ---
|
| 38 |
-
boundary_model = None
|
| 39 |
keras_model = None
|
| 40 |
kmer_to_index = None
|
| 41 |
classifier_model = None
|
| 42 |
classifier_kmer_to_index = None
|
| 43 |
classifier_maxlen = None
|
| 44 |
|
| 45 |
-
#
|
| 46 |
-
|
| 47 |
-
boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth", token=hf_token)
|
| 48 |
-
if os.path.exists(boundary_path):
|
| 49 |
-
boundary_model = GenePredictor(boundary_path)
|
| 50 |
-
logging.info("Boundary model loaded successfully from Hugging Face Hub.")
|
| 51 |
-
else:
|
| 52 |
-
logging.warning(f"Boundary model file not found after download")
|
| 53 |
-
except Exception as e:
|
| 54 |
-
logging.error(f"Failed to load boundary model from HF Hub: {e}")
|
| 55 |
|
| 56 |
# Try to load Keras model from Hugging Face Hub
|
| 57 |
try:
|
|
@@ -73,17 +64,25 @@ try:
|
|
| 73 |
classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
|
| 74 |
classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
|
| 75 |
classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
classifier_model = load_model(classifier_path)
|
| 78 |
with open(classifier_kmer_path, "rb") as f:
|
| 79 |
classifier_kmer_to_index = pickle.load(f)
|
| 80 |
with open(classifier_maxlen_path, "r") as f:
|
| 81 |
classifier_maxlen = int(f.read().strip())
|
| 82 |
logging.info("Classifier model loaded successfully.")
|
| 83 |
-
else:
|
| 84 |
-
logging.warning(f"Classifier model files not found in {classifier_model_dir}")
|
| 85 |
except Exception as e:
|
| 86 |
logging.error(f"Failed to load classifier model: {e}")
|
|
|
|
| 87 |
|
| 88 |
LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
|
| 89 |
|
|
@@ -468,12 +467,33 @@ def predict_with_keras(sequence):
|
|
| 468 |
return f"Keras prediction failed: {str(e)}"
|
| 469 |
|
| 470 |
def classify_sequence(sequence):
|
| 471 |
-
"""Classify sequence using the second model"""
|
| 472 |
try:
|
| 473 |
if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
return {
|
| 475 |
"status": "error",
|
| 476 |
-
"message": "
|
| 477 |
"confidence": None,
|
| 478 |
"predicted_label": None
|
| 479 |
}
|
|
@@ -551,7 +571,7 @@ def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
|
|
| 551 |
return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
|
| 552 |
|
| 553 |
def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
|
| 554 |
-
"""Run the full pipeline"""
|
| 555 |
try:
|
| 556 |
dna_input = dna_input.upper().strip()
|
| 557 |
if not dna_input:
|
|
@@ -560,27 +580,12 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
|
|
| 560 |
dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
|
| 561 |
logging.info("DNA sequence sanitized")
|
| 562 |
|
| 563 |
-
# Step 1: Boundary
|
| 564 |
processed_sequence = dna_input
|
| 565 |
-
boundary_output = ""
|
| 566 |
-
|
| 567 |
-
try:
|
| 568 |
-
predictions, probs, confidence = boundary_model.predict(dna_input)
|
| 569 |
-
regions = boundary_model.extract_gene_regions(predictions, dna_input)
|
| 570 |
-
if regions:
|
| 571 |
-
processed_sequence = regions[0]["sequence"]
|
| 572 |
-
boundary_output = processed_sequence
|
| 573 |
-
logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
|
| 574 |
-
else:
|
| 575 |
-
boundary_output = f"No F gene regions found in input sequence"
|
| 576 |
-
logging.warning("No gene regions found, using full sequence")
|
| 577 |
-
except Exception as e:
|
| 578 |
-
logging.error(f"Boundary model failed: {e}")
|
| 579 |
-
boundary_output = f"Boundary model error: {str(e)}"
|
| 580 |
-
else:
|
| 581 |
-
boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
|
| 582 |
|
| 583 |
-
# Step 2: Keras Prediction
|
| 584 |
keras_output = ""
|
| 585 |
if processed_sequence and len(processed_sequence) >= 6:
|
| 586 |
keras_prediction = predict_with_keras(processed_sequence)
|
|
@@ -663,7 +668,7 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
|
|
| 663 |
aligned_file,
|
| 664 |
phy_file,
|
| 665 |
html_file,
|
| 666 |
-
f"Pipeline completed.
|
| 667 |
)
|
| 668 |
except Exception as e:
|
| 669 |
error_msg = f"Pipeline execution failed: {str(e)}"
|
|
@@ -687,11 +692,10 @@ def create_interface():
|
|
| 687 |
gr.Markdown("""
|
| 688 |
# 🧬 F Gene Analysis Pipeline
|
| 689 |
|
| 690 |
-
This tool
|
| 691 |
-
- **Gene
|
| 692 |
-
- **Gene
|
| 693 |
-
- **
|
| 694 |
-
- **Phylogenetic Analysis**: Build maximum likelihood and simplified trees
|
| 695 |
|
| 696 |
**Instructions:**
|
| 697 |
1. Enter your sequence or upload a FASTA file
|
|
@@ -717,10 +721,7 @@ def create_interface():
|
|
| 717 |
status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
|
| 718 |
gr.Markdown("### Available Models")
|
| 719 |
model_status = []
|
| 720 |
-
|
| 721 |
-
model_status.append("✅ Boundary Detection Model")
|
| 722 |
-
else:
|
| 723 |
-
model_status.append("❌ Boundary Detection Model")
|
| 724 |
if keras_model:
|
| 725 |
model_status.append("✅ Gene Validation Model")
|
| 726 |
else:
|
|
@@ -738,7 +739,7 @@ def create_interface():
|
|
| 738 |
with gr.Tab("📊 Results"):
|
| 739 |
with gr.Row():
|
| 740 |
with gr.Column():
|
| 741 |
-
boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False)
|
| 742 |
keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
|
| 743 |
classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
|
| 744 |
classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
|
|
@@ -760,9 +761,9 @@ def create_interface():
|
|
| 760 |
## About This Tool
|
| 761 |
|
| 762 |
### F Gene Analysis Pipeline
|
| 763 |
-
- **🎯 Gene
|
| 764 |
- **🔍 Gene Validation**: Validates with k-mer based machine learning.
|
| 765 |
-
- **🧬 Gene Classification**: Classifies sequences (F gene or other)
|
| 766 |
- **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
|
| 767 |
|
| 768 |
### Input Requirements
|
|
|
|
| 8 |
import re
|
| 9 |
import logging
|
| 10 |
import numpy as np
|
| 11 |
+
from predictor import GenePredictor # Import kept for potential future use; the boundary model itself is no longer instantiated
|
| 12 |
from tensorflow.keras.models import load_model
|
| 13 |
import ml_simplified_tree
|
| 14 |
import tempfile
|
|
|
|
| 26 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 27 |
|
| 28 |
# --- Paths ---
|
|
|
|
| 29 |
model_repo = "GGproject10/best_boundary_aware_model"
|
| 30 |
csv_path = "f cleaned.csv"
|
| 31 |
classifier_model_dir = "model" # Directory for second model files
|
|
|
|
| 34 |
hf_token = os.getenv("HF_TOKEN")
|
| 35 |
|
| 36 |
# --- Load Models ---
|
| 37 |
+
boundary_model = None # Disabled as per request
|
| 38 |
keras_model = None
|
| 39 |
kmer_to_index = None
|
| 40 |
classifier_model = None
|
| 41 |
classifier_kmer_to_index = None
|
| 42 |
classifier_maxlen = None
|
| 43 |
|
| 44 |
+
# Note: Boundary Model is disabled as per user request
|
| 45 |
+
logging.info("Boundary Model is currently disabled. Input will be used directly for verification and tree analysis.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# Try to load Keras model from Hugging Face Hub
|
| 48 |
try:
|
|
|
|
| 64 |
classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
|
| 65 |
classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
|
| 66 |
classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
|
| 67 |
+
missing_files = []
|
| 68 |
+
if not os.path.exists(classifier_path):
|
| 69 |
+
missing_files.append("best_model.keras")
|
| 70 |
+
if not os.path.exists(classifier_kmer_path):
|
| 71 |
+
missing_files.append("kmer_to_index.pkl")
|
| 72 |
+
if not os.path.exists(classifier_maxlen_path):
|
| 73 |
+
missing_files.append("maxlen.txt")
|
| 74 |
+
if missing_files:
|
| 75 |
+
logging.warning(f"Classifier model files not found: {', '.join(missing_files)}")
|
| 76 |
+
else:
|
| 77 |
classifier_model = load_model(classifier_path)
|
| 78 |
with open(classifier_kmer_path, "rb") as f:
|
| 79 |
classifier_kmer_to_index = pickle.load(f)
|
| 80 |
with open(classifier_maxlen_path, "r") as f:
|
| 81 |
classifier_maxlen = int(f.read().strip())
|
| 82 |
logging.info("Classifier model loaded successfully.")
|
|
|
|
|
|
|
| 83 |
except Exception as e:
|
| 84 |
logging.error(f"Failed to load classifier model: {e}")
|
| 85 |
+
logging.warning("Falling back to existing Keras model for validation.")
|
| 86 |
|
| 87 |
LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
|
| 88 |
|
|
|
|
| 467 |
return f"Keras prediction failed: {str(e)}"
|
| 468 |
|
| 469 |
def classify_sequence(sequence):
|
| 470 |
+
"""Classify sequence using the second model or fallback"""
|
| 471 |
try:
|
| 472 |
if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
|
| 473 |
+
if keras_model and kmer_to_index: # Fallback to Keras model
|
| 474 |
+
logging.warning("Using Keras model as fallback for classification.")
|
| 475 |
+
if len(sequence) < 6:
|
| 476 |
+
return {
|
| 477 |
+
"status": "error",
|
| 478 |
+
"message": "Sequence too short for k-mer prediction (minimum 6 nucleotides).",
|
| 479 |
+
"confidence": None,
|
| 480 |
+
"predicted_label": None
|
| 481 |
+
}
|
| 482 |
+
kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
|
| 483 |
+
indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
|
| 484 |
+
input_arr = np.array([indices])
|
| 485 |
+
pred = keras_model.predict(input_arr, verbose=0)[0]
|
| 486 |
+
confidence = float(np.max(pred))
|
| 487 |
+
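label = "F" if confidence > 0.5 else "Unknown" # Simple threshold-based fallback. NOTE(review): confidence is the max over all model outputs, so "F" is reported whichever output scored highest - confirm keras_model's output layout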
|
| 488 |
+
return {
|
| 489 |
+
"status": "success" if label == "F" else "warning",
|
| 490 |
+
"message": f"F gene detected (fallback)" if label == "F" else "Uncertain classification (fallback)",
|
| 491 |
+
"confidence": confidence,
|
| 492 |
+
"predicted_label": label
|
| 493 |
+
}
|
| 494 |
return {
|
| 495 |
"status": "error",
|
| 496 |
+
"message": "No classification model available.",
|
| 497 |
"confidence": None,
|
| 498 |
"predicted_label": None
|
| 499 |
}
|
|
|
|
| 571 |
return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
|
| 572 |
|
| 573 |
def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
|
| 574 |
+
"""Run the full pipeline with direct input to verification and ML tree"""
|
| 575 |
try:
|
| 576 |
dna_input = dna_input.upper().strip()
|
| 577 |
if not dna_input:
|
|
|
|
| 580 |
dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
|
| 581 |
logging.info("DNA sequence sanitized")
|
| 582 |
|
| 583 |
+
# Step 1: Direct input (Boundary Model disabled)
|
| 584 |
processed_sequence = dna_input
|
| 585 |
+
boundary_output = "Boundary Model disabled. Using raw input: " + str(len(dna_input)) + " bp"
|
| 586 |
+
logging.info("Using raw input directly for verification and tree analysis")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
+
# Step 2: Keras Prediction (Verification)
|
| 589 |
keras_output = ""
|
| 590 |
if processed_sequence and len(processed_sequence) >= 6:
|
| 591 |
keras_prediction = predict_with_keras(processed_sequence)
|
|
|
|
| 668 |
aligned_file,
|
| 669 |
phy_file,
|
| 670 |
html_file,
|
| 671 |
+
f"Pipeline completed. Input length: {len(processed_sequence)} bp"
|
| 672 |
)
|
| 673 |
except Exception as e:
|
| 674 |
error_msg = f"Pipeline execution failed: {str(e)}"
|
|
|
|
| 692 |
gr.Markdown("""
|
| 693 |
# 🧬 F Gene Analysis Pipeline
|
| 694 |
|
| 695 |
+
This tool analyzes input sequences directly (Boundary Model disabled):
|
| 696 |
+
- **Gene Validation**: Validates with machine learning.
|
| 697 |
+
- **Gene Classification**: Classifies sequence type (F gene or other).
|
| 698 |
+
- **Phylogenetic Analysis**: Builds maximum likelihood and simplified trees.
|
|
|
|
| 699 |
|
| 700 |
**Instructions:**
|
| 701 |
1. Enter your sequence or upload a FASTA file
|
|
|
|
| 721 |
status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
|
| 722 |
gr.Markdown("### Available Models")
|
| 723 |
model_status = []
|
| 724 |
+
model_status.append("❌ Boundary Detection Model (Disabled)") # Reflect disabled state
|
|
|
|
|
|
|
|
|
|
| 725 |
if keras_model:
|
| 726 |
model_status.append("✅ Gene Validation Model")
|
| 727 |
else:
|
|
|
|
| 739 |
with gr.Tab("📊 Results"):
|
| 740 |
with gr.Row():
|
| 741 |
with gr.Column():
|
| 742 |
+
boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False, value="Boundary Model disabled. Using raw input.")
|
| 743 |
keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
|
| 744 |
classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
|
| 745 |
classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
|
|
|
|
| 761 |
## About This Tool
|
| 762 |
|
| 763 |
### F Gene Analysis Pipeline
|
| 764 |
+
- **🎯 F Gene Extraction**: Disabled; uses raw input directly.
|
| 765 |
- **🔍 Gene Validation**: Validates with k-mer based machine learning.
|
| 766 |
+
- **🧬 Gene Classification**: Classifies sequences (F gene or other).
|
| 767 |
- **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
|
| 768 |
|
| 769 |
### Input Requirements
|