Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 9, 2025

Commit

e856e28

verified ·

1 Parent(s): b32e104

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -51

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import os
 import re
 import logging
 import numpy as np
-from predictor import GenePredictor
 from tensorflow.keras.models import load_model
 import ml_simplified_tree
 import tempfile
@@ -26,7 +26,6 @@ IQTREE_PATH = "iqtree/bin/iqtree2"  # Update this path as needed
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # --- Paths ---
-# Model repository and file paths
 model_repo = "GGproject10/best_boundary_aware_model"
 csv_path = "f cleaned.csv"
 classifier_model_dir = "model"  # Directory for second model files
@@ -35,23 +34,15 @@ classifier_model_dir = "model"  # Directory for second model files
 hf_token = os.getenv("HF_TOKEN")
 # --- Load Models ---
-boundary_model = None
 keras_model = None
 kmer_to_index = None
 classifier_model = None
 classifier_kmer_to_index = None
 classifier_maxlen = None
-# Try to load boundary model from Hugging Face Hub
-try:
-    boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth", token=hf_token)
-    if os.path.exists(boundary_path):
-        boundary_model = GenePredictor(boundary_path)
-        logging.info("Boundary model loaded successfully from Hugging Face Hub.")
-    else:
-        logging.warning(f"Boundary model file not found after download")
-except Exception as e:
-    logging.error(f"Failed to load boundary model from HF Hub: {e}")
 # Try to load Keras model from Hugging Face Hub
 try:
@@ -73,17 +64,25 @@ try:
     classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
     classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
     classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
-    if os.path.exists(classifier_path) and os.path.exists(classifier_kmer_path) and os.path.exists(classifier_maxlen_path):
         classifier_model = load_model(classifier_path)
         with open(classifier_kmer_path, "rb") as f:
             classifier_kmer_to_index = pickle.load(f)
         with open(classifier_maxlen_path, "r") as f:
             classifier_maxlen = int(f.read().strip())
         logging.info("Classifier model loaded successfully.")
-    else:
-        logging.warning(f"Classifier model files not found in {classifier_model_dir}")
 except Exception as e:
     logging.error(f"Failed to load classifier model: {e}")
 LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
@@ -468,12 +467,33 @@ def predict_with_keras(sequence):
         return f"Keras prediction failed: {str(e)}"
 def classify_sequence(sequence):
-    """Classify sequence using the second model"""
     try:
         if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
             return {
                 "status": "error",
-                "message": "Classifier model not available.",
                 "confidence": None,
                 "predicted_label": None
             }
@@ -551,7 +571,7 @@ def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
         return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
 def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
-    """Run the full pipeline"""
     try:
         dna_input = dna_input.upper().strip()
         if not dna_input:
@@ -560,27 +580,12 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
-        # Step 1: Boundary Prediction
         processed_sequence = dna_input
-        boundary_output = ""
-        if boundary_model:
-            try:
-                predictions, probs, confidence = boundary_model.predict(dna_input)
-                regions = boundary_model.extract_gene_regions(predictions, dna_input)
-                if regions:
-                    processed_sequence = regions[0]["sequence"]
-                    boundary_output = processed_sequence
-                    logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
-                else:
-                    boundary_output = f"No F gene regions found in input sequence"
-                    logging.warning("No gene regions found, using full sequence")
-            except Exception as e:
-                logging.error(f"Boundary model failed: {e}")
-                boundary_output = f"Boundary model error: {str(e)}"
-        else:
-            boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
-        # Step 2: Keras Prediction
         keras_output = ""
         if processed_sequence and len(processed_sequence) >= 6:
             keras_prediction = predict_with_keras(processed_sequence)
@@ -663,7 +668,7 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
             aligned_file,
             phy_file,
             html_file,
-            f"Pipeline completed. F gene length: {len(processed_sequence)} bp"
         )
     except Exception as e:
         error_msg = f"Pipeline execution failed: {str(e)}"
@@ -687,11 +692,10 @@ def create_interface():
         gr.Markdown("""
         # 🧬 F Gene Analysis Pipeline
-        This tool provides comprehensive analysis of F genes including:
-        - **Gene Boundary Detection**: Extract F gene sequences
-        - **Gene Validation**: Validate with machine learning
-        - **Gene Classification**: Classify sequence type (F gene or other)
-        - **Phylogenetic Analysis**: Build maximum likelihood and simplified trees
         **Instructions:**
         1. Enter your sequence or upload a FASTA file
@@ -717,10 +721,7 @@ def create_interface():
                     status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
                     gr.Markdown("### Available Models")
                     model_status = []
-                    if boundary_model:
-                        model_status.append("✅ Boundary Detection Model")
-                    else:
-                        model_status.append("❌ Boundary Detection Model")
                     if keras_model:
                         model_status.append("✅ Gene Validation Model")
                     else:
@@ -738,7 +739,7 @@ def create_interface():
         with gr.Tab("📊 Results"):
             with gr.Row():
                 with gr.Column():
-                    boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False)
                     keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
                     classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
                     classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
@@ -760,9 +761,9 @@ def create_interface():
             ## About This Tool
             ### F Gene Analysis Pipeline
-            - **🎯 Gene Boundary Detection**: Extracts F gene sequences using deep learning.
             - **🔍 Gene Validation**: Validates with k-mer based machine learning.
-            - **🧬 Gene Classification**: Classifies sequences (F gene or other) with confidence scores.
             - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
             ### Input Requirements

 import re
 import logging
 import numpy as np
+from predictor import GenePredictor  # Kept for potential future use, but not loaded
 from tensorflow.keras.models import load_model
 import ml_simplified_tree
 import tempfile
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # --- Paths ---
 model_repo = "GGproject10/best_boundary_aware_model"
 csv_path = "f cleaned.csv"
 classifier_model_dir = "model"  # Directory for second model files
 hf_token = os.getenv("HF_TOKEN")
 # --- Load Models ---
+boundary_model = None  # Disabled as per request
 keras_model = None
 kmer_to_index = None
 classifier_model = None
 classifier_kmer_to_index = None
 classifier_maxlen = None
+# Note: Boundary Model is disabled as per user request
+logging.info("Boundary Model is currently disabled. Input will be used directly for verification and tree analysis.")
 # Try to load Keras model from Hugging Face Hub
 try:
     classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
     classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
     classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
+    missing_files = []
+    if not os.path.exists(classifier_path):
+        missing_files.append("best_model.keras")
+    if not os.path.exists(classifier_kmer_path):
+        missing_files.append("kmer_to_index.pkl")
+    if not os.path.exists(classifier_maxlen_path):
+        missing_files.append("maxlen.txt")
+    if missing_files:
+        logging.warning(f"Classifier model files not found: {', '.join(missing_files)}")
+    else:
         classifier_model = load_model(classifier_path)
         with open(classifier_kmer_path, "rb") as f:
             classifier_kmer_to_index = pickle.load(f)
         with open(classifier_maxlen_path, "r") as f:
             classifier_maxlen = int(f.read().strip())
         logging.info("Classifier model loaded successfully.")
 except Exception as e:
     logging.error(f"Failed to load classifier model: {e}")
+    logging.warning("Falling back to existing Keras model for validation.")
 LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
         return f"Keras prediction failed: {str(e)}"
 def classify_sequence(sequence):
+    """Classify sequence using the second model or fallback"""
     try:
         if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
+            if keras_model and kmer_to_index:  # Fallback to Keras model
+                logging.warning("Using Keras model as fallback for classification.")
+                if len(sequence) < 6:
+                    return {
+                        "status": "error",
+                        "message": "Sequence too short for k-mer prediction (minimum 6 nucleotides).",
+                        "confidence": None,
+                        "predicted_label": None
+                    }
+                kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
+                indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
+                input_arr = np.array([indices])
+                pred = keras_model.predict(input_arr, verbose=0)[0]
+                confidence = float(np.max(pred))
+                label = "F" if confidence > 0.5 else "Unknown"  # Simple threshold-based fallback
+                return {
+                    "status": "success" if label == "F" else "warning",
+                    "message": f"F gene detected (fallback)" if label == "F" else "Uncertain classification (fallback)",
+                    "confidence": confidence,
+                    "predicted_label": label
+                }
             return {
                 "status": "error",
+                "message": "No classification model available.",
                 "confidence": None,
                 "predicted_label": None
             }
         return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
 def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
+    """Run the full pipeline with direct input to verification and ML tree"""
     try:
         dna_input = dna_input.upper().strip()
         if not dna_input:
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
+        # Step 1: Direct input (Boundary Model disabled)
         processed_sequence = dna_input
+        boundary_output = "Boundary Model disabled. Using raw input: " + str(len(dna_input)) + " bp"
+        logging.info("Using raw input directly for verification and tree analysis")
+        # Step 2: Keras Prediction (Verification)
         keras_output = ""
         if processed_sequence and len(processed_sequence) >= 6:
             keras_prediction = predict_with_keras(processed_sequence)
             aligned_file,
             phy_file,
             html_file,
+            f"Pipeline completed. Input length: {len(processed_sequence)} bp"
         )
     except Exception as e:
         error_msg = f"Pipeline execution failed: {str(e)}"
         gr.Markdown("""
         # 🧬 F Gene Analysis Pipeline
+        This tool analyzes input sequences directly (Boundary Model disabled):
+        - **Gene Validation**: Validates with machine learning.
+        - **Gene Classification**: Classifies sequence type (F gene or other).
+        - **Phylogenetic Analysis**: Builds maximum likelihood and simplified trees.
         **Instructions:**
         1. Enter your sequence or upload a FASTA file
                     status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
                     gr.Markdown("### Available Models")
                     model_status = []
+                    model_status.append("❌ Boundary Detection Model (Disabled)")  # Reflect disabled state
                     if keras_model:
                         model_status.append("✅ Gene Validation Model")
                     else:
         with gr.Tab("📊 Results"):
             with gr.Row():
                 with gr.Column():
+                    boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False, value="Boundary Model disabled. Using raw input.")
                     keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
                     classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
                     classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
             ## About This Tool
             ### F Gene Analysis Pipeline
+            - **🎯 F Gene Extraction**: Disabled; uses raw input directly.
             - **🔍 Gene Validation**: Validates with k-mer based machine learning.
+            - **🧬 Gene Classification**: Classifies sequences (F gene or other).
             - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
             ### Input Requirements