Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 8, 2025

Commit

0a0ab75

verified ·

1 Parent(s): 79cb8b4

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -25

app.py CHANGED Viewed

@@ -24,7 +24,10 @@ import tempfile
 # Model repository and file paths
 model_repo = "GGproject10/best_boundary_aware_model"
-csv_path = "f cleaned.csv"
 # --- Load Models ---
 boundary_model = None
@@ -33,7 +36,11 @@ kmer_to_index = None
 # Try to load boundary model from Hugging Face Hub
 try:
-    boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth")
     if os.path.exists(boundary_path):
         boundary_model = GenePredictor(boundary_path)
         logging.info("Boundary model loaded successfully from Hugging Face Hub.")
@@ -44,8 +51,16 @@ except Exception as e:
 # Try to load Keras model from Hugging Face Hub
 try:
-    keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras")
-    kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl")
     if os.path.exists(keras_path) and os.path.exists(kmer_path):
         keras_model = load_model(keras_path)
@@ -74,7 +89,7 @@ def predict_with_keras(sequence):
         input_arr = np.array([indices])
         prediction = keras_model.predict(input_arr, verbose=0)[0]
-        # Format prediction
         result = ''.join([str(round(p, 3)) for p in prediction])
         return result
     except Exception as e:
@@ -126,37 +141,48 @@ def run_pipeline(dna_input):
             logging.info("DNA sequence sanitized")
         # Step 1: Boundary Prediction
-        step1_out = dna_input  # Default
         if boundary_model:
             try:
                 predictions, probs, confidence = boundary_model.predict(dna_input)
                 regions = boundary_model.extract_gene_regions(predictions, dna_input)
                 if regions:
-                    step1_out = regions[0]["sequence"]
                 logging.info("Boundary model prediction completed")
             except Exception as e:
                 logging.error(f"Boundary model failed: {e}")
-                step1_out = f"Boundary model error: {str(e)}"
         else:
-            step1_out = f"Boundary model not available. Using original input: {dna_input[:100]}..."
-        # Step 2: Keras Prediction
-        if isinstance(step1_out, str) and not step1_out.startswith("Boundary model error"):
-            step2_out = predict_with_keras(step1_out)
         else:
-            step2_out = "Skipped due to boundary model error"
-        # Step 3: MAFFT and IQ-TREE
         aligned_file = None
         phy_file = None
-        # Only proceed if we have valid sequence data
-        if step2_out and not step2_out.startswith(("Keras", "Skipped")):
             try:
-                # Create FASTA file
                 fasta_file = "input_sequence.fasta"
                 with open(fasta_file, "w") as f:
-                    f.write(">query\n" + step2_out + "\n")
                 # Check if MAFFT is executable
                 if os.path.exists(MAFFT_PATH):
@@ -183,19 +209,28 @@ def run_pipeline(dna_input):
                             logging.error("IQ-TREE not found in system PATH")
                 else:
                     logging.error(f"MAFFT not found at {MAFFT_PATH}")
             except Exception as e:
                 logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
-        # Step 4: ML Simplified Tree
         html_file = None
         tree_html_content = "No tree generated"
         ml_output = ""
-        if os.path.exists(csv_path) and step2_out and not step2_out.startswith(("Keras", "Skipped")):
             try:
                 analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
                 if analyzer.load_data(csv_path):
-                    if analyzer.find_query_sequence(step2_out):
                         matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
                         analyzer.create_interactive_tree(matched_ids, perc)
@@ -204,24 +239,28 @@ def run_pipeline(dna_input):
                             html_file = html_filename
                             with open(html_filename, "r") as f:
                                 tree_html_content = f.read()
-                            ml_output = f"Tree generated successfully with {len(matched_ids)} sequences"
                         else:
                             ml_output = "Tree generation completed but HTML file not found"
                     else:
-                        ml_output = "Query sequence not found in dataset"
                 else:
                     ml_output = "Failed to load CSV dataset"
             except Exception as e:
                 ml_output = f"ML Tree analysis failed: {str(e)}"
                 logging.error(f"ML Tree failed: {e}")
         elif not os.path.exists(csv_path):
             ml_output = f"CSV dataset not found at {csv_path}"
         else:
             ml_output = "Skipped due to previous step errors"
         return (
-            step1_out[:500] + "..." if len(step1_out) > 500 else step1_out,  # Truncate long outputs
-            step2_out[:500] + "..." if len(step2_out) > 500 else step2_out,
             csv_path if os.path.exists(csv_path) else "CSV file not found",
             ml_output,
             html_file,

 # Model repository and file paths
 model_repo = "GGproject10/best_boundary_aware_model"
+csv_path = "f gene clean dataset.csv"
+# Get HF token from environment (if available)
+hf_token = os.getenv("HF_TOKEN")
 # --- Load Models ---
 boundary_model = None
 # Try to load boundary model from Hugging Face Hub
 try:
+    boundary_path = hf_hub_download(
+        repo_id=model_repo,
+        filename="best_boundary_aware_model.pth",
+        token=hf_token
+    )
     if os.path.exists(boundary_path):
         boundary_model = GenePredictor(boundary_path)
         logging.info("Boundary model loaded successfully from Hugging Face Hub.")
 # Try to load Keras model from Hugging Face Hub
 try:
+    keras_path = hf_hub_download(
+        repo_id=model_repo,
+        filename="best_model.keras",
+        token=hf_token
+    )
+    kmer_path = hf_hub_download(
+        repo_id=model_repo,
+        filename="kmer_to_index.pkl",
+        token=hf_token
+    )
     if os.path.exists(keras_path) and os.path.exists(kmer_path):
         keras_model = load_model(keras_path)
         input_arr = np.array([indices])
         prediction = keras_model.predict(input_arr, verbose=0)[0]
+        # Format prediction as probabilities/scores (not a sequence)
         result = ''.join([str(round(p, 3)) for p in prediction])
         return result
     except Exception as e:
             logging.info("DNA sequence sanitized")
         # Step 1: Boundary Prediction
+        processed_sequence = dna_input  # This will be the sequence used for downstream analysis
+        boundary_output = ""
         if boundary_model:
             try:
                 predictions, probs, confidence = boundary_model.predict(dna_input)
                 regions = boundary_model.extract_gene_regions(predictions, dna_input)
                 if regions:
+                    processed_sequence = regions[0]["sequence"]  # Use the extracted gene region
+                    boundary_output = f"Gene region extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})"
+                else:
+                    boundary_output = f"No gene regions found, using full sequence: {len(dna_input)} bp"
+                    processed_sequence = dna_input
                 logging.info("Boundary model prediction completed")
             except Exception as e:
                 logging.error(f"Boundary model failed: {e}")
+                boundary_output = f"Boundary model error: {str(e)}"
+                processed_sequence = dna_input  # Fall back to original sequence
         else:
+            boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
+            processed_sequence = dna_input
+        # Step 2: Keras Prediction (for analysis/scoring, not sequence modification)
+        keras_output = ""
+        if processed_sequence and len(processed_sequence) >= 6:
+            keras_output = predict_with_keras(processed_sequence)
         else:
+            keras_output = "Skipped: sequence too short for k-mer analysis"
+        # Step 3: MAFFT and IQ-TREE (using processed_sequence, not keras output)
         aligned_file = None
         phy_file = None
+        # Use the processed_sequence (from boundary model) for alignment
+        if processed_sequence and len(processed_sequence) >= 10:
             try:
+                # Create FASTA file with the actual DNA sequence
                 fasta_file = "input_sequence.fasta"
                 with open(fasta_file, "w") as f:
+                    f.write(">query\n" + processed_sequence + "\n")
+                logging.info(f"Created FASTA file with sequence length: {len(processed_sequence)}")
                 # Check if MAFFT is executable
                 if os.path.exists(MAFFT_PATH):
                             logging.error("IQ-TREE not found in system PATH")
                 else:
                     logging.error(f"MAFFT not found at {MAFFT_PATH}")
+            except subprocess.CalledProcessError as e:
+                logging.error(f"MAFFT failed with exit code {e.returncode}")
+                logging.error(f"MAFFT stderr: {e.stderr.decode() if e.stderr else 'No stderr'}")
             except Exception as e:
                 logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
+        # Step 4: ML Simplified Tree (using processed_sequence, not keras output)
         html_file = None
         tree_html_content = "No tree generated"
         ml_output = ""
+        if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
             try:
+                logging.info(f"Starting ML tree analysis with sequence length: {len(processed_sequence)}")
                 analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
                 if analyzer.load_data(csv_path):
+                    logging.info("CSV data loaded successfully")
+                    # Use the processed DNA sequence (not keras prediction scores)
+                    if analyzer.find_query_sequence(processed_sequence):
+                        logging.info("Query sequence found in dataset")
                         matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
                         analyzer.create_interactive_tree(matched_ids, perc)
                             html_file = html_filename
                             with open(html_filename, "r") as f:
                                 tree_html_content = f.read()
+                            ml_output = f"Tree generated successfully with {len(matched_ids)} sequences (match: {perc:.1f}%)"
                         else:
                             ml_output = "Tree generation completed but HTML file not found"
                     else:
+                        ml_output = f"Query sequence not found in dataset (length: {len(processed_sequence)} bp)"
+                        logging.warning(f"Query sequence not found. Length: {len(processed_sequence)}")
                 else:
                     ml_output = "Failed to load CSV dataset"
+                    logging.error("Failed to load CSV dataset")
             except Exception as e:
                 ml_output = f"ML Tree analysis failed: {str(e)}"
                 logging.error(f"ML Tree failed: {e}")
         elif not os.path.exists(csv_path):
             ml_output = f"CSV dataset not found at {csv_path}"
+        elif not processed_sequence or len(processed_sequence) < 10:
+            ml_output = f"Sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
         else:
             ml_output = "Skipped due to previous step errors"
         return (
+            boundary_output,
+            keras_output[:500] + "..." if len(keras_output) > 500 else keras_output,
             csv_path if os.path.exists(csv_path) else "CSV file not found",
             ml_output,
             html_file,