Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Community

re-type committed on Jun 8, 2025

Commit

b08a8cc

verified ·

1 Parent(s): 78d934e

Update app.py

Browse files

Files changed (1) hide show

app.py +223 -127

app.py CHANGED Viewed

@@ -30,159 +30,255 @@ boundary_model = None
 keras_model = None
 kmer_to_index = None
-if os.path.exists(boundary_path):
-    try:
         boundary_model = GenePredictor(boundary_path)
-        logging.info(f"Boundary model loaded successfully from {boundary_path}")
-    except Exception as e:
-        logging.error(f"Failed to load Boundary model from {boundary_path}: {e}")
-else:
-    logging.error(f"Boundary model file not found at {boundary_path}")
-if os.path.exists(keras_path) and os.path.exists(kmer_path):
-    try:
         keras_model = load_model(keras_path)
         with open(kmer_path, "rb") as f:
             kmer_to_index = pickle.load(f)
-        logging.info(f"Keras model and k-mer index loaded successfully from {keras_path} and {kmer_path}")
-    except Exception as e:
-        logging.error(f"Failed to load Keras model or kmer_to_index from {keras_path} or {kmer_path}: {e}")
-else:
-    logging.error(f"Keras model or kmer_to_index file not found at {keras_path} or {kmer_path}")
 # --- Keras Prediction ---
 def predict_with_keras(sequence):
-    if len(sequence) < 6:
-        return "Sequence too short for k-mer prediction."
-    kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
-    indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
-    input_arr = np.array([indices])  # Changed from torch.tensor to np.array
-    prediction = keras_model.predict(input_arr, verbose=0)[0]
-    return ''.join(str(round(p, 3)) for p in prediction)
 # --- FASTA Reader ---
-def read_fasta_file(f):
-    content = f.read().decode("utf-8") if hasattr(f, "read") else open(f, "r").read()
-    lines = content.strip().split("\n")
-    seq_lines = [line.strip() for line in lines if not line.startswith(">")]
-    return ''.join(seq_lines)
 # --- Full Pipeline ---
 def run_pipeline_from_file(fasta_file_obj):
-    dna_input = read_fasta_file(fasta_file_obj)
-    return run_pipeline(dna_input)
 def run_pipeline(dna_input):
-    dna_input = dna_input.upper()
-    if not re.match('^[ACTGN]+$', dna_input):
-        dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
-        logging.warning("Invalid DNA sequence characters replaced with 'N'.")
-    # Step 1: Boundary Prediction
-    if boundary_model:
-        try:
-            predictions, probs, confidence = boundary_model.predict(dna_input)
-            regions = boundary_model.extract_gene_regions(predictions, dna_input)
-            step1_out = regions[0]["sequence"] if regions else dna_input
-            logging.info(f"Boundary model output: {step1_out[:50]}... (truncated)")
-        except Exception as e:
-            logging.error(f"Boundary model prediction failed: {e}")
-            step1_out = dna_input
-    else:
-        step1_out = dna_input
-        logging.info("Boundary model skipped due to loading failure or missing file")
-    # Step 2: Keras Prediction
-    if keras_model and kmer_to_index:
-        try:
-            step2_out = predict_with_keras(step1_out)
-            logging.info(f"Keras model output: {step2_out[:50]}... (truncated)")
-        except Exception as e:
-            logging.error(f"Keras prediction failed: {e}")
-            step2_out = step1_out
-    else:
-        step2_out = step1_out
-        logging.info("Keras model skipped due to loading failure or missing file")
-    # Save to FASTA for MAFFT/IQTREE (optional, can be skipped if ML tree is independent)
-    fasta_file = "input_sequence.fasta"
-    with open(fasta_file, "w") as f:
-        f.write(">query\n" + step2_out + "\n")
-    # Run MAFFT
-    aligned_file = "aligned.fasta"
-    mafft_exec = MAFFT_PATH  # Use global variable
-    try:
-        subprocess.run([mafft_exec, "--auto", fasta_file], stdout=open(aligned_file, "w"), check=True)
-    except Exception as e:
         aligned_file = None
-        logging.error(f"MAFFT failed: {e}")
-    # Run IQ-TREE (only if alignment exists)
-    phy_file = "input_sequence.phy"
-    if aligned_file is not None:
-        try:
-            subprocess.run(["iqtree2", "-s", aligned_file, "-nt", "AUTO"], check=True)
-        except Exception as e:
-            logging.error(f"IQ-TREE failed: {e}")
-    else:
-        logging.warning("Skipping IQ-TREE due to missing alignment file")
-    # Step 3: ML Simplified Tree (independent of MAFFT/IQ-TREE)
-    html_file = "phylogenetic_tree_normalized_horizontal.html"
-    ml_output = ""
-    tree_html_content = "No tree generated."
-    if os.path.exists(csv_path):
-        analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
-        if analyzer.load_data(csv_path):
-            if analyzer.find_query_sequence(step2_out):
-                matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
-                try:
-                    analyzer.create_interactive_tree(matched_ids, perc)
-                    ml_output = "Tree generated."
-                    if os.path.exists(html_file):
-                        with open(html_file, "r") as f:
-                            tree_html_content = f.read()
                     else:
-                        ml_output += " (HTML file not generated)"
-                        logging.error("HTML file not created after tree generation")
-                except Exception as e:
-                    ml_output = f"Error creating tree: {e}"
-                    logging.error(f"Tree creation failed: {e}")
-            else:
-                ml_output = "Query sequence not found in CSV."
-                logging.warning(f"Query sequence {step2_out[:50]}... not found")
         else:
-            ml_output = "Failed to load CSV."
-            logging.error("CSV loading failed")
-    else:
-        ml_output = "CSV file missing."
-        logging.error(f"CSV file not found at {csv_path}")
-    return step1_out, step2_out, csv_path, ml_output, html_file, aligned_file, phy_file, tree_html_content
 # --- Gradio UI ---
-with gr.Blocks() as demo:
     gr.Markdown("# Viral Gene Phylogenetic Inference Pipeline")
-    with gr.Tab("Paste DNA"):
-        inp = gr.Textbox(label="DNA Input")
-        btn1 = gr.Button("Run Pipeline")
-    with gr.Tab("Upload FASTA"):
-        file_input = gr.File(label="FASTA File", file_types=['.fasta', '.fa'])
-        btn2 = gr.Button("Run on FASTA")
-    out1 = gr.Textbox(label="Boundary Model Output")
-    out2 = gr.Textbox(label="Keras Model Output")
-    out3 = gr.Textbox(label="CSV File Used")
-    out4 = gr.Textbox(label="ML Tree Output")
-    html = gr.File(label="ML Tree (HTML)", file_types=['.html'])
-    fasta = gr.File(label="Aligned FASTA", file_types=['.fasta'])
-    phy = gr.File(label="IQ-TREE .phy File", file_types=['.phy'])
     tree_html = gr.HTML(label="Interactive Tree Preview")
-    btn1.click(fn=run_pipeline, inputs=inp, outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html])
-    btn2.click(fn=run_pipeline_from_file, inputs=file_input, outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html])
 if __name__ == '__main__':
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 keras_model = None
 kmer_to_index = None
+# Try to load boundary model
+try:
+    if os.path.exists(boundary_path):
         boundary_model = GenePredictor(boundary_path)
+        logging.info("Boundary model loaded successfully.")
+    else:
+        logging.warning(f"Boundary model file not found at {boundary_path}")
+except Exception as e:
+    logging.error(f"Failed to load boundary model: {e}")
+# Try to load Keras model
+try:
+    if os.path.exists(keras_path) and os.path.exists(kmer_path):
         keras_model = load_model(keras_path)
         with open(kmer_path, "rb") as f:
             kmer_to_index = pickle.load(f)
+        logging.info("Keras model and k-mer index loaded successfully.")
+    else:
+        logging.warning(f"Keras model or kmer files not found at {keras_path} or {kmer_path}")
+except Exception as e:
+    logging.error(f"Failed to load Keras model: {e}")
 # --- Keras Prediction ---
 def predict_with_keras(sequence):
+    try:
+        if not keras_model or not kmer_to_index:
+            return f"Keras model not available. Input sequence: {sequence[:100]}..."
+        if len(sequence) < 6:
+            return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
+        # Generate k-mers
+        kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
+        indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
+        # Prepare input
+        input_arr = np.array([indices])
+        prediction = keras_model.predict(input_arr, verbose=0)[0]
+        # Format prediction
+        result = ''.join([str(round(p, 3)) for p in prediction])
+        return result
+    except Exception as e:
+        logging.error(f"Keras prediction failed: {e}")
+        return f"Keras prediction failed: {str(e)}"
 # --- FASTA Reader ---
+def read_fasta_file(file_obj):
+    try:
+        if file_obj is None:
+            return ""
+        # Handle file object
+        if hasattr(file_obj, 'name'):
+            with open(file_obj.name, "r") as f:
+                content = f.read()
+        else:
+            content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
+        lines = content.strip().split("\n")
+        seq_lines = [line.strip() for line in lines if not line.startswith(">")]
+        return ''.join(seq_lines)
+    except Exception as e:
+        logging.error(f"Failed to read FASTA file: {e}")
+        return ""
 # --- Full Pipeline ---
 def run_pipeline_from_file(fasta_file_obj):
+    try:
+        dna_input = read_fasta_file(fasta_file_obj)
+        if not dna_input:
+            return "Failed to read FASTA file", "", "", "", None, None, None, "No input sequence"
+        return run_pipeline(dna_input)
+    except Exception as e:
+        error_msg = f"Pipeline error: {str(e)}"
+        logging.error(error_msg)
+        return error_msg, "", "", "", None, None, None, error_msg
 def run_pipeline(dna_input):
+    try:
+        # Clean input
+        dna_input = dna_input.upper().strip()
+        if not dna_input:
+            return "Empty input", "", "", "", None, None, None, "No input provided"
+        # Sanitize DNA sequence
+        if not re.match('^[ACTGN]+$', dna_input):
+            dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
+            logging.info("DNA sequence sanitized")
+        # Step 1: Boundary Prediction
+        step1_out = dna_input  # Default
+        if boundary_model:
+            try:
+                predictions, probs, confidence = boundary_model.predict(dna_input)
+                regions = boundary_model.extract_gene_regions(predictions, dna_input)
+                if regions:
+                    step1_out = regions[0]["sequence"]
+                logging.info("Boundary model prediction completed")
+            except Exception as e:
+                logging.error(f"Boundary model failed: {e}")
+                step1_out = f"Boundary model error: {str(e)}"
+        else:
+            step1_out = f"Boundary model not available. Using original input: {dna_input[:100]}..."
+        # Step 2: Keras Prediction
+        if isinstance(step1_out, str) and not step1_out.startswith("Boundary model error"):
+            step2_out = predict_with_keras(step1_out)
+        else:
+            step2_out = "Skipped due to boundary model error"
+        # Step 3: MAFFT and IQ-TREE
         aligned_file = None
+        phy_file = None
+        # Only proceed if we have valid sequence data
+        if step2_out and not step2_out.startswith(("Keras", "Skipped")):
+            try:
+                # Create FASTA file
+                fasta_file = "input_sequence.fasta"
+                with open(fasta_file, "w") as f:
+                    f.write(">query\n" + step2_out + "\n")
+                # Check if MAFFT is executable
+                if os.path.exists(MAFFT_PATH):
+                    # Make MAFFT executable
+                    os.chmod(MAFFT_PATH, 0o755)
+                    # Run MAFFT
+                    aligned_file = "aligned.fasta"
+                    with open(aligned_file, "w") as outfile:
+                        result = subprocess.run([MAFFT_PATH, "--auto", fasta_file],
+                                              stdout=outfile, stderr=subprocess.PIPE, check=True)
+                    logging.info("MAFFT alignment completed")
+                    # Run IQ-TREE if alignment successful
+                    if os.path.exists(aligned_file):
+                        try:
+                            subprocess.run(["iqtree2", "-s", aligned_file, "-nt", "AUTO"],
+                                         check=True, capture_output=True)
+                            phy_file = "input_sequence.phy"
+                            logging.info("IQ-TREE analysis completed")
+                        except subprocess.CalledProcessError as e:
+                            logging.error(f"IQ-TREE failed: {e}")
+                        except FileNotFoundError:
+                            logging.error("IQ-TREE not found in system PATH")
+                else:
+                    logging.error(f"MAFFT not found at {MAFFT_PATH}")
+            except Exception as e:
+                logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
+        # Step 4: ML Simplified Tree
+        html_file = None
+        tree_html_content = "No tree generated"
+        ml_output = ""
+        if os.path.exists(csv_path) and step2_out and not step2_out.startswith(("Keras", "Skipped")):
+            try:
+                analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
+                if analyzer.load_data(csv_path):
+                    if analyzer.find_query_sequence(step2_out):
+                        matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
+                        analyzer.create_interactive_tree(matched_ids, perc)
+                        html_filename = "phylogenetic_tree_normalized_horizontal.html"
+                        if os.path.exists(html_filename):
+                            html_file = html_filename
+                            with open(html_filename, "r") as f:
+                                tree_html_content = f.read()
+                            ml_output = f"Tree generated successfully with {len(matched_ids)} sequences"
+                        else:
+                            ml_output = "Tree generation completed but HTML file not found"
                     else:
+                        ml_output = "Query sequence not found in dataset"
+                else:
+                    ml_output = "Failed to load CSV dataset"
+            except Exception as e:
+                ml_output = f"ML Tree analysis failed: {str(e)}"
+                logging.error(f"ML Tree failed: {e}")
+        elif not os.path.exists(csv_path):
+            ml_output = f"CSV dataset not found at {csv_path}"
         else:
+            ml_output = "Skipped due to previous step errors"
+        return (
+            step1_out[:500] + "..." if len(step1_out) > 500 else step1_out,  # Truncate long outputs
+            step2_out[:500] + "..." if len(step2_out) > 500 else step2_out,
+            csv_path if os.path.exists(csv_path) else "CSV file not found",
+            ml_output,
+            html_file,
+            aligned_file if aligned_file and os.path.exists(aligned_file) else None,
+            phy_file if phy_file and os.path.exists(phy_file) else None,
+            tree_html_content
+        )
+    except Exception as e:
+        error_msg = f"Pipeline failed: {str(e)}"
+        logging.error(error_msg)
+        return error_msg, "", "", "", None, None, None, error_msg
 # --- Gradio UI ---
+with gr.Blocks(title="Viral Gene Phylogenetic Pipeline") as demo:
     gr.Markdown("# Viral Gene Phylogenetic Inference Pipeline")
+    gr.Markdown("This pipeline processes DNA sequences through boundary detection, k-mer analysis, and phylogenetic tree construction.")
+    with gr.Tab("Paste DNA Sequence"):
+        inp = gr.Textbox(
+            label="DNA Input",
+            placeholder="Paste your DNA sequence here (ACTG format)",
+            lines=5
+        )
+        btn1 = gr.Button("Run Pipeline", variant="primary")
+    with gr.Tab("Upload FASTA File"):
+        file_input = gr.File(
+            label="FASTA File",
+            file_types=['.fasta', '.fa', '.txt']
+        )
+        btn2 = gr.Button("Run on FASTA", variant="primary")
+    # Outputs
+    gr.Markdown("## Pipeline Results")
+    with gr.Row():
+        with gr.Column():
+            out1 = gr.Textbox(label="Step 1: Boundary Model Output", lines=3)
+            out2 = gr.Textbox(label="Step 2: Keras Model Output", lines=3)
+        with gr.Column():
+            out3 = gr.Textbox(label="Dataset Used")
+            out4 = gr.Textbox(label="Step 3: ML Tree Status", lines=3)
+    with gr.Row():
+        html = gr.File(label="Download Tree (HTML)")
+        fasta = gr.File(label="Download Aligned FASTA")
+        phy = gr.File(label="Download IQ-TREE .phy File")
     tree_html = gr.HTML(label="Interactive Tree Preview")
+    # Event handlers
+    btn1.click(
+        fn=run_pipeline,
+        inputs=inp,
+        outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html]
+    )
+    btn2.click(
+        fn=run_pipeline_from_file,
+        inputs=file_input,
+        outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html]
+    )
 if __name__ == '__main__':
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)