Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Community

re-type committed on Jun 8, 2025

Commit

365584f

verified ·

1 Parent(s): 0a0ab75

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -66

app.py CHANGED Viewed

@@ -140,7 +140,7 @@ def run_pipeline(dna_input):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
-        # Step 1: Boundary Prediction
         processed_sequence = dna_input  # This will be the sequence used for downstream analysis
         boundary_output = ""
@@ -150,10 +150,12 @@ def run_pipeline(dna_input):
                 regions = boundary_model.extract_gene_regions(predictions, dna_input)
                 if regions:
                     processed_sequence = regions[0]["sequence"]  # Use the extracted gene region
-                    boundary_output = f"Gene region extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})"
                 else:
-                    boundary_output = f"No gene regions found, using full sequence: {len(dna_input)} bp"
                     processed_sequence = dna_input
                 logging.info("Boundary model prediction completed")
             except Exception as e:
                 logging.error(f"Boundary model failed: {e}")
@@ -163,88 +165,78 @@ def run_pipeline(dna_input):
             boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
             processed_sequence = dna_input
-        # Step 2: Keras Prediction (for analysis/scoring, not sequence modification)
         keras_output = ""
         if processed_sequence and len(processed_sequence) >= 6:
-            keras_output = predict_with_keras(processed_sequence)
         else:
-            keras_output = "Skipped: sequence too short for k-mer analysis"
-        # Step 3: MAFFT and IQ-TREE (using processed_sequence, not keras output)
         aligned_file = None
         phy_file = None
-        # Use the processed_sequence (from boundary model) for alignment
-        if processed_sequence and len(processed_sequence) >= 10:
-            try:
-                # Create FASTA file with the actual DNA sequence
-                fasta_file = "input_sequence.fasta"
-                with open(fasta_file, "w") as f:
-                    f.write(">query\n" + processed_sequence + "\n")
-                logging.info(f"Created FASTA file with sequence length: {len(processed_sequence)}")
-                # Check if MAFFT is executable
-                if os.path.exists(MAFFT_PATH):
-                    # Make MAFFT executable
-                    os.chmod(MAFFT_PATH, 0o755)
-                    # Run MAFFT
-                    aligned_file = "aligned.fasta"
-                    with open(aligned_file, "w") as outfile:
-                        result = subprocess.run([MAFFT_PATH, "--auto", fasta_file],
-                                              stdout=outfile, stderr=subprocess.PIPE, check=True)
-                    logging.info("MAFFT alignment completed")
-                    # Run IQ-TREE if alignment successful
-                    if os.path.exists(aligned_file):
-                        try:
-                            subprocess.run(["iqtree2", "-s", aligned_file, "-nt", "AUTO"],
-                                         check=True, capture_output=True)
-                            phy_file = "input_sequence.phy"
-                            logging.info("IQ-TREE analysis completed")
-                        except subprocess.CalledProcessError as e:
-                            logging.error(f"IQ-TREE failed: {e}")
-                        except FileNotFoundError:
-                            logging.error("IQ-TREE not found in system PATH")
-                else:
-                    logging.error(f"MAFFT not found at {MAFFT_PATH}")
-            except subprocess.CalledProcessError as e:
-                logging.error(f"MAFFT failed with exit code {e.returncode}")
-                logging.error(f"MAFFT stderr: {e.stderr.decode() if e.stderr else 'No stderr'}")
-            except Exception as e:
-                logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
-        # Step 4: ML Simplified Tree (using processed_sequence, not keras output)
         html_file = None
         tree_html_content = "No tree generated"
         ml_output = ""
         if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
             try:
-                logging.info(f"Starting ML tree analysis with sequence length: {len(processed_sequence)}")
                 analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
                 if analyzer.load_data(csv_path):
                     logging.info("CSV data loaded successfully")
-                    # Use the processed DNA sequence (not keras prediction scores)
                     if analyzer.find_query_sequence(processed_sequence):
-                        logging.info("Query sequence found in dataset")
                         matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
-                        analyzer.create_interactive_tree(matched_ids, perc)
-                        html_filename = "phylogenetic_tree_normalized_horizontal.html"
-                        if os.path.exists(html_filename):
-                            html_file = html_filename
-                            with open(html_filename, "r") as f:
-                                tree_html_content = f.read()
-                            ml_output = f"Tree generated successfully with {len(matched_ids)} sequences (match: {perc:.1f}%)"
-                        else:
-                            ml_output = "Tree generation completed but HTML file not found"
                     else:
-                        ml_output = f"Query sequence not found in dataset (length: {len(processed_sequence)} bp)"
-                        logging.warning(f"Query sequence not found. Length: {len(processed_sequence)}")
                 else:
                     ml_output = "Failed to load CSV dataset"
                     logging.error("Failed to load CSV dataset")
@@ -254,7 +246,7 @@ def run_pipeline(dna_input):
         elif not os.path.exists(csv_path):
             ml_output = f"CSV dataset not found at {csv_path}"
         elif not processed_sequence or len(processed_sequence) < 10:
-            ml_output = f"Sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
         else:
             ml_output = "Skipped due to previous step errors"
@@ -299,11 +291,11 @@ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline") as demo:
     with gr.Row():
         with gr.Column():
-            out1 = gr.Textbox(label="Step 1: Boundary Model Output", lines=3)
-            out2 = gr.Textbox(label="Step 2: Keras Model Output", lines=3)
         with gr.Column():
             out3 = gr.Textbox(label="Dataset Used")
-            out4 = gr.Textbox(label="Step 3: ML Tree Status", lines=3)
     with gr.Row():
         html = gr.File(label="Download Tree (HTML)")

             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
+        # Step 1: Boundary Prediction - Extract F gene sequence
         processed_sequence = dna_input  # This will be the sequence used for downstream analysis
         boundary_output = ""
                 regions = boundary_model.extract_gene_regions(predictions, dna_input)
                 if regions:
                     processed_sequence = regions[0]["sequence"]  # Use the extracted gene region
+                    boundary_output = processed_sequence  # Output the actual F gene sequence
+                    logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
                 else:
+                    boundary_output = f"No F gene regions found in input sequence"
                     processed_sequence = dna_input
+                    logging.warning("No gene regions found, using full sequence")
                 logging.info("Boundary model prediction completed")
             except Exception as e:
                 logging.error(f"Boundary model failed: {e}")
             boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
             processed_sequence = dna_input
+        # Step 2: Keras Prediction (F gene validation)
         keras_output = ""
         if processed_sequence and len(processed_sequence) >= 6:
+            keras_prediction = predict_with_keras(processed_sequence)
+            # Interpret keras prediction as F gene validation
+            if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
+                # You might want to add logic here to interpret the prediction scores
+                # For now, just show the prediction
+                keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
+            else:
+                keras_output = keras_prediction
         else:
+            keras_output = "Skipped: sequence too short for F gene validation"
+        # Step 3: MAFFT and IQ-TREE (skip due to configuration issues)
         aligned_file = None
         phy_file = None
+        # Skip MAFFT due to configuration issues in the container
+        logging.info("Skipping MAFFT/IQ-TREE due to container configuration issues")
+        # Step 4: ML Simplified Tree (using the extracted F gene sequence)
         html_file = None
         tree_html_content = "No tree generated"
         ml_output = ""
         if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
             try:
+                logging.info(f"Starting ML tree analysis with F gene sequence length: {len(processed_sequence)}")
                 analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
                 if analyzer.load_data(csv_path):
                     logging.info("CSV data loaded successfully")
+                    # Use the extracted F gene sequence from boundary model
                     if analyzer.find_query_sequence(processed_sequence):
+                        logging.info("F gene sequence found in dataset")
                         matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
+                        # Try to create tree with error handling
+                        try:
+                            analyzer.create_interactive_tree(matched_ids, perc)
+                            # Check for multiple possible HTML file names
+                            possible_html_files = [
+                                "phylogenetic_tree_normalized_horizontal.html",
+                                "phylogenetic_tree.html",
+                                "tree.html"
+                            ]
+                            for html_filename in possible_html_files:
+                                if os.path.exists(html_filename):
+                                    html_file = html_filename
+                                    with open(html_filename, "r", encoding='utf-8') as f:
+                                        tree_html_content = f.read()
+                                    ml_output = f"Phylogenetic tree generated successfully with {len(matched_ids)} sequences (similarity: {perc:.1f}%)"
+                                    logging.info(f"Tree HTML file found: {html_filename}")
+                                    break
+                            if not html_file:
+                                # List all HTML files in directory for debugging
+                                html_files = [f for f in os.listdir('.') if f.endswith('.html')]
+                                ml_output = f"Tree analysis completed but HTML file not found. Available HTML files: {html_files}"
+                                logging.warning(f"HTML files in directory: {html_files}")
+                        except Exception as tree_error:
+                            ml_output = f"Tree creation failed: {str(tree_error)}"
+                            logging.error(f"Tree creation error: {tree_error}")
                     else:
+                        ml_output = f"F gene sequence not found in dataset (length: {len(processed_sequence)} bp)"
+                        logging.warning(f"F gene sequence not found. Length: {len(processed_sequence)}")
                 else:
                     ml_output = "Failed to load CSV dataset"
                     logging.error("Failed to load CSV dataset")
         elif not os.path.exists(csv_path):
             ml_output = f"CSV dataset not found at {csv_path}"
         elif not processed_sequence or len(processed_sequence) < 10:
+            ml_output = f"F gene sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
         else:
             ml_output = "Skipped due to previous step errors"
     with gr.Row():
         with gr.Column():
+            out1 = gr.Textbox(label="Step 1: Extracted F Gene Sequence", lines=8)
+            out2 = gr.Textbox(label="Step 2: F Gene Validation (Keras)", lines=3)
         with gr.Column():
             out3 = gr.Textbox(label="Dataset Used")
+            out4 = gr.Textbox(label="Step 3: Phylogenetic Tree Status", lines=3)
     with gr.Row():
         html = gr.File(label="Download Tree (HTML)")