Spaces:
No application file
No application file
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,23 +17,14 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|
| 17 |
try:
|
| 18 |
boundary_model = GenePredictor("best_boundary_aware_model.pth")
|
| 19 |
keras_model = load_model("best_model.keras")
|
| 20 |
-
logging.info("Models loaded successfully")
|
| 21 |
-
except FileNotFoundError as e:
|
| 22 |
-
logging.error(f"Model file not found: {e}")
|
| 23 |
-
raise
|
| 24 |
-
except Exception as e:
|
| 25 |
-
logging.error(f"Error loading models: {e}")
|
| 26 |
-
raise
|
| 27 |
-
|
| 28 |
-
try:
|
| 29 |
with open("kmer_to_index.pkl", "rb") as f:
|
| 30 |
kmer_to_index = pickle.load(f)
|
| 31 |
-
logging.info("kmer_to_index
|
| 32 |
-
except FileNotFoundError:
|
| 33 |
-
logging.error("
|
| 34 |
raise
|
| 35 |
except Exception as e:
|
| 36 |
-
logging.error(f"Error loading
|
| 37 |
raise
|
| 38 |
|
| 39 |
# --------- Utilities ---------
|
|
@@ -42,7 +33,7 @@ def predict_with_keras(sequence):
|
|
| 42 |
kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
|
| 43 |
indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
|
| 44 |
input_arr = torch.tensor([indices])
|
| 45 |
-
prediction = keras_model.predict(input_arr)[0]
|
| 46 |
return "".join(str(round(p, 3)) for p in prediction)
|
| 47 |
except Exception as e:
|
| 48 |
logging.error(f"Error in Keras prediction: {e}")
|
|
@@ -68,8 +59,10 @@ def save_to_csv(sequence, path):
|
|
| 68 |
logging.error(f"Error saving CSV: {e}")
|
| 69 |
return f"Error saving CSV: {e}"
|
| 70 |
|
| 71 |
-
def run_mafft_and_iqtree(
|
|
|
|
| 72 |
try:
|
|
|
|
| 73 |
subprocess.run(["mafft", "--auto", fasta_file], check=True)
|
| 74 |
subprocess.run(["iqtree", "-s", fasta_file, "-m", "GTR"], check=True)
|
| 75 |
logging.info("MAFFT and IQTree executed successfully")
|
|
@@ -83,32 +76,32 @@ def run_mafft_and_iqtree(fasta_file="f_gene_sequences_aligned.fasta"):
|
|
| 83 |
|
| 84 |
def run_full_pipeline(dna_input):
|
| 85 |
try:
|
| 86 |
-
# 1
|
| 87 |
predictions, probs, confidence = boundary_model.predict(dna_input)
|
| 88 |
gene_regions = boundary_model.extract_gene_regions(predictions, dna_input)
|
| 89 |
step1_out = gene_regions[0]["sequence"] if gene_regions else dna_input
|
| 90 |
logging.info(f"Boundary model output: {step1_out}")
|
| 91 |
|
| 92 |
-
# 2
|
| 93 |
step2_out = predict_with_keras(step1_out)
|
| 94 |
logging.info(f"Keras model output: {step2_out}")
|
| 95 |
|
| 96 |
-
#
|
| 97 |
fasta_status = save_to_fasta("Predicted_Seq", step2_out, "f_gene_sequences_aligned.fasta")
|
| 98 |
-
csv_status = save_to_csv(step2_out, "
|
| 99 |
|
| 100 |
-
#
|
| 101 |
-
mafft_status = run_mafft_and_iqtree()
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
html_file = "tree.html"
|
| 105 |
try:
|
| 106 |
-
ml_output = maximum_likelihood("
|
| 107 |
if os.path.exists(html_file):
|
| 108 |
logging.info(f"HTML tree file generated: {html_file}")
|
| 109 |
else:
|
| 110 |
logging.warning(f"HTML tree file {html_file} not found")
|
| 111 |
-
html_file = None
|
| 112 |
except Exception as e:
|
| 113 |
logging.error(f"ML Tree Error: {e}")
|
| 114 |
ml_output = f"ML Tree Error: {e}"
|
|
@@ -121,7 +114,7 @@ def run_full_pipeline(dna_input):
|
|
| 121 |
"CSV Save Status": csv_status,
|
| 122 |
"MAFFT + IQTree Status": mafft_status,
|
| 123 |
"Maximum Likelihood Tree Output": ml_output,
|
| 124 |
-
"Tree HTML File": html_file
|
| 125 |
}
|
| 126 |
except Exception as e:
|
| 127 |
logging.error(f"Pipeline failed: {e}")
|
|
@@ -135,10 +128,10 @@ def run_full_pipeline(dna_input):
|
|
| 135 |
"Tree HTML File": None
|
| 136 |
}
|
| 137 |
|
| 138 |
-
# --------- Gradio Interface ---------
|
| 139 |
with gr.Blocks() as gr_interface:
|
| 140 |
gr.Markdown("# Sequential Phylogenetic Inference Pipeline")
|
| 141 |
-
gr.Markdown("This pipeline runs sequentially: Boundary-Aware Model → Keras Model →
|
| 142 |
|
| 143 |
dna_input = gr.Textbox(label="Input DNA Sequence")
|
| 144 |
submit_button = gr.Button("Run Pipeline")
|
|
|
|
| 17 |
try:
|
| 18 |
boundary_model = GenePredictor("best_boundary_aware_model.pth")
|
| 19 |
keras_model = load_model("best_model.keras")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
with open("kmer_to_index.pkl", "rb") as f:
|
| 21 |
kmer_to_index = pickle.load(f)
|
| 22 |
+
logging.info("Models and kmer_to_index loaded successfully")
|
| 23 |
+
except FileNotFoundError as e:
|
| 24 |
+
logging.error(f"Model or file not found: {e}")
|
| 25 |
raise
|
| 26 |
except Exception as e:
|
| 27 |
+
logging.error(f"Error loading models or files: {e}")
|
| 28 |
raise
|
| 29 |
|
| 30 |
# --------- Utilities ---------
|
|
|
|
| 33 |
kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
|
| 34 |
indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
|
| 35 |
input_arr = torch.tensor([indices])
|
| 36 |
+
prediction = keras_model.predict(input_arr, verbose=0)[0]
|
| 37 |
return "".join(str(round(p, 3)) for p in prediction)
|
| 38 |
except Exception as e:
|
| 39 |
logging.error(f"Error in Keras prediction: {e}")
|
|
|
|
| 59 |
logging.error(f"Error saving CSV: {e}")
|
| 60 |
return f"Error saving CSV: {e}"
|
| 61 |
|
| 62 |
+
def run_mafft_and_iqtree(sequence):
|
| 63 |
+
fasta_file = "f_gene_sequences_aligned.fasta"
|
| 64 |
try:
|
| 65 |
+
save_to_fasta("Predicted_Seq", sequence, fasta_file)
|
| 66 |
subprocess.run(["mafft", "--auto", fasta_file], check=True)
|
| 67 |
subprocess.run(["iqtree", "-s", fasta_file, "-m", "GTR"], check=True)
|
| 68 |
logging.info("MAFFT and IQTree executed successfully")
|
|
|
|
| 76 |
|
| 77 |
def run_full_pipeline(dna_input):
|
| 78 |
try:
|
| 79 |
+
# Step 1: Boundary-Aware Prediction
|
| 80 |
predictions, probs, confidence = boundary_model.predict(dna_input)
|
| 81 |
gene_regions = boundary_model.extract_gene_regions(predictions, dna_input)
|
| 82 |
step1_out = gene_regions[0]["sequence"] if gene_regions else dna_input
|
| 83 |
logging.info(f"Boundary model output: {step1_out}")
|
| 84 |
|
| 85 |
+
# Step 2: Keras Prediction
|
| 86 |
step2_out = predict_with_keras(step1_out)
|
| 87 |
logging.info(f"Keras model output: {step2_out}")
|
| 88 |
|
| 89 |
+
# Step 3a: Save for MAFFT/IQTree and ML Simplified Tree
|
| 90 |
fasta_status = save_to_fasta("Predicted_Seq", step2_out, "f_gene_sequences_aligned.fasta")
|
| 91 |
+
csv_status = save_to_csv(step2_out, "f_gene_clean_dataset.csv")
|
| 92 |
|
| 93 |
+
# Step 3b: Run MAFFT and IQTree
|
| 94 |
+
mafft_status = run_mafft_and_iqtree(step2_out)
|
| 95 |
|
| 96 |
+
# Step 3c: Run ML Simplified Tree
|
| 97 |
+
html_file = "tree.html"
|
| 98 |
try:
|
| 99 |
+
ml_output = maximum_likelihood("f_gene_clean_dataset.csv")
|
| 100 |
if os.path.exists(html_file):
|
| 101 |
logging.info(f"HTML tree file generated: {html_file}")
|
| 102 |
else:
|
| 103 |
logging.warning(f"HTML tree file {html_file} not found")
|
| 104 |
+
html_file = None
|
| 105 |
except Exception as e:
|
| 106 |
logging.error(f"ML Tree Error: {e}")
|
| 107 |
ml_output = f"ML Tree Error: {e}"
|
|
|
|
| 114 |
"CSV Save Status": csv_status,
|
| 115 |
"MAFFT + IQTree Status": mafft_status,
|
| 116 |
"Maximum Likelihood Tree Output": ml_output,
|
| 117 |
+
"Tree HTML File": html_file
|
| 118 |
}
|
| 119 |
except Exception as e:
|
| 120 |
logging.error(f"Pipeline failed: {e}")
|
|
|
|
| 128 |
"Tree HTML File": None
|
| 129 |
}
|
| 130 |
|
| 131 |
+
# --------- Gradio Interface and API ---------
|
| 132 |
with gr.Blocks() as gr_interface:
|
| 133 |
gr.Markdown("# Sequential Phylogenetic Inference Pipeline")
|
| 134 |
+
gr.Markdown("This pipeline runs sequentially: Boundary-Aware Model → Keras Model → MAFFT/IQTree & ML Tree")
|
| 135 |
|
| 136 |
dna_input = gr.Textbox(label="Input DNA Sequence")
|
| 137 |
submit_button = gr.Button("Run Pipeline")
|