Spaces:
No application file
No application file
Update app.py
Browse files
app.py
CHANGED
|
@@ -24,7 +24,10 @@ import tempfile
|
|
| 24 |
|
| 25 |
# Model repository and file paths
|
| 26 |
model_repo = "GGproject10/best_boundary_aware_model"
|
| 27 |
-
csv_path = "f
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# --- Load Models ---
|
| 30 |
boundary_model = None
|
|
@@ -33,7 +36,11 @@ kmer_to_index = None
|
|
| 33 |
|
| 34 |
# Try to load boundary model from Hugging Face Hub
|
| 35 |
try:
|
| 36 |
-
boundary_path = hf_hub_download(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
if os.path.exists(boundary_path):
|
| 38 |
boundary_model = GenePredictor(boundary_path)
|
| 39 |
logging.info("Boundary model loaded successfully from Hugging Face Hub.")
|
|
@@ -44,8 +51,16 @@ except Exception as e:
|
|
| 44 |
|
| 45 |
# Try to load Keras model from Hugging Face Hub
|
| 46 |
try:
|
| 47 |
-
keras_path = hf_hub_download(
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
if os.path.exists(keras_path) and os.path.exists(kmer_path):
|
| 51 |
keras_model = load_model(keras_path)
|
|
@@ -74,7 +89,7 @@ def predict_with_keras(sequence):
|
|
| 74 |
input_arr = np.array([indices])
|
| 75 |
prediction = keras_model.predict(input_arr, verbose=0)[0]
|
| 76 |
|
| 77 |
-
# Format prediction
|
| 78 |
result = ''.join([str(round(p, 3)) for p in prediction])
|
| 79 |
return result
|
| 80 |
except Exception as e:
|
|
@@ -126,37 +141,48 @@ def run_pipeline(dna_input):
|
|
| 126 |
logging.info("DNA sequence sanitized")
|
| 127 |
|
| 128 |
# Step 1: Boundary Prediction
|
| 129 |
-
|
|
|
|
|
|
|
| 130 |
if boundary_model:
|
| 131 |
try:
|
| 132 |
predictions, probs, confidence = boundary_model.predict(dna_input)
|
| 133 |
regions = boundary_model.extract_gene_regions(predictions, dna_input)
|
| 134 |
if regions:
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
logging.info("Boundary model prediction completed")
|
| 137 |
except Exception as e:
|
| 138 |
logging.error(f"Boundary model failed: {e}")
|
| 139 |
-
|
|
|
|
| 140 |
else:
|
| 141 |
-
|
|
|
|
| 142 |
|
| 143 |
-
# Step 2: Keras Prediction
|
| 144 |
-
|
| 145 |
-
|
|
|
|
| 146 |
else:
|
| 147 |
-
|
| 148 |
|
| 149 |
-
# Step 3: MAFFT and IQ-TREE
|
| 150 |
aligned_file = None
|
| 151 |
phy_file = None
|
| 152 |
|
| 153 |
-
#
|
| 154 |
-
if
|
| 155 |
try:
|
| 156 |
-
# Create FASTA file
|
| 157 |
fasta_file = "input_sequence.fasta"
|
| 158 |
with open(fasta_file, "w") as f:
|
| 159 |
-
f.write(">query\n" +
|
|
|
|
|
|
|
| 160 |
|
| 161 |
# Check if MAFFT is executable
|
| 162 |
if os.path.exists(MAFFT_PATH):
|
|
@@ -183,19 +209,28 @@ def run_pipeline(dna_input):
|
|
| 183 |
logging.error("IQ-TREE not found in system PATH")
|
| 184 |
else:
|
| 185 |
logging.error(f"MAFFT not found at {MAFFT_PATH}")
|
|
|
|
|
|
|
|
|
|
| 186 |
except Exception as e:
|
| 187 |
logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
|
| 188 |
|
| 189 |
-
# Step 4: ML Simplified Tree
|
| 190 |
html_file = None
|
| 191 |
tree_html_content = "No tree generated"
|
| 192 |
ml_output = ""
|
| 193 |
|
| 194 |
-
if os.path.exists(csv_path) and
|
| 195 |
try:
|
|
|
|
| 196 |
analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
|
|
|
|
| 197 |
if analyzer.load_data(csv_path):
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
|
| 200 |
analyzer.create_interactive_tree(matched_ids, perc)
|
| 201 |
|
|
@@ -204,24 +239,28 @@ def run_pipeline(dna_input):
|
|
| 204 |
html_file = html_filename
|
| 205 |
with open(html_filename, "r") as f:
|
| 206 |
tree_html_content = f.read()
|
| 207 |
-
ml_output = f"Tree generated successfully with {len(matched_ids)} sequences"
|
| 208 |
else:
|
| 209 |
ml_output = "Tree generation completed but HTML file not found"
|
| 210 |
else:
|
| 211 |
-
ml_output = "Query sequence not found in dataset"
|
|
|
|
| 212 |
else:
|
| 213 |
ml_output = "Failed to load CSV dataset"
|
|
|
|
| 214 |
except Exception as e:
|
| 215 |
ml_output = f"ML Tree analysis failed: {str(e)}"
|
| 216 |
logging.error(f"ML Tree failed: {e}")
|
| 217 |
elif not os.path.exists(csv_path):
|
| 218 |
ml_output = f"CSV dataset not found at {csv_path}"
|
|
|
|
|
|
|
| 219 |
else:
|
| 220 |
ml_output = "Skipped due to previous step errors"
|
| 221 |
|
| 222 |
return (
|
| 223 |
-
|
| 224 |
-
|
| 225 |
csv_path if os.path.exists(csv_path) else "CSV file not found",
|
| 226 |
ml_output,
|
| 227 |
html_file,
|
|
|
|
| 24 |
|
| 25 |
# Model repository and file paths
|
| 26 |
model_repo = "GGproject10/best_boundary_aware_model"
|
| 27 |
+
csv_path = "f gene clean dataset.csv"
|
| 28 |
+
|
| 29 |
+
# Get HF token from environment (if available)
|
| 30 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 31 |
|
| 32 |
# --- Load Models ---
|
| 33 |
boundary_model = None
|
|
|
|
| 36 |
|
| 37 |
# Try to load boundary model from Hugging Face Hub
|
| 38 |
try:
|
| 39 |
+
boundary_path = hf_hub_download(
|
| 40 |
+
repo_id=model_repo,
|
| 41 |
+
filename="best_boundary_aware_model.pth",
|
| 42 |
+
token=hf_token
|
| 43 |
+
)
|
| 44 |
if os.path.exists(boundary_path):
|
| 45 |
boundary_model = GenePredictor(boundary_path)
|
| 46 |
logging.info("Boundary model loaded successfully from Hugging Face Hub.")
|
|
|
|
| 51 |
|
| 52 |
# Try to load Keras model from Hugging Face Hub
|
| 53 |
try:
|
| 54 |
+
keras_path = hf_hub_download(
|
| 55 |
+
repo_id=model_repo,
|
| 56 |
+
filename="best_model.keras",
|
| 57 |
+
token=hf_token
|
| 58 |
+
)
|
| 59 |
+
kmer_path = hf_hub_download(
|
| 60 |
+
repo_id=model_repo,
|
| 61 |
+
filename="kmer_to_index.pkl",
|
| 62 |
+
token=hf_token
|
| 63 |
+
)
|
| 64 |
|
| 65 |
if os.path.exists(keras_path) and os.path.exists(kmer_path):
|
| 66 |
keras_model = load_model(keras_path)
|
|
|
|
| 89 |
input_arr = np.array([indices])
|
| 90 |
prediction = keras_model.predict(input_arr, verbose=0)[0]
|
| 91 |
|
| 92 |
+
# Format prediction as probabilities/scores (not a sequence)
|
| 93 |
result = ''.join([str(round(p, 3)) for p in prediction])
|
| 94 |
return result
|
| 95 |
except Exception as e:
|
|
|
|
| 141 |
logging.info("DNA sequence sanitized")
|
| 142 |
|
| 143 |
# Step 1: Boundary Prediction
|
| 144 |
+
processed_sequence = dna_input # This will be the sequence used for downstream analysis
|
| 145 |
+
boundary_output = ""
|
| 146 |
+
|
| 147 |
if boundary_model:
|
| 148 |
try:
|
| 149 |
predictions, probs, confidence = boundary_model.predict(dna_input)
|
| 150 |
regions = boundary_model.extract_gene_regions(predictions, dna_input)
|
| 151 |
if regions:
|
| 152 |
+
processed_sequence = regions[0]["sequence"] # Use the extracted gene region
|
| 153 |
+
boundary_output = f"Gene region extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})"
|
| 154 |
+
else:
|
| 155 |
+
boundary_output = f"No gene regions found, using full sequence: {len(dna_input)} bp"
|
| 156 |
+
processed_sequence = dna_input
|
| 157 |
logging.info("Boundary model prediction completed")
|
| 158 |
except Exception as e:
|
| 159 |
logging.error(f"Boundary model failed: {e}")
|
| 160 |
+
boundary_output = f"Boundary model error: {str(e)}"
|
| 161 |
+
processed_sequence = dna_input # Fall back to original sequence
|
| 162 |
else:
|
| 163 |
+
boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
|
| 164 |
+
processed_sequence = dna_input
|
| 165 |
|
| 166 |
+
# Step 2: Keras Prediction (for analysis/scoring, not sequence modification)
|
| 167 |
+
keras_output = ""
|
| 168 |
+
if processed_sequence and len(processed_sequence) >= 6:
|
| 169 |
+
keras_output = predict_with_keras(processed_sequence)
|
| 170 |
else:
|
| 171 |
+
keras_output = "Skipped: sequence too short for k-mer analysis"
|
| 172 |
|
| 173 |
+
# Step 3: MAFFT and IQ-TREE (using processed_sequence, not keras output)
|
| 174 |
aligned_file = None
|
| 175 |
phy_file = None
|
| 176 |
|
| 177 |
+
# Use the processed_sequence (from boundary model) for alignment
|
| 178 |
+
if processed_sequence and len(processed_sequence) >= 10:
|
| 179 |
try:
|
| 180 |
+
# Create FASTA file with the actual DNA sequence
|
| 181 |
fasta_file = "input_sequence.fasta"
|
| 182 |
with open(fasta_file, "w") as f:
|
| 183 |
+
f.write(">query\n" + processed_sequence + "\n")
|
| 184 |
+
|
| 185 |
+
logging.info(f"Created FASTA file with sequence length: {len(processed_sequence)}")
|
| 186 |
|
| 187 |
# Check if MAFFT is executable
|
| 188 |
if os.path.exists(MAFFT_PATH):
|
|
|
|
| 209 |
logging.error("IQ-TREE not found in system PATH")
|
| 210 |
else:
|
| 211 |
logging.error(f"MAFFT not found at {MAFFT_PATH}")
|
| 212 |
+
except subprocess.CalledProcessError as e:
|
| 213 |
+
logging.error(f"MAFFT failed with exit code {e.returncode}")
|
| 214 |
+
logging.error(f"MAFFT stderr: {e.stderr.decode() if e.stderr else 'No stderr'}")
|
| 215 |
except Exception as e:
|
| 216 |
logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
|
| 217 |
|
| 218 |
+
# Step 4: ML Simplified Tree (using processed_sequence, not keras output)
|
| 219 |
html_file = None
|
| 220 |
tree_html_content = "No tree generated"
|
| 221 |
ml_output = ""
|
| 222 |
|
| 223 |
+
if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
|
| 224 |
try:
|
| 225 |
+
logging.info(f"Starting ML tree analysis with sequence length: {len(processed_sequence)}")
|
| 226 |
analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
|
| 227 |
+
|
| 228 |
if analyzer.load_data(csv_path):
|
| 229 |
+
logging.info("CSV data loaded successfully")
|
| 230 |
+
|
| 231 |
+
# Use the processed DNA sequence (not keras prediction scores)
|
| 232 |
+
if analyzer.find_query_sequence(processed_sequence):
|
| 233 |
+
logging.info("Query sequence found in dataset")
|
| 234 |
matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
|
| 235 |
analyzer.create_interactive_tree(matched_ids, perc)
|
| 236 |
|
|
|
|
| 239 |
html_file = html_filename
|
| 240 |
with open(html_filename, "r") as f:
|
| 241 |
tree_html_content = f.read()
|
| 242 |
+
ml_output = f"Tree generated successfully with {len(matched_ids)} sequences (match: {perc:.1f}%)"
|
| 243 |
else:
|
| 244 |
ml_output = "Tree generation completed but HTML file not found"
|
| 245 |
else:
|
| 246 |
+
ml_output = f"Query sequence not found in dataset (length: {len(processed_sequence)} bp)"
|
| 247 |
+
logging.warning(f"Query sequence not found. Length: {len(processed_sequence)}")
|
| 248 |
else:
|
| 249 |
ml_output = "Failed to load CSV dataset"
|
| 250 |
+
logging.error("Failed to load CSV dataset")
|
| 251 |
except Exception as e:
|
| 252 |
ml_output = f"ML Tree analysis failed: {str(e)}"
|
| 253 |
logging.error(f"ML Tree failed: {e}")
|
| 254 |
elif not os.path.exists(csv_path):
|
| 255 |
ml_output = f"CSV dataset not found at {csv_path}"
|
| 256 |
+
elif not processed_sequence or len(processed_sequence) < 10:
|
| 257 |
+
ml_output = f"Sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
|
| 258 |
else:
|
| 259 |
ml_output = "Skipped due to previous step errors"
|
| 260 |
|
| 261 |
return (
|
| 262 |
+
boundary_output,
|
| 263 |
+
keras_output[:500] + "..." if len(keras_output) > 500 else keras_output,
|
| 264 |
csv_path if os.path.exists(csv_path) else "CSV file not found",
|
| 265 |
ml_output,
|
| 266 |
html_file,
|