re-type committed on
Commit
0a0ab75
·
verified ·
1 Parent(s): 79cb8b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -25
app.py CHANGED
@@ -24,7 +24,10 @@ import tempfile
24
 
25
  # Model repository and file paths
26
  model_repo = "GGproject10/best_boundary_aware_model"
27
- csv_path = "f cleaned.csv"
 
 
 
28
 
29
  # --- Load Models ---
30
  boundary_model = None
@@ -33,7 +36,11 @@ kmer_to_index = None
33
 
34
  # Try to load boundary model from Hugging Face Hub
35
  try:
36
- boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth")
 
 
 
 
37
  if os.path.exists(boundary_path):
38
  boundary_model = GenePredictor(boundary_path)
39
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
@@ -44,8 +51,16 @@ except Exception as e:
44
 
45
  # Try to load Keras model from Hugging Face Hub
46
  try:
47
- keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras")
48
- kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl")
 
 
 
 
 
 
 
 
49
 
50
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
51
  keras_model = load_model(keras_path)
@@ -74,7 +89,7 @@ def predict_with_keras(sequence):
74
  input_arr = np.array([indices])
75
  prediction = keras_model.predict(input_arr, verbose=0)[0]
76
 
77
- # Format prediction
78
  result = ''.join([str(round(p, 3)) for p in prediction])
79
  return result
80
  except Exception as e:
@@ -126,37 +141,48 @@ def run_pipeline(dna_input):
126
  logging.info("DNA sequence sanitized")
127
 
128
  # Step 1: Boundary Prediction
129
- step1_out = dna_input # Default
 
 
130
  if boundary_model:
131
  try:
132
  predictions, probs, confidence = boundary_model.predict(dna_input)
133
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
134
  if regions:
135
- step1_out = regions[0]["sequence"]
 
 
 
 
136
  logging.info("Boundary model prediction completed")
137
  except Exception as e:
138
  logging.error(f"Boundary model failed: {e}")
139
- step1_out = f"Boundary model error: {str(e)}"
 
140
  else:
141
- step1_out = f"Boundary model not available. Using original input: {dna_input[:100]}..."
 
142
 
143
- # Step 2: Keras Prediction
144
- if isinstance(step1_out, str) and not step1_out.startswith("Boundary model error"):
145
- step2_out = predict_with_keras(step1_out)
 
146
  else:
147
- step2_out = "Skipped due to boundary model error"
148
 
149
- # Step 3: MAFFT and IQ-TREE
150
  aligned_file = None
151
  phy_file = None
152
 
153
- # Only proceed if we have valid sequence data
154
- if step2_out and not step2_out.startswith(("Keras", "Skipped")):
155
  try:
156
- # Create FASTA file
157
  fasta_file = "input_sequence.fasta"
158
  with open(fasta_file, "w") as f:
159
- f.write(">query\n" + step2_out + "\n")
 
 
160
 
161
  # Check if MAFFT is executable
162
  if os.path.exists(MAFFT_PATH):
@@ -183,19 +209,28 @@ def run_pipeline(dna_input):
183
  logging.error("IQ-TREE not found in system PATH")
184
  else:
185
  logging.error(f"MAFFT not found at {MAFFT_PATH}")
 
 
 
186
  except Exception as e:
187
  logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
188
 
189
- # Step 4: ML Simplified Tree
190
  html_file = None
191
  tree_html_content = "No tree generated"
192
  ml_output = ""
193
 
194
- if os.path.exists(csv_path) and step2_out and not step2_out.startswith(("Keras", "Skipped")):
195
  try:
 
196
  analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
 
197
  if analyzer.load_data(csv_path):
198
- if analyzer.find_query_sequence(step2_out):
 
 
 
 
199
  matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
200
  analyzer.create_interactive_tree(matched_ids, perc)
201
 
@@ -204,24 +239,28 @@ def run_pipeline(dna_input):
204
  html_file = html_filename
205
  with open(html_filename, "r") as f:
206
  tree_html_content = f.read()
207
- ml_output = f"Tree generated successfully with {len(matched_ids)} sequences"
208
  else:
209
  ml_output = "Tree generation completed but HTML file not found"
210
  else:
211
- ml_output = "Query sequence not found in dataset"
 
212
  else:
213
  ml_output = "Failed to load CSV dataset"
 
214
  except Exception as e:
215
  ml_output = f"ML Tree analysis failed: {str(e)}"
216
  logging.error(f"ML Tree failed: {e}")
217
  elif not os.path.exists(csv_path):
218
  ml_output = f"CSV dataset not found at {csv_path}"
 
 
219
  else:
220
  ml_output = "Skipped due to previous step errors"
221
 
222
  return (
223
- step1_out[:500] + "..." if len(step1_out) > 500 else step1_out, # Truncate long outputs
224
- step2_out[:500] + "..." if len(step2_out) > 500 else step2_out,
225
  csv_path if os.path.exists(csv_path) else "CSV file not found",
226
  ml_output,
227
  html_file,
 
24
 
25
  # Model repository and file paths
26
  model_repo = "GGproject10/best_boundary_aware_model"
27
+ csv_path = "f gene clean dataset.csv"
28
+
29
+ # Get HF token from environment (if available)
30
+ hf_token = os.getenv("HF_TOKEN")
31
 
32
  # --- Load Models ---
33
  boundary_model = None
 
36
 
37
  # Try to load boundary model from Hugging Face Hub
38
  try:
39
+ boundary_path = hf_hub_download(
40
+ repo_id=model_repo,
41
+ filename="best_boundary_aware_model.pth",
42
+ token=hf_token
43
+ )
44
  if os.path.exists(boundary_path):
45
  boundary_model = GenePredictor(boundary_path)
46
  logging.info("Boundary model loaded successfully from Hugging Face Hub.")
 
51
 
52
  # Try to load Keras model from Hugging Face Hub
53
  try:
54
+ keras_path = hf_hub_download(
55
+ repo_id=model_repo,
56
+ filename="best_model.keras",
57
+ token=hf_token
58
+ )
59
+ kmer_path = hf_hub_download(
60
+ repo_id=model_repo,
61
+ filename="kmer_to_index.pkl",
62
+ token=hf_token
63
+ )
64
 
65
  if os.path.exists(keras_path) and os.path.exists(kmer_path):
66
  keras_model = load_model(keras_path)
 
89
  input_arr = np.array([indices])
90
  prediction = keras_model.predict(input_arr, verbose=0)[0]
91
 
92
+ # Format prediction as probabilities/scores (not a sequence)
93
  result = ''.join([str(round(p, 3)) for p in prediction])
94
  return result
95
  except Exception as e:
 
141
  logging.info("DNA sequence sanitized")
142
 
143
  # Step 1: Boundary Prediction
144
+ processed_sequence = dna_input # This will be the sequence used for downstream analysis
145
+ boundary_output = ""
146
+
147
  if boundary_model:
148
  try:
149
  predictions, probs, confidence = boundary_model.predict(dna_input)
150
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
151
  if regions:
152
+ processed_sequence = regions[0]["sequence"] # Use the extracted gene region
153
+ boundary_output = f"Gene region extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})"
154
+ else:
155
+ boundary_output = f"No gene regions found, using full sequence: {len(dna_input)} bp"
156
+ processed_sequence = dna_input
157
  logging.info("Boundary model prediction completed")
158
  except Exception as e:
159
  logging.error(f"Boundary model failed: {e}")
160
+ boundary_output = f"Boundary model error: {str(e)}"
161
+ processed_sequence = dna_input # Fall back to original sequence
162
  else:
163
+ boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
164
+ processed_sequence = dna_input
165
 
166
+ # Step 2: Keras Prediction (for analysis/scoring, not sequence modification)
167
+ keras_output = ""
168
+ if processed_sequence and len(processed_sequence) >= 6:
169
+ keras_output = predict_with_keras(processed_sequence)
170
  else:
171
+ keras_output = "Skipped: sequence too short for k-mer analysis"
172
 
173
+ # Step 3: MAFFT and IQ-TREE (using processed_sequence, not keras output)
174
  aligned_file = None
175
  phy_file = None
176
 
177
+ # Use the processed_sequence (from boundary model) for alignment
178
+ if processed_sequence and len(processed_sequence) >= 10:
179
  try:
180
+ # Create FASTA file with the actual DNA sequence
181
  fasta_file = "input_sequence.fasta"
182
  with open(fasta_file, "w") as f:
183
+ f.write(">query\n" + processed_sequence + "\n")
184
+
185
+ logging.info(f"Created FASTA file with sequence length: {len(processed_sequence)}")
186
 
187
  # Check if MAFFT is executable
188
  if os.path.exists(MAFFT_PATH):
 
209
  logging.error("IQ-TREE not found in system PATH")
210
  else:
211
  logging.error(f"MAFFT not found at {MAFFT_PATH}")
212
+ except subprocess.CalledProcessError as e:
213
+ logging.error(f"MAFFT failed with exit code {e.returncode}")
214
+ logging.error(f"MAFFT stderr: {e.stderr.decode() if e.stderr else 'No stderr'}")
215
  except Exception as e:
216
  logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
217
 
218
+ # Step 4: ML Simplified Tree (using processed_sequence, not keras output)
219
  html_file = None
220
  tree_html_content = "No tree generated"
221
  ml_output = ""
222
 
223
+ if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
224
  try:
225
+ logging.info(f"Starting ML tree analysis with sequence length: {len(processed_sequence)}")
226
  analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
227
+
228
  if analyzer.load_data(csv_path):
229
+ logging.info("CSV data loaded successfully")
230
+
231
+ # Use the processed DNA sequence (not keras prediction scores)
232
+ if analyzer.find_query_sequence(processed_sequence):
233
+ logging.info("Query sequence found in dataset")
234
  matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
235
  analyzer.create_interactive_tree(matched_ids, perc)
236
 
 
239
  html_file = html_filename
240
  with open(html_filename, "r") as f:
241
  tree_html_content = f.read()
242
+ ml_output = f"Tree generated successfully with {len(matched_ids)} sequences (match: {perc:.1f}%)"
243
  else:
244
  ml_output = "Tree generation completed but HTML file not found"
245
  else:
246
+ ml_output = f"Query sequence not found in dataset (length: {len(processed_sequence)} bp)"
247
+ logging.warning(f"Query sequence not found. Length: {len(processed_sequence)}")
248
  else:
249
  ml_output = "Failed to load CSV dataset"
250
+ logging.error("Failed to load CSV dataset")
251
  except Exception as e:
252
  ml_output = f"ML Tree analysis failed: {str(e)}"
253
  logging.error(f"ML Tree failed: {e}")
254
  elif not os.path.exists(csv_path):
255
  ml_output = f"CSV dataset not found at {csv_path}"
256
+ elif not processed_sequence or len(processed_sequence) < 10:
257
+ ml_output = f"Sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
258
  else:
259
  ml_output = "Skipped due to previous step errors"
260
 
261
  return (
262
+ boundary_output,
263
+ keras_output[:500] + "..." if len(keras_output) > 500 else keras_output,
264
  csv_path if os.path.exists(csv_path) else "CSV file not found",
265
  ml_output,
266
  html_file,