re-type committed on
Commit
365584f
·
verified ·
1 Parent(s): 0a0ab75

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -66
app.py CHANGED
@@ -140,7 +140,7 @@ def run_pipeline(dna_input):
140
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
141
  logging.info("DNA sequence sanitized")
142
 
143
- # Step 1: Boundary Prediction
144
  processed_sequence = dna_input # This will be the sequence used for downstream analysis
145
  boundary_output = ""
146
 
@@ -150,10 +150,12 @@ def run_pipeline(dna_input):
150
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
151
  if regions:
152
  processed_sequence = regions[0]["sequence"] # Use the extracted gene region
153
- boundary_output = f"Gene region extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})"
 
154
  else:
155
- boundary_output = f"No gene regions found, using full sequence: {len(dna_input)} bp"
156
  processed_sequence = dna_input
 
157
  logging.info("Boundary model prediction completed")
158
  except Exception as e:
159
  logging.error(f"Boundary model failed: {e}")
@@ -163,88 +165,78 @@ def run_pipeline(dna_input):
163
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
164
  processed_sequence = dna_input
165
 
166
- # Step 2: Keras Prediction (for analysis/scoring, not sequence modification)
167
  keras_output = ""
168
  if processed_sequence and len(processed_sequence) >= 6:
169
- keras_output = predict_with_keras(processed_sequence)
 
 
 
 
 
 
 
170
  else:
171
- keras_output = "Skipped: sequence too short for k-mer analysis"
172
 
173
- # Step 3: MAFFT and IQ-TREE (using processed_sequence, not keras output)
174
  aligned_file = None
175
  phy_file = None
176
 
177
- # Use the processed_sequence (from boundary model) for alignment
178
- if processed_sequence and len(processed_sequence) >= 10:
179
- try:
180
- # Create FASTA file with the actual DNA sequence
181
- fasta_file = "input_sequence.fasta"
182
- with open(fasta_file, "w") as f:
183
- f.write(">query\n" + processed_sequence + "\n")
184
-
185
- logging.info(f"Created FASTA file with sequence length: {len(processed_sequence)}")
186
-
187
- # Check if MAFFT is executable
188
- if os.path.exists(MAFFT_PATH):
189
- # Make MAFFT executable
190
- os.chmod(MAFFT_PATH, 0o755)
191
-
192
- # Run MAFFT
193
- aligned_file = "aligned.fasta"
194
- with open(aligned_file, "w") as outfile:
195
- result = subprocess.run([MAFFT_PATH, "--auto", fasta_file],
196
- stdout=outfile, stderr=subprocess.PIPE, check=True)
197
- logging.info("MAFFT alignment completed")
198
-
199
- # Run IQ-TREE if alignment successful
200
- if os.path.exists(aligned_file):
201
- try:
202
- subprocess.run(["iqtree2", "-s", aligned_file, "-nt", "AUTO"],
203
- check=True, capture_output=True)
204
- phy_file = "input_sequence.phy"
205
- logging.info("IQ-TREE analysis completed")
206
- except subprocess.CalledProcessError as e:
207
- logging.error(f"IQ-TREE failed: {e}")
208
- except FileNotFoundError:
209
- logging.error("IQ-TREE not found in system PATH")
210
- else:
211
- logging.error(f"MAFFT not found at {MAFFT_PATH}")
212
- except subprocess.CalledProcessError as e:
213
- logging.error(f"MAFFT failed with exit code {e.returncode}")
214
- logging.error(f"MAFFT stderr: {e.stderr.decode() if e.stderr else 'No stderr'}")
215
- except Exception as e:
216
- logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
217
 
218
- # Step 4: ML Simplified Tree (using processed_sequence, not keras output)
219
  html_file = None
220
  tree_html_content = "No tree generated"
221
  ml_output = ""
222
 
223
  if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
224
  try:
225
- logging.info(f"Starting ML tree analysis with sequence length: {len(processed_sequence)}")
226
  analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
227
 
228
  if analyzer.load_data(csv_path):
229
  logging.info("CSV data loaded successfully")
230
 
231
- # Use the processed DNA sequence (not keras prediction scores)
232
  if analyzer.find_query_sequence(processed_sequence):
233
- logging.info("Query sequence found in dataset")
234
  matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
235
- analyzer.create_interactive_tree(matched_ids, perc)
236
 
237
- html_filename = "phylogenetic_tree_normalized_horizontal.html"
238
- if os.path.exists(html_filename):
239
- html_file = html_filename
240
- with open(html_filename, "r") as f:
241
- tree_html_content = f.read()
242
- ml_output = f"Tree generated successfully with {len(matched_ids)} sequences (match: {perc:.1f}%)"
243
- else:
244
- ml_output = "Tree generation completed but HTML file not found"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  else:
246
- ml_output = f"Query sequence not found in dataset (length: {len(processed_sequence)} bp)"
247
- logging.warning(f"Query sequence not found. Length: {len(processed_sequence)}")
248
  else:
249
  ml_output = "Failed to load CSV dataset"
250
  logging.error("Failed to load CSV dataset")
@@ -254,7 +246,7 @@ def run_pipeline(dna_input):
254
  elif not os.path.exists(csv_path):
255
  ml_output = f"CSV dataset not found at {csv_path}"
256
  elif not processed_sequence or len(processed_sequence) < 10:
257
- ml_output = f"Sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
258
  else:
259
  ml_output = "Skipped due to previous step errors"
260
 
@@ -299,11 +291,11 @@ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline") as demo:
299
 
300
  with gr.Row():
301
  with gr.Column():
302
- out1 = gr.Textbox(label="Step 1: Boundary Model Output", lines=3)
303
- out2 = gr.Textbox(label="Step 2: Keras Model Output", lines=3)
304
  with gr.Column():
305
  out3 = gr.Textbox(label="Dataset Used")
306
- out4 = gr.Textbox(label="Step 3: ML Tree Status", lines=3)
307
 
308
  with gr.Row():
309
  html = gr.File(label="Download Tree (HTML)")
 
140
  dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
141
  logging.info("DNA sequence sanitized")
142
 
143
+ # Step 1: Boundary Prediction - Extract F gene sequence
144
  processed_sequence = dna_input # This will be the sequence used for downstream analysis
145
  boundary_output = ""
146
 
 
150
  regions = boundary_model.extract_gene_regions(predictions, dna_input)
151
  if regions:
152
  processed_sequence = regions[0]["sequence"] # Use the extracted gene region
153
+ boundary_output = processed_sequence # Output the actual F gene sequence
154
+ logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
155
  else:
156
+ boundary_output = f"No F gene regions found in input sequence"
157
  processed_sequence = dna_input
158
+ logging.warning("No gene regions found, using full sequence")
159
  logging.info("Boundary model prediction completed")
160
  except Exception as e:
161
  logging.error(f"Boundary model failed: {e}")
 
165
  boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
166
  processed_sequence = dna_input
167
 
168
+ # Step 2: Keras Prediction (F gene validation)
169
  keras_output = ""
170
  if processed_sequence and len(processed_sequence) >= 6:
171
+ keras_prediction = predict_with_keras(processed_sequence)
172
+ # Interpret keras prediction as F gene validation
173
+ if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
174
+ # You might want to add logic here to interpret the prediction scores
175
+ # For now, just show the prediction
176
+ keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
177
+ else:
178
+ keras_output = keras_prediction
179
  else:
180
+ keras_output = "Skipped: sequence too short for F gene validation"
181
 
182
+ # Step 3: MAFFT and IQ-TREE (skip due to configuration issues)
183
  aligned_file = None
184
  phy_file = None
185
 
186
+ # Skip MAFFT due to configuration issues in the container
187
+ logging.info("Skipping MAFFT/IQ-TREE due to container configuration issues")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
+ # Step 4: ML Simplified Tree (using the extracted F gene sequence)
190
  html_file = None
191
  tree_html_content = "No tree generated"
192
  ml_output = ""
193
 
194
  if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
195
  try:
196
+ logging.info(f"Starting ML tree analysis with F gene sequence length: {len(processed_sequence)}")
197
  analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
198
 
199
  if analyzer.load_data(csv_path):
200
  logging.info("CSV data loaded successfully")
201
 
202
+ # Use the extracted F gene sequence from boundary model
203
  if analyzer.find_query_sequence(processed_sequence):
204
+ logging.info("F gene sequence found in dataset")
205
  matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
 
206
 
207
+ # Try to create tree with error handling
208
+ try:
209
+ analyzer.create_interactive_tree(matched_ids, perc)
210
+
211
+ # Check for multiple possible HTML file names
212
+ possible_html_files = [
213
+ "phylogenetic_tree_normalized_horizontal.html",
214
+ "phylogenetic_tree.html",
215
+ "tree.html"
216
+ ]
217
+
218
+ for html_filename in possible_html_files:
219
+ if os.path.exists(html_filename):
220
+ html_file = html_filename
221
+ with open(html_filename, "r", encoding='utf-8') as f:
222
+ tree_html_content = f.read()
223
+ ml_output = f"Phylogenetic tree generated successfully with {len(matched_ids)} sequences (similarity: {perc:.1f}%)"
224
+ logging.info(f"Tree HTML file found: {html_filename}")
225
+ break
226
+
227
+ if not html_file:
228
+ # List all HTML files in directory for debugging
229
+ html_files = [f for f in os.listdir('.') if f.endswith('.html')]
230
+ ml_output = f"Tree analysis completed but HTML file not found. Available HTML files: {html_files}"
231
+ logging.warning(f"HTML files in directory: {html_files}")
232
+
233
+ except Exception as tree_error:
234
+ ml_output = f"Tree creation failed: {str(tree_error)}"
235
+ logging.error(f"Tree creation error: {tree_error}")
236
+
237
  else:
238
+ ml_output = f"F gene sequence not found in dataset (length: {len(processed_sequence)} bp)"
239
+ logging.warning(f"F gene sequence not found. Length: {len(processed_sequence)}")
240
  else:
241
  ml_output = "Failed to load CSV dataset"
242
  logging.error("Failed to load CSV dataset")
 
246
  elif not os.path.exists(csv_path):
247
  ml_output = f"CSV dataset not found at {csv_path}"
248
  elif not processed_sequence or len(processed_sequence) < 10:
249
+ ml_output = f"F gene sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
250
  else:
251
  ml_output = "Skipped due to previous step errors"
252
 
 
291
 
292
  with gr.Row():
293
  with gr.Column():
294
+ out1 = gr.Textbox(label="Step 1: Extracted F Gene Sequence", lines=8)
295
+ out2 = gr.Textbox(label="Step 2: F Gene Validation (Keras)", lines=3)
296
  with gr.Column():
297
  out3 = gr.Textbox(label="Dataset Used")
298
+ out4 = gr.Textbox(label="Step 3: Phylogenetic Tree Status", lines=3)
299
 
300
  with gr.Row():
301
  html = gr.File(label="Download Tree (HTML)")