re-type committed on
Commit
b08a8cc
·
verified ·
1 Parent(s): 78d934e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +223 -127
app.py CHANGED
@@ -30,159 +30,255 @@ boundary_model = None
30
  keras_model = None
31
  kmer_to_index = None
32
 
33
- if os.path.exists(boundary_path):
34
- try:
 
35
  boundary_model = GenePredictor(boundary_path)
36
- logging.info(f"Boundary model loaded successfully from {boundary_path}")
37
- except Exception as e:
38
- logging.error(f"Failed to load Boundary model from {boundary_path}: {e}")
39
- else:
40
- logging.error(f"Boundary model file not found at {boundary_path}")
41
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
42
- try:
 
 
43
  keras_model = load_model(keras_path)
44
  with open(kmer_path, "rb") as f:
45
  kmer_to_index = pickle.load(f)
46
- logging.info(f"Keras model and k-mer index loaded successfully from {keras_path} and {kmer_path}")
47
- except Exception as e:
48
- logging.error(f"Failed to load Keras model or kmer_to_index from {keras_path} or {kmer_path}: {e}")
49
- else:
50
- logging.error(f"Keras model or kmer_to_index file not found at {keras_path} or {kmer_path}")
51
 
52
  # --- Keras Prediction ---
53
  def predict_with_keras(sequence):
54
- if len(sequence) < 6:
55
- return "Sequence too short for k-mer prediction."
56
- kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
57
- indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
58
- input_arr = np.array([indices]) # Changed from torch.tensor to np.array
59
- prediction = keras_model.predict(input_arr, verbose=0)[0]
60
- return ''.join(str(round(p, 3)) for p in prediction)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  # --- FASTA Reader ---
63
- def read_fasta_file(f):
64
- content = f.read().decode("utf-8") if hasattr(f, "read") else open(f, "r").read()
65
- lines = content.strip().split("\n")
66
- seq_lines = [line.strip() for line in lines if not line.startswith(">")]
67
- return ''.join(seq_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  # --- Full Pipeline ---
70
  def run_pipeline_from_file(fasta_file_obj):
71
- dna_input = read_fasta_file(fasta_file_obj)
72
- return run_pipeline(dna_input)
 
 
 
 
 
 
 
73
 
74
  def run_pipeline(dna_input):
75
- dna_input = dna_input.upper()
76
- if not re.match('^[ACTGN]+$', dna_input):
77
- dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
78
- logging.warning("Invalid DNA sequence characters replaced with 'N'.")
79
-
80
- # Step 1: Boundary Prediction
81
- if boundary_model:
82
- try:
83
- predictions, probs, confidence = boundary_model.predict(dna_input)
84
- regions = boundary_model.extract_gene_regions(predictions, dna_input)
85
- step1_out = regions[0]["sequence"] if regions else dna_input
86
- logging.info(f"Boundary model output: {step1_out[:50]}... (truncated)")
87
- except Exception as e:
88
- logging.error(f"Boundary model prediction failed: {e}")
89
- step1_out = dna_input
90
- else:
91
- step1_out = dna_input
92
- logging.info("Boundary model skipped due to loading failure or missing file")
93
 
94
- # Step 2: Keras Prediction
95
- if keras_model and kmer_to_index:
96
- try:
97
- step2_out = predict_with_keras(step1_out)
98
- logging.info(f"Keras model output: {step2_out[:50]}... (truncated)")
99
- except Exception as e:
100
- logging.error(f"Keras prediction failed: {e}")
101
- step2_out = step1_out
102
- else:
103
- step2_out = step1_out
104
- logging.info("Keras model skipped due to loading failure or missing file")
 
 
 
105
 
106
- # Save to FASTA for MAFFT/IQTREE (optional, can be skipped if ML tree is independent)
107
- fasta_file = "input_sequence.fasta"
108
- with open(fasta_file, "w") as f:
109
- f.write(">query\n" + step2_out + "\n")
 
110
 
111
- # Run MAFFT
112
- aligned_file = "aligned.fasta"
113
- mafft_exec = MAFFT_PATH # Use global variable
114
- try:
115
- subprocess.run([mafft_exec, "--auto", fasta_file], stdout=open(aligned_file, "w"), check=True)
116
- except Exception as e:
117
  aligned_file = None
118
- logging.error(f"MAFFT failed: {e}")
119
-
120
- # Run IQ-TREE (only if alignment exists)
121
- phy_file = "input_sequence.phy"
122
- if aligned_file is not None:
123
- try:
124
- subprocess.run(["iqtree2", "-s", aligned_file, "-nt", "AUTO"], check=True)
125
- except Exception as e:
126
- logging.error(f"IQ-TREE failed: {e}")
127
- else:
128
- logging.warning("Skipping IQ-TREE due to missing alignment file")
129
-
130
- # Step 3: ML Simplified Tree (independent of MAFFT/IQ-TREE)
131
- html_file = "phylogenetic_tree_normalized_horizontal.html"
132
- ml_output = ""
133
- tree_html_content = "No tree generated."
134
- if os.path.exists(csv_path):
135
- analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
136
- if analyzer.load_data(csv_path):
137
- if analyzer.find_query_sequence(step2_out):
138
- matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
139
- try:
140
- analyzer.create_interactive_tree(matched_ids, perc)
141
- ml_output = "Tree generated."
142
- if os.path.exists(html_file):
143
- with open(html_file, "r") as f:
144
- tree_html_content = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  else:
146
- ml_output += " (HTML file not generated)"
147
- logging.error("HTML file not created after tree generation")
148
- except Exception as e:
149
- ml_output = f"Error creating tree: {e}"
150
- logging.error(f"Tree creation failed: {e}")
151
- else:
152
- ml_output = "Query sequence not found in CSV."
153
- logging.warning(f"Query sequence {step2_out[:50]}... not found")
154
  else:
155
- ml_output = "Failed to load CSV."
156
- logging.error("CSV loading failed")
157
- else:
158
- ml_output = "CSV file missing."
159
- logging.error(f"CSV file not found at {csv_path}")
160
 
161
- return step1_out, step2_out, csv_path, ml_output, html_file, aligned_file, phy_file, tree_html_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  # --- Gradio UI ---
164
- with gr.Blocks() as demo:
165
  gr.Markdown("# Viral Gene Phylogenetic Inference Pipeline")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- with gr.Tab("Paste DNA"):
168
- inp = gr.Textbox(label="DNA Input")
169
- btn1 = gr.Button("Run Pipeline")
170
-
171
- with gr.Tab("Upload FASTA"):
172
- file_input = gr.File(label="FASTA File", file_types=['.fasta', '.fa'])
173
- btn2 = gr.Button("Run on FASTA")
174
-
175
- out1 = gr.Textbox(label="Boundary Model Output")
176
- out2 = gr.Textbox(label="Keras Model Output")
177
- out3 = gr.Textbox(label="CSV File Used")
178
- out4 = gr.Textbox(label="ML Tree Output")
179
- html = gr.File(label="ML Tree (HTML)", file_types=['.html'])
180
- fasta = gr.File(label="Aligned FASTA", file_types=['.fasta'])
181
- phy = gr.File(label="IQ-TREE .phy File", file_types=['.phy'])
182
  tree_html = gr.HTML(label="Interactive Tree Preview")
183
 
184
- btn1.click(fn=run_pipeline, inputs=inp, outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html])
185
- btn2.click(fn=run_pipeline_from_file, inputs=file_input, outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html])
 
 
 
 
 
 
 
 
 
186
 
187
  if __name__ == '__main__':
188
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
30
  keras_model = None
31
  kmer_to_index = None
32
 
33
+ # Try to load boundary model
34
+ try:
35
+ if os.path.exists(boundary_path):
36
  boundary_model = GenePredictor(boundary_path)
37
+ logging.info("Boundary model loaded successfully.")
38
+ else:
39
+ logging.warning(f"Boundary model file not found at {boundary_path}")
40
+ except Exception as e:
41
+ logging.error(f"Failed to load boundary model: {e}")
42
+
43
+ # Try to load Keras model
44
+ try:
45
+ if os.path.exists(keras_path) and os.path.exists(kmer_path):
46
  keras_model = load_model(keras_path)
47
  with open(kmer_path, "rb") as f:
48
  kmer_to_index = pickle.load(f)
49
+ logging.info("Keras model and k-mer index loaded successfully.")
50
+ else:
51
+ logging.warning(f"Keras model or kmer files not found at {keras_path} or {kmer_path}")
52
+ except Exception as e:
53
+ logging.error(f"Failed to load Keras model: {e}")
54
 
55
  # --- Keras Prediction ---
56
  def predict_with_keras(sequence):
57
+ try:
58
+ if not keras_model or not kmer_to_index:
59
+ return f"Keras model not available. Input sequence: {sequence[:100]}..."
60
+
61
+ if len(sequence) < 6:
62
+ return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
63
+
64
+ # Generate k-mers
65
+ kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
66
+ indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
67
+
68
+ # Prepare input
69
+ input_arr = np.array([indices])
70
+ prediction = keras_model.predict(input_arr, verbose=0)[0]
71
+
72
+ # Format prediction
73
+ result = ''.join([str(round(p, 3)) for p in prediction])
74
+ return result
75
+ except Exception as e:
76
+ logging.error(f"Keras prediction failed: {e}")
77
+ return f"Keras prediction failed: {str(e)}"
78
 
79
  # --- FASTA Reader ---
80
+ def read_fasta_file(file_obj):
81
+ try:
82
+ if file_obj is None:
83
+ return ""
84
+
85
+ # Handle file object
86
+ if hasattr(file_obj, 'name'):
87
+ with open(file_obj.name, "r") as f:
88
+ content = f.read()
89
+ else:
90
+ content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
91
+
92
+ lines = content.strip().split("\n")
93
+ seq_lines = [line.strip() for line in lines if not line.startswith(">")]
94
+ return ''.join(seq_lines)
95
+ except Exception as e:
96
+ logging.error(f"Failed to read FASTA file: {e}")
97
+ return ""
98
 
99
  # --- Full Pipeline ---
100
  def run_pipeline_from_file(fasta_file_obj):
101
+ try:
102
+ dna_input = read_fasta_file(fasta_file_obj)
103
+ if not dna_input:
104
+ return "Failed to read FASTA file", "", "", "", None, None, None, "No input sequence"
105
+ return run_pipeline(dna_input)
106
+ except Exception as e:
107
+ error_msg = f"Pipeline error: {str(e)}"
108
+ logging.error(error_msg)
109
+ return error_msg, "", "", "", None, None, None, error_msg
110
 
111
  def run_pipeline(dna_input):
112
+ try:
113
+ # Clean input
114
+ dna_input = dna_input.upper().strip()
115
+ if not dna_input:
116
+ return "Empty input", "", "", "", None, None, None, "No input provided"
117
+
118
+ # Sanitize DNA sequence
119
+ if not re.match('^[ACTGN]+$', dna_input):
120
+ dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
121
+ logging.info("DNA sequence sanitized")
 
 
 
 
 
 
 
 
122
 
123
+ # Step 1: Boundary Prediction
124
+ step1_out = dna_input # Default
125
+ if boundary_model:
126
+ try:
127
+ predictions, probs, confidence = boundary_model.predict(dna_input)
128
+ regions = boundary_model.extract_gene_regions(predictions, dna_input)
129
+ if regions:
130
+ step1_out = regions[0]["sequence"]
131
+ logging.info("Boundary model prediction completed")
132
+ except Exception as e:
133
+ logging.error(f"Boundary model failed: {e}")
134
+ step1_out = f"Boundary model error: {str(e)}"
135
+ else:
136
+ step1_out = f"Boundary model not available. Using original input: {dna_input[:100]}..."
137
 
138
+ # Step 2: Keras Prediction
139
+ if isinstance(step1_out, str) and not step1_out.startswith("Boundary model error"):
140
+ step2_out = predict_with_keras(step1_out)
141
+ else:
142
+ step2_out = "Skipped due to boundary model error"
143
 
144
+ # Step 3: MAFFT and IQ-TREE
 
 
 
 
 
145
  aligned_file = None
146
+ phy_file = None
147
+
148
+ # Only proceed if we have valid sequence data
149
+ if step2_out and not step2_out.startswith(("Keras", "Skipped")):
150
+ try:
151
+ # Create FASTA file
152
+ fasta_file = "input_sequence.fasta"
153
+ with open(fasta_file, "w") as f:
154
+ f.write(">query\n" + step2_out + "\n")
155
+
156
+ # Check if MAFFT is executable
157
+ if os.path.exists(MAFFT_PATH):
158
+ # Make MAFFT executable
159
+ os.chmod(MAFFT_PATH, 0o755)
160
+
161
+ # Run MAFFT
162
+ aligned_file = "aligned.fasta"
163
+ with open(aligned_file, "w") as outfile:
164
+ result = subprocess.run([MAFFT_PATH, "--auto", fasta_file],
165
+ stdout=outfile, stderr=subprocess.PIPE, check=True)
166
+ logging.info("MAFFT alignment completed")
167
+
168
+ # Run IQ-TREE if alignment successful
169
+ if os.path.exists(aligned_file):
170
+ try:
171
+ subprocess.run(["iqtree2", "-s", aligned_file, "-nt", "AUTO"],
172
+ check=True, capture_output=True)
173
+ phy_file = "input_sequence.phy"
174
+ logging.info("IQ-TREE analysis completed")
175
+ except subprocess.CalledProcessError as e:
176
+ logging.error(f"IQ-TREE failed: {e}")
177
+ except FileNotFoundError:
178
+ logging.error("IQ-TREE not found in system PATH")
179
+ else:
180
+ logging.error(f"MAFFT not found at {MAFFT_PATH}")
181
+ except Exception as e:
182
+ logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
183
+
184
+ # Step 4: ML Simplified Tree
185
+ html_file = None
186
+ tree_html_content = "No tree generated"
187
+ ml_output = ""
188
+
189
+ if os.path.exists(csv_path) and step2_out and not step2_out.startswith(("Keras", "Skipped")):
190
+ try:
191
+ analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
192
+ if analyzer.load_data(csv_path):
193
+ if analyzer.find_query_sequence(step2_out):
194
+ matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
195
+ analyzer.create_interactive_tree(matched_ids, perc)
196
+
197
+ html_filename = "phylogenetic_tree_normalized_horizontal.html"
198
+ if os.path.exists(html_filename):
199
+ html_file = html_filename
200
+ with open(html_filename, "r") as f:
201
+ tree_html_content = f.read()
202
+ ml_output = f"Tree generated successfully with {len(matched_ids)} sequences"
203
+ else:
204
+ ml_output = "Tree generation completed but HTML file not found"
205
  else:
206
+ ml_output = "Query sequence not found in dataset"
207
+ else:
208
+ ml_output = "Failed to load CSV dataset"
209
+ except Exception as e:
210
+ ml_output = f"ML Tree analysis failed: {str(e)}"
211
+ logging.error(f"ML Tree failed: {e}")
212
+ elif not os.path.exists(csv_path):
213
+ ml_output = f"CSV dataset not found at {csv_path}"
214
  else:
215
+ ml_output = "Skipped due to previous step errors"
 
 
 
 
216
 
217
+ return (
218
+ step1_out[:500] + "..." if len(step1_out) > 500 else step1_out, # Truncate long outputs
219
+ step2_out[:500] + "..." if len(step2_out) > 500 else step2_out,
220
+ csv_path if os.path.exists(csv_path) else "CSV file not found",
221
+ ml_output,
222
+ html_file,
223
+ aligned_file if aligned_file and os.path.exists(aligned_file) else None,
224
+ phy_file if phy_file and os.path.exists(phy_file) else None,
225
+ tree_html_content
226
+ )
227
+
228
+ except Exception as e:
229
+ error_msg = f"Pipeline failed: {str(e)}"
230
+ logging.error(error_msg)
231
+ return error_msg, "", "", "", None, None, None, error_msg
232
 
233
  # --- Gradio UI ---
234
+ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline") as demo:
235
  gr.Markdown("# Viral Gene Phylogenetic Inference Pipeline")
236
+ gr.Markdown("This pipeline processes DNA sequences through boundary detection, k-mer analysis, and phylogenetic tree construction.")
237
+
238
+ with gr.Tab("Paste DNA Sequence"):
239
+ inp = gr.Textbox(
240
+ label="DNA Input",
241
+ placeholder="Paste your DNA sequence here (ACTG format)",
242
+ lines=5
243
+ )
244
+ btn1 = gr.Button("Run Pipeline", variant="primary")
245
+
246
+ with gr.Tab("Upload FASTA File"):
247
+ file_input = gr.File(
248
+ label="FASTA File",
249
+ file_types=['.fasta', '.fa', '.txt']
250
+ )
251
+ btn2 = gr.Button("Run on FASTA", variant="primary")
252
+
253
+ # Outputs
254
+ gr.Markdown("## Pipeline Results")
255
+
256
+ with gr.Row():
257
+ with gr.Column():
258
+ out1 = gr.Textbox(label="Step 1: Boundary Model Output", lines=3)
259
+ out2 = gr.Textbox(label="Step 2: Keras Model Output", lines=3)
260
+ with gr.Column():
261
+ out3 = gr.Textbox(label="Dataset Used")
262
+ out4 = gr.Textbox(label="Step 3: ML Tree Status", lines=3)
263
+
264
+ with gr.Row():
265
+ html = gr.File(label="Download Tree (HTML)")
266
+ fasta = gr.File(label="Download Aligned FASTA")
267
+ phy = gr.File(label="Download IQ-TREE .phy File")
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  tree_html = gr.HTML(label="Interactive Tree Preview")
270
 
271
+ # Event handlers
272
+ btn1.click(
273
+ fn=run_pipeline,
274
+ inputs=inp,
275
+ outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html]
276
+ )
277
+ btn2.click(
278
+ fn=run_pipeline_from_file,
279
+ inputs=file_input,
280
+ outputs=[out1, out2, out3, out4, html, fasta, phy, tree_html]
281
+ )
282
 
283
  if __name__ == '__main__':
284
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False)