Spaces:
No application file
No application file
Update app.py
Browse files
app.py
CHANGED
|
@@ -140,7 +140,7 @@ def run_pipeline(dna_input):
|
|
| 140 |
dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
|
| 141 |
logging.info("DNA sequence sanitized")
|
| 142 |
|
| 143 |
-
# Step 1: Boundary Prediction
|
| 144 |
processed_sequence = dna_input # This will be the sequence used for downstream analysis
|
| 145 |
boundary_output = ""
|
| 146 |
|
|
@@ -150,10 +150,12 @@ def run_pipeline(dna_input):
|
|
| 150 |
regions = boundary_model.extract_gene_regions(predictions, dna_input)
|
| 151 |
if regions:
|
| 152 |
processed_sequence = regions[0]["sequence"] # Use the extracted gene region
|
| 153 |
-
boundary_output =
|
|
|
|
| 154 |
else:
|
| 155 |
-
boundary_output = f"No gene regions found
|
| 156 |
processed_sequence = dna_input
|
|
|
|
| 157 |
logging.info("Boundary model prediction completed")
|
| 158 |
except Exception as e:
|
| 159 |
logging.error(f"Boundary model failed: {e}")
|
|
@@ -163,88 +165,78 @@ def run_pipeline(dna_input):
|
|
| 163 |
boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
|
| 164 |
processed_sequence = dna_input
|
| 165 |
|
| 166 |
-
# Step 2: Keras Prediction (
|
| 167 |
keras_output = ""
|
| 168 |
if processed_sequence and len(processed_sequence) >= 6:
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
else:
|
| 171 |
-
keras_output = "Skipped: sequence too short for
|
| 172 |
|
| 173 |
-
# Step 3: MAFFT and IQ-TREE (
|
| 174 |
aligned_file = None
|
| 175 |
phy_file = None
|
| 176 |
|
| 177 |
-
#
|
| 178 |
-
|
| 179 |
-
try:
|
| 180 |
-
# Create FASTA file with the actual DNA sequence
|
| 181 |
-
fasta_file = "input_sequence.fasta"
|
| 182 |
-
with open(fasta_file, "w") as f:
|
| 183 |
-
f.write(">query\n" + processed_sequence + "\n")
|
| 184 |
-
|
| 185 |
-
logging.info(f"Created FASTA file with sequence length: {len(processed_sequence)}")
|
| 186 |
-
|
| 187 |
-
# Check if MAFFT is executable
|
| 188 |
-
if os.path.exists(MAFFT_PATH):
|
| 189 |
-
# Make MAFFT executable
|
| 190 |
-
os.chmod(MAFFT_PATH, 0o755)
|
| 191 |
-
|
| 192 |
-
# Run MAFFT
|
| 193 |
-
aligned_file = "aligned.fasta"
|
| 194 |
-
with open(aligned_file, "w") as outfile:
|
| 195 |
-
result = subprocess.run([MAFFT_PATH, "--auto", fasta_file],
|
| 196 |
-
stdout=outfile, stderr=subprocess.PIPE, check=True)
|
| 197 |
-
logging.info("MAFFT alignment completed")
|
| 198 |
-
|
| 199 |
-
# Run IQ-TREE if alignment successful
|
| 200 |
-
if os.path.exists(aligned_file):
|
| 201 |
-
try:
|
| 202 |
-
subprocess.run(["iqtree2", "-s", aligned_file, "-nt", "AUTO"],
|
| 203 |
-
check=True, capture_output=True)
|
| 204 |
-
phy_file = "input_sequence.phy"
|
| 205 |
-
logging.info("IQ-TREE analysis completed")
|
| 206 |
-
except subprocess.CalledProcessError as e:
|
| 207 |
-
logging.error(f"IQ-TREE failed: {e}")
|
| 208 |
-
except FileNotFoundError:
|
| 209 |
-
logging.error("IQ-TREE not found in system PATH")
|
| 210 |
-
else:
|
| 211 |
-
logging.error(f"MAFFT not found at {MAFFT_PATH}")
|
| 212 |
-
except subprocess.CalledProcessError as e:
|
| 213 |
-
logging.error(f"MAFFT failed with exit code {e.returncode}")
|
| 214 |
-
logging.error(f"MAFFT stderr: {e.stderr.decode() if e.stderr else 'No stderr'}")
|
| 215 |
-
except Exception as e:
|
| 216 |
-
logging.error(f"MAFFT/IQ-TREE pipeline failed: {e}")
|
| 217 |
|
| 218 |
-
# Step 4: ML Simplified Tree (using
|
| 219 |
html_file = None
|
| 220 |
tree_html_content = "No tree generated"
|
| 221 |
ml_output = ""
|
| 222 |
|
| 223 |
if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
|
| 224 |
try:
|
| 225 |
-
logging.info(f"Starting ML tree analysis with sequence length: {len(processed_sequence)}")
|
| 226 |
analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
|
| 227 |
|
| 228 |
if analyzer.load_data(csv_path):
|
| 229 |
logging.info("CSV data loaded successfully")
|
| 230 |
|
| 231 |
-
# Use the
|
| 232 |
if analyzer.find_query_sequence(processed_sequence):
|
| 233 |
-
logging.info("
|
| 234 |
matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
|
| 235 |
-
analyzer.create_interactive_tree(matched_ids, perc)
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
else:
|
| 246 |
-
ml_output = f"
|
| 247 |
-
logging.warning(f"
|
| 248 |
else:
|
| 249 |
ml_output = "Failed to load CSV dataset"
|
| 250 |
logging.error("Failed to load CSV dataset")
|
|
@@ -254,7 +246,7 @@ def run_pipeline(dna_input):
|
|
| 254 |
elif not os.path.exists(csv_path):
|
| 255 |
ml_output = f"CSV dataset not found at {csv_path}"
|
| 256 |
elif not processed_sequence or len(processed_sequence) < 10:
|
| 257 |
-
ml_output = f"
|
| 258 |
else:
|
| 259 |
ml_output = "Skipped due to previous step errors"
|
| 260 |
|
|
@@ -299,11 +291,11 @@ with gr.Blocks(title="Viral Gene Phylogenetic Pipeline") as demo:
|
|
| 299 |
|
| 300 |
with gr.Row():
|
| 301 |
with gr.Column():
|
| 302 |
-
out1 = gr.Textbox(label="Step 1:
|
| 303 |
-
out2 = gr.Textbox(label="Step 2:
|
| 304 |
with gr.Column():
|
| 305 |
out3 = gr.Textbox(label="Dataset Used")
|
| 306 |
-
out4 = gr.Textbox(label="Step 3:
|
| 307 |
|
| 308 |
with gr.Row():
|
| 309 |
html = gr.File(label="Download Tree (HTML)")
|
|
|
|
| 140 |
dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
|
| 141 |
logging.info("DNA sequence sanitized")
|
| 142 |
|
| 143 |
+
# Step 1: Boundary Prediction - Extract F gene sequence
|
| 144 |
processed_sequence = dna_input # This will be the sequence used for downstream analysis
|
| 145 |
boundary_output = ""
|
| 146 |
|
|
|
|
| 150 |
regions = boundary_model.extract_gene_regions(predictions, dna_input)
|
| 151 |
if regions:
|
| 152 |
processed_sequence = regions[0]["sequence"] # Use the extracted gene region
|
| 153 |
+
boundary_output = processed_sequence # Output the actual F gene sequence
|
| 154 |
+
logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
|
| 155 |
else:
|
| 156 |
+
boundary_output = f"No F gene regions found in input sequence"
|
| 157 |
processed_sequence = dna_input
|
| 158 |
+
logging.warning("No gene regions found, using full sequence")
|
| 159 |
logging.info("Boundary model prediction completed")
|
| 160 |
except Exception as e:
|
| 161 |
logging.error(f"Boundary model failed: {e}")
|
|
|
|
| 165 |
boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
|
| 166 |
processed_sequence = dna_input
|
| 167 |
|
| 168 |
+
# Step 2: Keras Prediction (F gene validation)
|
| 169 |
keras_output = ""
|
| 170 |
if processed_sequence and len(processed_sequence) >= 6:
|
| 171 |
+
keras_prediction = predict_with_keras(processed_sequence)
|
| 172 |
+
# Interpret keras prediction as F gene validation
|
| 173 |
+
if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
|
| 174 |
+
# You might want to add logic here to interpret the prediction scores
|
| 175 |
+
# For now, just show the prediction
|
| 176 |
+
keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
|
| 177 |
+
else:
|
| 178 |
+
keras_output = keras_prediction
|
| 179 |
else:
|
| 180 |
+
keras_output = "Skipped: sequence too short for F gene validation"
|
| 181 |
|
| 182 |
+
# Step 3: MAFFT and IQ-TREE (skip due to configuration issues)
|
| 183 |
aligned_file = None
|
| 184 |
phy_file = None
|
| 185 |
|
| 186 |
+
# Skip MAFFT due to configuration issues in the container
|
| 187 |
+
logging.info("Skipping MAFFT/IQ-TREE due to container configuration issues")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
+
# Step 4: ML Simplified Tree (using the extracted F gene sequence)
|
| 190 |
html_file = None
|
| 191 |
tree_html_content = "No tree generated"
|
| 192 |
ml_output = ""
|
| 193 |
|
| 194 |
if os.path.exists(csv_path) and processed_sequence and len(processed_sequence) >= 10:
|
| 195 |
try:
|
| 196 |
+
logging.info(f"Starting ML tree analysis with F gene sequence length: {len(processed_sequence)}")
|
| 197 |
analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
|
| 198 |
|
| 199 |
if analyzer.load_data(csv_path):
|
| 200 |
logging.info("CSV data loaded successfully")
|
| 201 |
|
| 202 |
+
# Use the extracted F gene sequence from boundary model
|
| 203 |
if analyzer.find_query_sequence(processed_sequence):
|
| 204 |
+
logging.info("F gene sequence found in dataset")
|
| 205 |
matched_ids, perc = analyzer.find_similar_sequences(analyzer.matching_percentage)
|
|
|
|
| 206 |
|
| 207 |
+
# Try to create tree with error handling
|
| 208 |
+
try:
|
| 209 |
+
analyzer.create_interactive_tree(matched_ids, perc)
|
| 210 |
+
|
| 211 |
+
# Check for multiple possible HTML file names
|
| 212 |
+
possible_html_files = [
|
| 213 |
+
"phylogenetic_tree_normalized_horizontal.html",
|
| 214 |
+
"phylogenetic_tree.html",
|
| 215 |
+
"tree.html"
|
| 216 |
+
]
|
| 217 |
+
|
| 218 |
+
for html_filename in possible_html_files:
|
| 219 |
+
if os.path.exists(html_filename):
|
| 220 |
+
html_file = html_filename
|
| 221 |
+
with open(html_filename, "r", encoding='utf-8') as f:
|
| 222 |
+
tree_html_content = f.read()
|
| 223 |
+
ml_output = f"Phylogenetic tree generated successfully with {len(matched_ids)} sequences (similarity: {perc:.1f}%)"
|
| 224 |
+
logging.info(f"Tree HTML file found: {html_filename}")
|
| 225 |
+
break
|
| 226 |
+
|
| 227 |
+
if not html_file:
|
| 228 |
+
# List all HTML files in directory for debugging
|
| 229 |
+
html_files = [f for f in os.listdir('.') if f.endswith('.html')]
|
| 230 |
+
ml_output = f"Tree analysis completed but HTML file not found. Available HTML files: {html_files}"
|
| 231 |
+
logging.warning(f"HTML files in directory: {html_files}")
|
| 232 |
+
|
| 233 |
+
except Exception as tree_error:
|
| 234 |
+
ml_output = f"Tree creation failed: {str(tree_error)}"
|
| 235 |
+
logging.error(f"Tree creation error: {tree_error}")
|
| 236 |
+
|
| 237 |
else:
|
| 238 |
+
ml_output = f"F gene sequence not found in dataset (length: {len(processed_sequence)} bp)"
|
| 239 |
+
logging.warning(f"F gene sequence not found. Length: {len(processed_sequence)}")
|
| 240 |
else:
|
| 241 |
ml_output = "Failed to load CSV dataset"
|
| 242 |
logging.error("Failed to load CSV dataset")
|
|
|
|
| 246 |
elif not os.path.exists(csv_path):
|
| 247 |
ml_output = f"CSV dataset not found at {csv_path}"
|
| 248 |
elif not processed_sequence or len(processed_sequence) < 10:
|
| 249 |
+
ml_output = f"F gene sequence too short for analysis (length: {len(processed_sequence) if processed_sequence else 0})"
|
| 250 |
else:
|
| 251 |
ml_output = "Skipped due to previous step errors"
|
| 252 |
|
|
|
|
| 291 |
|
| 292 |
with gr.Row():
|
| 293 |
with gr.Column():
|
| 294 |
+
out1 = gr.Textbox(label="Step 1: Extracted F Gene Sequence", lines=8)
|
| 295 |
+
out2 = gr.Textbox(label="Step 2: F Gene Validation (Keras)", lines=3)
|
| 296 |
with gr.Column():
|
| 297 |
out3 = gr.Textbox(label="Dataset Used")
|
| 298 |
+
out4 = gr.Textbox(label="Step 3: Phylogenetic Tree Status", lines=3)
|
| 299 |
|
| 300 |
with gr.Row():
|
| 301 |
html = gr.File(label="Download Tree (HTML)")
|