re-type committed on
Commit
e2795d4
·
verified ·
1 Parent(s): 3f97919

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +439 -416
app.py CHANGED
@@ -12,503 +12,526 @@ import tempfile
12
  import shutil
13
  import sys
14
  from pathlib import Path
15
- try:
16
- from predictor import GenePredictor
17
- except ImportError:
18
- GenePredictor = None
19
- try:
20
- from tensorflow.keras.models import load_model
21
- except ImportError:
22
- load_model = None
23
- try:
24
- import ml_simplified_tree
25
- except ImportError:
26
- ml_simplified_tree = None
27
- from huggingface_hub import hf_hub_download
28
 
29
  # --- Global Variables ---
30
- MAFFT_PATH = "/usr/bin/mafft" # Common path in Hugging Face Spaces
31
- IQTREE_PATH = "/usr/bin/iqtree3" # Common path in Hugging Face Spaces
32
- CSV_PATH = "f cleaned.csv" # Persistent storage in Hugging Face
33
- MODEL_REPO = "GGproject10/best_boundary_aware_model"
34
-
35
- # --- Logging Setup ---
36
- logging.basicConfig(
37
- level=logging.INFO,
38
- format='%(asctime)s - %(levelname)s - %(message)s',
39
- handlers=[
40
- logging.FileHandler('/data/gene_analysis.log'),
41
- logging.StreamHandler(sys.stdout)
42
- ]
43
- )
44
 
45
- # --- Model Variables ---
46
- boundary_model = None
47
- keras_model = None
48
- kmer_to_index = None
49
- analyzer = None
50
 
51
- --- Load Models ---
52
  boundary_model = None
53
  keras_model = None
54
  kmer_to_index = None
 
55
 
56
- # Try to load boundary model from Hugging Face Hub
57
- try:
58
- boundary_path = hf_hub_download(
59
- repo_id=model_repo,
60
- filename="best_boundary_aware_model.pth",
61
- token=hf_token
62
- )
63
- if os.path.exists(boundary_path):
64
- boundary_model = GenePredictor(boundary_path)
65
- logging.info("Boundary model loaded successfully from Hugging Face Hub.")
66
- else:
67
- logging.warning(f"Boundary model file not found after download")
68
- except Exception as e:
69
- logging.error(f"Failed to load boundary model from HF Hub: {e}")
70
-
71
- # Try to load Keras model from Hugging Face Hub
72
- try:
73
- keras_path = hf_hub_download(
74
- repo_id=model_repo,
75
- filename="best_model.keras",
76
- token=hf_token
77
- )
78
- kmer_path = hf_hub_download(
79
- repo_id=model_repo,
80
- filename="kmer_to_index.pkl",
81
- token=hf_token
82
- )
83
 
84
- if os.path.exists(keras_path) and os.path.exists(kmer_path):
85
- keras_model = load_model(keras_path)
86
- with open(kmer_path, "rb") as f:
87
- kmer_to_index = pickle.load(f)
88
- logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
89
- else:
90
- logging.warning(f"Keras model or kmer files not found after download")
91
- except Exception as e:
92
- logging.error(f"Failed to load Keras model from HF Hub: {e}")
93
-
94
- # --- Load Verification Models from models directory ---
95
- verification_models = {}
96
-
97
- def load_verification_models():
98
- """Load all verification models from the models directory"""
99
- global verification_models
100
- models_dir = "models"
 
 
 
 
 
 
 
 
 
 
101
 
102
- if not os.path.exists(models_dir):
103
- logging.warning(f"Models directory not found: {models_dir}")
104
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- # Load different types of verification models
107
- model_files = {
108
- "boundary_model": "best_boundary_aware_model.pth",
109
- "keras_model": "best_model.keras",
110
- "kmer_index": "kmer_to_index.pkl",
111
- "additional_model_1": "verification_model_1.pth", # Add your model names here
112
- "additional_model_2": "verification_model_2.keras",
113
- # Add more models as needed
114
- }
115
 
116
- for model_name, filename in model_files.items():
117
- model_path = os.path.join(models_dir, filename)
118
-
119
- try:
120
- if os.path.exists(model_path):
121
- if filename.endswith('.pth'):
122
- # PyTorch model
123
- if model_name == "boundary_model":
124
- verification_models[model_name] = GenePredictor(model_path)
125
- else:
126
- verification_models[model_name] = torch.load(model_path, map_location='cpu')
127
-
128
- elif filename.endswith('.keras'):
129
- # Keras model
130
- verification_models[model_name] = load_model(model_path)
131
-
132
- elif filename.endswith('.pkl'):
133
- # Pickle file
134
- with open(model_path, 'rb') as f:
135
- verification_models[model_name] = pickle.load(f)
136
-
137
- logging.info(f"Loaded verification model: {model_name}")
138
-
139
- except Exception as e:
140
- logging.error(f"Failed to load {model_name} from {model_path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- # Load verification models at startup
143
- load_verification_models()
 
144
 
145
- # --- Initialize Tree Analyzer ---
146
- analyzer = None
147
  try:
148
- analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
149
  if os.path.exists(csv_path):
150
- if analyzer.load_data(csv_path):
151
- logging.info("Tree analyzer initialized successfully")
152
- # Try to train AI model (optional)
153
- try:
154
- if not analyzer.train_ai_model():
155
- logging.warning("AI model training failed; proceeding with basic analysis.")
156
- except Exception as e:
157
- logging.warning(f"AI model training failed: {e}")
158
- else:
159
- logging.error("Failed to load CSV data for tree analyzer")
160
- analyzer = None
161
  else:
162
- logging.error(f"CSV file not found: {csv_path}")
163
- analyzer = None
164
  except Exception as e:
165
- logging.error(f"Failed to initialize tree analyzer: {e}")
166
- analyzer = None
167
-
168
- # --- Initialize Tree Analyzer ---
169
- def init_tree_analyzer():
170
- global analyzer
171
- if ml_simplified_tree and os.path.exists(CSV_PATH):
172
- try:
173
- analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
174
- if analyzer.load_data(CSV_PATH):
175
- logging.info("Tree analyzer initialized successfully.")
176
- else:
177
- logging.error("Failed to load CSV data.")
178
- analyzer = None
179
- except Exception as e:
180
- logging.error(f"Failed to initialize tree analyzer: {e}")
181
- analyzer = None
182
- else:
183
- logging.warning("Tree analyzer or CSV file not available.")
184
- analyzer = None
185
 
186
  # --- Tool Detection ---
187
- def check_tool_availability():
188
- mafft_cmd = shutil.which(MAFFT_PATH) or shutil.which("mafft")
189
- iqtree_cmd = shutil.which(IQTREE_PATH) or shutil.which("iqtree3")
190
- return bool(mafft_cmd), bool(iqtree_cmd), mafft_cmd, iqtree_cmd
191
-
192
- # --- Installation Guide ---
193
- def install_dependencies_guide():
194
- return """
195
- 🔧 DEPENDENCY SETUP FOR HUGGING FACE SPACES:
196
-
197
- 1. Add to requirements.txt:
198
- - mafft
199
- - iqtree
200
- 2. Place f_cleaned.csv in the repository root.
201
- 3. Ensure HF_TOKEN is set in Space secrets for model downloads.
202
- 4. If dependencies fail, contact Hugging Face support or use a custom Docker image.
203
- """
204
-
205
- # --- MAFFT and IQ-TREE Functions ---
206
- def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
207
- try:
208
- cmd = [mafft_cmd, '--auto', '--quiet', input_fasta]
209
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) # Reduced timeout for HF
210
- if result.returncode == 0:
211
- with open(output_fasta, 'w') as f:
212
- f.write(result.stdout)
213
- if os.path.getsize(output_fasta) > 0:
214
- logging.info(f"MAFFT alignment completed: {output_fasta}")
215
- return True, output_fasta
216
- return False, "MAFFT output empty."
217
- return False, f"MAFFT error: {result.stderr.strip() or 'Unknown error'}"
218
- except Exception as e:
219
- logging.error(f"MAFFT failed: {e}")
220
- return False, f"MAFFT failed: {str(e)}"
221
-
222
- def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
223
- try:
224
- cmd = [
225
- iqtree_cmd, '-s', aligned_fasta, '-m', 'GTR', '-nt', '1', # Simplified for HF resources
226
- '--prefix', output_prefix, '--quiet'
227
- ]
228
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=600) # Reduced timeout
229
- tree_file = f"{output_prefix}.treefile"
230
- if result.returncode == 0 and os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
231
- logging.info(f"IQ-TREE completed: {tree_file}")
232
- return True, tree_file
233
- return False, f"IQ-TREE error: {result.stderr.strip() or 'Tree file not generated'}"
234
- except Exception as e:
235
- logging.error(f"IQ-TREE failed: {e}")
236
- return False, f"IQ-TREE failed: {str(e)}"
237
-
238
- # --- Create Multi-FASTA ---
239
- def create_multi_fasta(query_sequence, query_id="Query_F_Gene"):
240
- try:
241
- temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False, dir="/data")
242
- temp_fasta.write(f">{query_id}\n{query_sequence}\n")
243
- ref_fasta_path = "/data/f_gene_sequences_aligned.fasta"
244
- if os.path.exists(ref_fasta_path):
245
- with open(ref_fasta_path, 'r') as ref_file:
246
- temp_fasta.write(ref_file.read())
247
- elif analyzer and hasattr(analyzer, 'data'):
248
- count = 0
249
- for idx, row in analyzer.data.iterrows():
250
- if 'sequence' in row and len(str(row['sequence'])) > 50:
251
- temp_fasta.write(f">{row.get('id', f'Ref_{count}')}\n{str(row['sequence']).upper()}\n")
252
- count += 1
253
- if count >= 10: # Reduced for HF
254
- break
255
- temp_fasta.close()
256
- return temp_fasta.name
257
- except Exception as e:
258
- logging.error(f"Multi-FASTA creation failed: {e}")
259
- return None
260
 
261
- # --- Pipeline: Maximum Likelihood Tree ---
262
- def build_maximum_likelihood_tree(sequence):
 
263
  try:
264
- sequence = re.sub(r'[^ATCG]', '', sequence.upper())
265
- if len(sequence) < 50:
266
- return False, "Sequence too short (<50 bp).", None, None
267
 
268
- mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
269
- status_msg = "🔍 Dependencies:\n"
270
- status_msg += f"✅ MAFFT: {mafft_cmd or 'Not found'}\n"
271
- status_msg += f"✅ IQ-TREE: {iqtree_cmd or 'Not found'}\n"
272
 
273
- if not mafft_available or not iqtree_available:
274
- guide = install_dependencies_guide()
275
- return False, f"{status_msg}\n❌ Missing tools:\n{guide}", None, None
276
 
277
- os.makedirs("/data/ml_tree_output", exist_ok=True)
278
- multi_fasta = create_multi_fasta(sequence)
279
- if not multi_fasta:
280
- return False, f"{status_msg}\n❌ Failed to create input FASTA.", None, None
281
 
282
- aligned_fasta = "/data/ml_tree_output/aligned_sequences.fasta"
283
- mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
284
- os.unlink(multi_fasta)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
- if not mafft_success:
287
- return False, f"{status_msg}\n❌ MAFFT failed: {mafft_result}", None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- tree_prefix = "/data/ml_tree_output/ml_tree"
290
- iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
291
- if not iqtree_success:
292
- return False, f"{status_msg}\n❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
 
 
 
 
293
 
294
- tree_file = iqtree_result
295
- shutil.copy2(aligned_fasta, "/data/f_gene_sequences_aligned.fasta")
296
- shutil.copy2(tree_file, "/data/f_gene_sequences.phy.treefile")
297
 
298
- success_msg = f"{status_msg}\n✅ ML tree built:\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}"
299
- return True, success_msg, aligned_fasta, tree_file
300
  except Exception as e:
301
- logging.error(f"ML tree construction failed: {e}")
302
- return False, f"ML tree construction failed: {str(e)}", None, None
303
-
304
- # --- Pipeline: Verification ---
305
- def run_verification_pipeline(sequence):
306
- results = {}
307
- sequence = re.sub(r'[^ATCG]', '', sequence.upper())
308
- if len(sequence) < 10:
309
- results["error"] = "Sequence too short (<10 bp)."
310
- return results
311
-
312
- # Boundary model verification
313
- if boundary_model:
314
- try:
315
- predictions, probs, confidence = boundary_model.predict(sequence)
316
- regions = boundary_model.extract_gene_regions(predictions, sequence)
317
- results["boundary_model"] = {
318
- "type": "boundary_detection",
319
- "confidence": float(confidence),
320
- "regions_found": len(regions) if regions else 0,
321
- "extracted_sequence": regions[0]["sequence"] if regions else None
322
- }
323
- except Exception as e:
324
- results["boundary_model"] = {"error": f"Boundary prediction failed: {str(e)}"}
325
-
326
- # Keras model verification
327
- if keras_model and kmer_to_index:
328
- try:
329
- kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
330
- indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
331
- input_arr = np.array([indices])
332
- prediction = keras_model.predict(input_arr, verbose=0)[0]
333
- results["keras_model"] = {
334
- "type": "gene_validation",
335
- "mean_score": float(np.mean(prediction)),
336
- "max_score": float(np.max(prediction))
337
- }
338
- except Exception as e:
339
- results["keras_model"] = {"error": f"Keras prediction failed: {str(e)}"}
340
-
341
- return results
342
-
343
- # --- Format Results ---
344
- def format_results(results, sequence, pipeline_type):
345
- output = [f"🧬 {pipeline_type.upper()} ANALYSIS\nSequence length: {len(sequence)} bp\n{'=' * 50}"]
346
-
347
- if "error" in results:
348
- output.append(f"❌ Error: {results['error']}")
349
- return "\n".join(output)
350
-
351
- if pipeline_type == "prediction":
352
- if boundary_model and "boundary_model" in results:
353
- r = results["boundary_model"]
354
- if "error" not in r:
355
- output.append("\n🎯 Boundary Detection:")
356
- output.append(f"- Confidence: {r['confidence']:.3f}")
357
- output.append(f"- Regions Found: {r['regions_found']}")
358
- if r['extracted_sequence']:
359
- output.append(f"- Extracted Length: {len(r['extracted_sequence'])} bp")
360
- else:
361
- output.append(f"\n❌ Boundary Detection: {r['error']}")
362
-
363
- if keras_model and "keras_model" in results:
364
- r = results["keras_model"]
365
- if "error" not in r:
366
- output.append("\n🔍 Keras Validation:")
367
- output.append(f"- Mean Score: {r['mean_score']:.3f}")
368
- output.append(f"- Max Score: {r['max_score']:.3f}")
369
- else:
370
- output.append(f"\n❌ Keras Validation: {r['error']}")
371
-
372
- elif pipeline_type == "tree":
373
- output.append(results.get("message", "No tree results available."))
374
- if results.get("tree_file"):
375
- output.append(f"\nTree File: {os.path.basename(results['tree_file'])}")
376
-
377
- return "\n".join(output)
378
-
379
- # --- Interface Functions ---
380
- def analyze_sequence(sequence):
381
- sequence = re.sub(r'[^ATCG]', '', sequence.upper())
382
- if not sequence or len(sequence) < 10:
383
- return "Invalid or too short sequence (<10 bp)."
384
-
385
- results = run_verification_pipeline(sequence)
386
- return format_results(results, sequence, "prediction")
387
 
388
- def build_tree(sequence):
389
- success, message, aligned_fasta, tree_file = build_maximum_likelihood_tree(sequence)
390
- return format_results({"message": message, "tree_file": tree_file}, sequence, "tree")
391
-
392
- # --- File Processing ---
393
  def process_fasta_file(file):
 
394
  try:
395
- if not file:
396
  return "Please upload a FASTA file."
397
 
 
 
 
 
 
398
  sequences = {}
399
  current_seq = ""
400
  current_name = ""
401
- with open(file.name, 'r') as f:
402
- for line in f:
403
- line = line.strip()
404
- if line.startswith('>'):
405
- if current_name and current_seq:
406
- sequences[current_name] = current_seq
407
- current_name = line[1:]
408
- current_seq = ""
409
- else:
410
- current_seq += line.upper()
 
 
411
  if current_name and current_seq:
412
  sequences[current_name] = current_seq
413
 
414
  if not sequences:
415
- return "No valid sequences in FASTA file."
 
 
 
 
 
 
416
 
417
- results = [f"📁 FASTA FILE ANALYSIS\nFound {len(sequences)} sequences\n{'=' * 50}"]
418
  for i, (name, seq) in enumerate(sequences.items()):
419
- if i >= 3: # Reduced for HF
420
- results.append(f"\n... and {len(sequences) - 3} more sequences")
421
  break
422
- results.append(f"\n🧬 Sequence: {name}\nLength: {len(seq)} bp")
 
 
 
423
  clean_seq = re.sub(r'[^ATCG]', '', seq)
424
  if len(clean_seq) >= 10:
425
- results.append(analyze_sequence(clean_seq))
 
426
  else:
427
  results.append("❌ Sequence too short or invalid")
 
428
  results.append("-" * 40)
429
 
430
  return "\n".join(results)
 
431
  except Exception as e:
432
- logging.error(f"FASTA processing failed: {e}")
433
  return f"FASTA processing failed: {str(e)}"
434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  # --- Gradio Interface ---
436
- def create_gradio_interface():
 
 
437
  css = """
438
- .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
439
- .output-text { font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4; }
440
- .input-section { margin-bottom: 20px; }
 
 
 
 
 
441
  """
442
 
443
  with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
444
  gr.Markdown("""
445
  # 🧬 Gene Analysis Tool
446
- Analyze DNA sequences, predict gene boundaries, and build phylogenetic trees.
 
447
  """)
448
 
449
- # Input Section
450
- with gr.Row():
451
- with gr.Column(scale=1):
452
- seq_input = gr.Textbox(
453
- label="DNA Sequence",
454
- placeholder="Enter DNA sequence (A, T, C, G only)...",
455
- lines=5,
456
- max_lines=10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  )
458
- file_input = gr.File(
459
- label="Upload FASTA File",
460
- file_types=[".fasta", ".fa", ".fas", ".txt"]
 
 
461
  )
462
- analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
463
- tree_btn = gr.Button("🌳 Build Tree", variant="primary")
464
 
465
- with gr.Column(scale=2):
466
- output = gr.Textbox(
467
- label="Results",
468
- lines=15,
469
- max_lines=20,
470
- elem_classes=["output-text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  )
472
 
473
- # Status Information
474
- gr.Markdown("### Tool Status")
475
- status = []
476
- status.append(f"✅ Boundary Model: {'Loaded' if boundary_model else 'Not Available'}")
477
- status.append(f"✅ Keras Model: {'Loaded' if keras_model else 'Not Available'}")
478
- status.append(f"✅ Tree Analyzer: {'Initialized' if analyzer else 'Not Available'}")
479
- mafft_available, iqtree_available, _, _ = check_tool_availability()
480
- status.append(f"✅ MAFFT: {'Available' if mafft_available else 'Not Available'}")
481
- status.append(f"✅ IQ-TREE: {'Available' if iqtree_available else 'Not Available'}")
482
- gr.Markdown("\n".join(status))
483
-
484
- # Event Handlers
485
- analyze_btn.click(fn=analyze_sequence, inputs=seq_input, outputs=output)
486
- tree_btn.click(fn=build_tree, inputs=seq_input, outputs=output)
487
- file_input.change(fn=process_fasta_file, inputs=file_input, outputs=output)
488
 
489
  return interface
490
 
491
- # --- Main ---
492
  if __name__ == "__main__":
493
- os.makedirs("/data", exist_ok=True)
494
- os.makedirs("/data/ml_tree_output", exist_ok=True)
495
- os.makedirs("/data/models", exist_ok=True)
496
-
497
- load_models()
498
- init_tree_analyzer()
499
 
 
500
  logging.info("Starting Gene Analysis Tool")
501
- logging.info(f"Boundary model: {boundary_model is not None}")
502
- logging.info(f"Keras model: {keras_model is not None}")
503
- logging.info(f"Tree analyzer: {analyzer is not None}")
504
 
 
505
  try:
506
- interface = create_gradio_interface()
507
  interface.launch(
 
508
  server_name="0.0.0.0",
509
  server_port=7860,
510
- share=False # Managed by Hugging Face
511
  )
512
  except Exception as e:
513
- logging.error(f"Interface launch failed: {e}")
514
  sys.exit(1)
 
12
  import shutil
13
  import sys
14
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # --- Global Variables ---
17
+ MAFFT_PATH = "mafft/mafftdir/bin/mafft"
18
+ IQTREE_PATH = "iqtree/bin/iqtree2"
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ # --- Logging ---
21
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
 
22
 
23
+ # --- Model Loading ---
24
  boundary_model = None
25
  keras_model = None
26
  kmer_to_index = None
27
+ csv_data = None
28
 
29
+ # Simple predictor class (fallback)
30
class SimpleGenePredictor:
    """Heuristic stand-in used when no trained boundary model is available.

    Every 3-base window of the input is labelled as a start codon (1), a stop
    codon (2) or anything else (0); open reading frames are then assembled
    from those labels. Codon matching is case-sensitive, so callers are
    expected to pass an upper-case sequence.
    """

    START_CODONS = frozenset({"ATG"})
    STOP_CODONS = frozenset({"TAA", "TAG", "TGA"})
    # Inputs shorter than this are not analysed at all.
    MIN_SEQUENCE_LENGTH = 100
    # A start/stop pair must be further apart than this to be reported.
    MIN_GENE_LENGTH = 150

    def __init__(self):
        self.name = "Simple Gene Predictor"

    def predict(self, sequence):
        """Label every codon-sized window of *sequence*.

        Args:
            sequence: DNA string.

        Returns:
            A ``(predictions, probabilities, confidence)`` tuple. Both lists
            have ``len(sequence) - 2`` entries, one per window starting at
            that index. Sequences shorter than ``MIN_SEQUENCE_LENGTH`` yield
            ``([], [], 0.1)``.
        """
        if len(sequence) < self.MIN_SEQUENCE_LENGTH:
            return [], [], 0.1

        predictions = []
        probabilities = []
        for i in range(len(sequence) - 2):
            codon = sequence[i:i + 3]
            if codon in self.START_CODONS:
                label, probability = 1, 0.8  # Start
            elif codon in self.STOP_CODONS:
                label, probability = 2, 0.7  # Stop
            else:
                label, probability = 0, 0.3  # Non-coding
            predictions.append(label)
            probabilities.append(probability)

        # The heuristic has no real score; a fixed confidence is reported.
        return predictions, probabilities, 0.6

    def extract_gene_regions(self, predictions, sequence):
        """Assemble open reading frames from the labels made by ``predict``.

        A region runs from a start label to the next stop label in the SAME
        reading frame (index modulo 3). Stops in another frame do not
        terminate, or discard, an open start: translation would read straight
        through them. Each of the three frames is tracked independently.

        Args:
            predictions: Per-index labels as returned by ``predict``.
            sequence: The sequence the labels were computed from.

        Returns:
            A list of dicts with ``start``, ``end`` (exclusive, includes the
            stop codon), ``sequence`` and ``confidence`` keys, ordered by the
            position of their stop codon.
        """
        regions = []
        # Index of the currently open start codon in each reading frame.
        open_starts = [None, None, None]

        for i, pred in enumerate(predictions):
            frame = i % 3
            start_pos = open_starts[frame]
            if pred == 1 and start_pos is None:
                open_starts[frame] = i
            elif pred == 2 and start_pos is not None:
                if i - start_pos > self.MIN_GENE_LENGTH:
                    regions.append({
                        'start': start_pos,
                        'end': i + 3,
                        'sequence': sequence[start_pos:i + 3],
                        'confidence': 0.6,
                    })
                # The frame is closed whether or not the ORF was long enough.
                open_starts[frame] = None

        return regions
81
+
82
+ # Try to load models with fallbacks
83
+ try:
84
+ from huggingface_hub import hf_hub_download
85
 
86
+ model_repo = "GGproject10/best_boundary_aware_model"
87
+ hf_token = os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
88
 
89
+ # Try to load boundary model
90
+ try:
91
+ boundary_path = hf_hub_download(
92
+ repo_id=model_repo,
93
+ filename="best_boundary_aware_model.pth",
94
+ token=hf_token
95
+ )
96
+ # Since we don't have the actual predictor class, use simple predictor
97
+ boundary_model = SimpleGenePredictor()
98
+ logging.info("Using simple boundary model (fallback)")
99
+ except Exception as e:
100
+ logging.warning(f"Could not load HF model: {e}")
101
+ boundary_model = SimpleGenePredictor()
102
+ logging.info("Using simple boundary model (fallback)")
103
+
104
+ # Try to load Keras model
105
+ try:
106
+ from tensorflow.keras.models import load_model
107
+ keras_path = hf_hub_download(
108
+ repo_id=model_repo,
109
+ filename="best_model.keras",
110
+ token=hf_token
111
+ )
112
+ kmer_path = hf_hub_download(
113
+ repo_id=model_repo,
114
+ filename="kmer_to_index.pkl",
115
+ token=hf_token
116
+ )
117
+
118
+ if os.path.exists(keras_path) and os.path.exists(kmer_path):
119
+ keras_model = load_model(keras_path)
120
+ with open(kmer_path, "rb") as f:
121
+ kmer_to_index = pickle.load(f)
122
+ logging.info("Keras model loaded successfully")
123
+ else:
124
+ logging.warning("Keras model files not found")
125
+ except Exception as e:
126
+ logging.warning(f"Could not load Keras model: {e}")
127
 
128
+ except ImportError:
129
+ logging.warning("huggingface_hub not available, using fallback models")
130
+ boundary_model = SimpleGenePredictor()
131
 
132
+ # Load CSV data if available
 
133
  try:
134
+ csv_path = "f cleaned.csv"
135
  if os.path.exists(csv_path):
136
+ csv_data = pd.read_csv(csv_path)
137
+ logging.info(f"Loaded CSV data with {len(csv_data)} rows")
 
 
 
 
 
 
 
 
 
138
  else:
139
+ logging.warning(f"CSV file not found: {csv_path}")
 
140
  except Exception as e:
141
+ logging.warning(f"Could not load CSV data: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  # --- Tool Detection ---
144
def check_tools():
    """Locate the MAFFT and IQ-TREE executables.

    For each tool the names on ``PATH`` are tried first, then the bundled
    location from ``MAFFT_PATH`` / ``IQTREE_PATH``. A candidate only counts
    when ``shutil.which`` accepts it, i.e. it is an executable file; a
    directory or a non-executable file at the bundled path is rejected.

    Returns:
        ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)`` where
        each ``*_cmd`` is the candidate string that resolved, or ``None``.
    """
    def _first_runnable(candidates):
        # Return the first candidate that resolves to an executable, if any.
        for candidate in candidates:
            if shutil.which(candidate):
                return candidate
        return None

    mafft_cmd = _first_runnable(('mafft', MAFFT_PATH))
    iqtree_cmd = _first_runnable(('iqtree2', 'iqtree', IQTREE_PATH))

    # Availability is derived from the resolved command so the two can never
    # disagree.
    return mafft_cmd is not None, iqtree_cmd is not None, mafft_cmd, iqtree_cmd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
+ # --- Prediction Functions ---
155
def predict_gene_sequence(sequence):
    """Analyse a DNA sequence and return a plain-text report.

    The input is upper-cased and every character other than A, T, C, G is
    removed before analysis. The report contains, in order: a header, the
    boundary-model section, the Keras-model section and base-composition
    statistics. A model that is not loaded, or that raises, is reported
    inside its own section instead of aborting the whole report.

    Args:
        sequence: Raw sequence text; may contain whitespace or other symbols.

    Returns:
        The report, or a one-line message when the input is empty, shorter
        than 10 nucleotides after cleaning, or processing fails unexpectedly.
    """
    try:
        if not sequence or not sequence.strip():
            return "Please provide a DNA sequence."

        # Clean sequence
        sequence = re.sub(r'[^ATCG]', '', sequence.upper())

        if len(sequence) < 10:
            return "Sequence too short. Please provide at least 10 nucleotides."

        results = [
            "🧬 GENE SEQUENCE ANALYSIS",
            f"Input sequence length: {len(sequence)} bp",
            "=" * 50,
        ]

        # Boundary model prediction
        if boundary_model:
            results.append("\n🎯 BOUNDARY DETECTION:")
            try:
                predictions, probabilities, confidence = boundary_model.predict(sequence)
                regions = boundary_model.extract_gene_regions(predictions, sequence)

                results.append(f"- Overall Confidence: {confidence:.4f}")
                results.append(f"- Regions Detected: {len(regions) if regions else 0}")

                if regions:
                    # Only the first three regions are listed.
                    for i, region in enumerate(regions[:3]):
                        results.append(f"\nRegion {i+1}:")
                        results.append(f" - Start: {region['start']}")
                        results.append(f" - End: {region['end']}")
                        results.append(f" - Length: {len(region['sequence'])} bp")
                        results.append(f" - Confidence: {region.get('confidence', 0):.4f}")

            except Exception as e:
                results.append(f"❌ Boundary prediction failed: {str(e)}")
        else:
            results.append("\n❌ Boundary model not available")

        # Keras model prediction
        if keras_model and kmer_to_index:
            results.append("\n🔍 KERAS MODEL ANALYSIS:")
            try:
                # The sequence has at least 10 bases here, so there is always
                # at least one 6-mer. K-mers missing from the index map to 0.
                kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
                indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]

                # Prepare input
                input_arr = np.array([indices])
                prediction = keras_model.predict(input_arr, verbose=0)[0]

                results.append(f"- Mean Score: {np.mean(prediction):.4f}")
                results.append(f"- Max Score: {np.max(prediction):.4f}")
                results.append(f"- Min Score: {np.min(prediction):.4f}")
                results.append(f"- Total K-mers: {len(kmers)}")

            except Exception as e:
                results.append(f"❌ Keras prediction failed: {str(e)}")
        else:
            results.append("\n❌ Keras model not available")

        # Simple sequence analysis: count each base once and reuse the counts.
        results.append("\n📊 SEQUENCE STATISTICS:")
        total = len(sequence)
        counts = {base: sequence.count(base) for base in "ATGC"}
        gc_content = (counts['G'] + counts['C']) / total * 100
        results.append(f"- GC Content: {gc_content:.2f}%")
        for base in "ATGC":
            results.append(f"- {base}: {counts[base]} ({counts[base]/total*100:.1f}%)")

        return "\n".join(results)

    except Exception as e:
        logging.error(f"Gene prediction error: {e}")
        return f"Gene prediction failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
 
 
 
 
 
239
def process_fasta_file(file):
    """Analyse the sequences in an uploaded FASTA file.

    Args:
        file: Either an object exposing the file path as ``.name`` or the
            path itself as a string.

    Returns:
        A plain-text report covering at most the first five records, or a
        one-line message when nothing was uploaded, no record could be
        parsed, or processing failed.

    Records are kept in file order and records sharing a header are all
    retained. A record needs both a non-empty header and at least one
    sequence line to be counted.
    """
    try:
        if file is None or file == "":
            return "Please upload a FASTA file."

        # Accept both upload objects (path in ``.name``) and bare paths.
        path = getattr(file, "name", file)

        # Parse FASTA line by line; undecodable bytes are replaced rather
        # than aborting, since only A/T/C/G survive the cleaning step anyway.
        records = []
        current_name = ""
        current_parts = []
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            for raw_line in f:
                line = raw_line.strip()
                if line.startswith('>'):
                    if current_name and current_parts:
                        records.append((current_name, "".join(current_parts)))
                    current_name = line[1:]
                    current_parts = []
                elif line:
                    current_parts.append(line.upper())

        if current_name and current_parts:
            records.append((current_name, "".join(current_parts)))

        if not records:
            return "No valid sequences found in FASTA file."

        # Process sequences
        results = [
            "📁 FASTA FILE ANALYSIS",
            f"Found {len(records)} sequences",
            "=" * 60,
        ]

        for i, (name, seq) in enumerate(records):
            if i >= 5:
                results.append(f"\n... and {len(records) - 5} more sequences")
                break

            results.append(f"\n🧬 Sequence: {name}")
            results.append(f"Length: {len(seq)} bp")

            clean_seq = re.sub(r'[^ATCG]', '', seq)
            if len(clean_seq) >= 10:
                results.append(predict_gene_sequence(clean_seq))
            else:
                results.append("❌ Sequence too short or invalid")

            results.append("-" * 40)

        return "\n".join(results)

    except Exception as e:
        logging.error(f"FASTA processing error: {e}")
        return f"FASTA processing failed: {str(e)}"
299
 
300
def build_phylogenetic_tree(sequence):
    """Report tool availability and a basic similarity scan for a sequence.

    The input is upper-cased and stripped of everything except A, T, C, G.
    The report lists whether MAFFT and IQ-TREE were found. When either is
    missing it ends with installation hints. Otherwise, if reference data is
    loaded, the query is compared position by position against the first 100
    reference sequences and matches above 70% identity are listed. No
    alignment or tree is actually computed here.

    Args:
        sequence: Raw query sequence text.

    Returns:
        The report, or a one-line message when the input is empty, shorter
        than 50 bp after cleaning, or processing fails unexpectedly.
    """
    try:
        if not sequence or not sequence.strip():
            return "Please provide a DNA sequence."

        clean_seq = re.sub(r'[^ATCG]', '', sequence.upper())

        if len(clean_seq) < 50:
            return "Sequence too short for phylogenetic analysis (minimum 50 bp)."

        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tools()

        # Report fragments carry their own newlines and are joined once.
        parts = [
            "🌳 PHYLOGENETIC TREE ANALYSIS\n",
            f"Input sequence length: {len(clean_seq)} bp\n",
            "=" * 50 + "\n\n",
            # Check tools
            "🔍 Tool availability:\n",
            f"✅ MAFFT: {mafft_cmd}\n" if mafft_available else "❌ MAFFT: Not available\n",
            f"✅ IQ-TREE: {iqtree_cmd}\n" if iqtree_available else "❌ IQ-TREE: Not available\n",
        ]

        if not mafft_available or not iqtree_available:
            parts.extend([
                "\n⚠️ External tools required for phylogenetic analysis.\n",
                "Please install MAFFT and IQ-TREE:\n",
                "- Ubuntu/Debian: sudo apt-get install mafft iqtree\n",
                "- macOS: brew install mafft iqtree\n",
                "- conda: conda install -c bioconda mafft iqtree\n",
            ])
            return "".join(parts)

        # Simple analysis if CSV data is available
        if csv_data is not None:
            parts.append("\n📊 Dataset analysis:\n")
            parts.append(f"- Available sequences: {len(csv_data)}\n")

            # Simple similarity search
            if 'sequence' in csv_data.columns:
                similarities = []

                for idx, row in csv_data.head(100).iterrows():  # Check first 100
                    ref_seq = str(row.get('sequence', ''))
                    if len(ref_seq) <= 10:
                        continue
                    ref_clean = re.sub(r'[^ATCG]', '', ref_seq.upper())
                    if not ref_clean:
                        continue
                    # Ungapped identity over the shared prefix; zip stops at
                    # the shorter of the two sequences.
                    min_len = min(len(clean_seq), len(ref_clean))
                    matches = sum(1 for a, b in zip(clean_seq, ref_clean) if a == b)
                    similarity = matches / min_len * 100
                    if similarity > 70:
                        similarities.append((idx, similarity, len(ref_clean)))

                parts.append(f"- Similar sequences found: {len(similarities)}\n")

                if similarities:
                    similarities.sort(key=lambda x: x[1], reverse=True)
                    parts.append("\nTop matches:\n")
                    for i, (idx, sim, length) in enumerate(similarities[:5]):
                        parts.append(f" {i+1}. Index {idx}: {sim:.1f}% similarity ({length} bp)\n")

        # NOTE(review): this point is only reached when both tools were found,
        # yet the closing hint asks the user to install them. Confirm whether
        # the basic analysis was meant to run when the tools are missing.
        parts.append("\n✅ Basic phylogenetic analysis completed.\n")
        parts.append("For full ML tree construction, ensure MAFFT and IQ-TREE are installed.")

        return "".join(parts)

    except Exception as e:
        logging.error(f"Phylogenetic analysis error: {e}")
        return f"Phylogenetic analysis failed: {str(e)}"
375
+
376
def get_model_status():
    """Report which models, reference data and external tools are usable.

    Reads the module-level ``boundary_model``, ``keras_model`` and
    ``csv_data`` and queries ``check_tools`` for MAFFT / IQ-TREE.

    Returns:
        A newline-separated string with one ✅/❌ line per component, in the
        order: boundary model, Keras model, reference data, MAFFT, IQ-TREE.
    """
    def availability(label, present):
        # Truthiness (not an identity check) decides availability here.
        if present:
            return f"✅ {label}: Available"
        return f"❌ {label}: Not Available"

    report = [
        availability("Boundary Model", boundary_model),
        availability("Keras Model", keras_model),
    ]

    # Reference data reports its size, so it gets its own wording.
    if csv_data is None:
        report.append("❌ Reference Data: Not Available")
    else:
        report.append(f"✅ Reference Data: {len(csv_data)} sequences")

    mafft_ok, iqtree_ok, _, _ = check_tools()
    report.append(availability("MAFFT", mafft_ok))
    report.append(availability("IQ-TREE", iqtree_ok))

    return "\n".join(report)
408
+
409
  # --- Gradio Interface ---
410
def create_interface():
    """Build and return the Gradio ``Blocks`` application (not launched).

    The layout has two tabs:

    * "Gene Analysis" - a sequence textbox and a FASTA upload, wired to
      ``predict_gene_sequence`` and ``process_fasta_file``; both write into
      the same results textbox.
    * "Phylogenetic Analysis" - a query textbox wired to
      ``build_phylogenetic_tree`` and a status button wired to
      ``get_model_status``; both write into that tab's results textbox.

    A Markdown footer with usage notes is rendered below the tabs.
    """
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        font-size: 12px;
        line-height: 1.4;
    }
    """

    def gene_analysis_tab():
        # Inputs on the left (narrow column), shared result box on the right.
        with gr.Tab("🔬 Gene Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Single Sequence Analysis")
                    seq_input = gr.Textbox(
                        label="DNA Sequence",
                        placeholder="Enter DNA sequence (A, T, C, G only)...",
                        lines=4
                    )
                    predict_btn = gr.Button("🚀 Analyze Sequence", variant="primary")

                    gr.Markdown("### File Processing")
                    file_input = gr.File(
                        label="Upload FASTA File",
                        file_types=[".fasta", ".fa", ".fas", ".txt"]
                    )
                    process_btn = gr.Button("📊 Process FASTA", variant="primary")

                with gr.Column(scale=2):
                    output_display = gr.Textbox(
                        label="Analysis Results",
                        lines=25,
                        elem_classes=["output-text"]
                    )

            predict_btn.click(
                fn=predict_gene_sequence,
                inputs=[seq_input],
                outputs=[output_display]
            )
            process_btn.click(
                fn=process_fasta_file,
                inputs=[file_input],
                outputs=[output_display]
            )

    def phylogenetic_tab():
        # The status button reuses the tree output box for its report.
        with gr.Tab("🌳 Phylogenetic Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Tree Construction")
                    tree_seq_input = gr.Textbox(
                        label="Query Sequence",
                        placeholder="Enter sequence for phylogenetic analysis...",
                        lines=4
                    )
                    tree_btn = gr.Button("🌳 Build Tree", variant="primary")

                    gr.Markdown("### Model Status")
                    status_btn = gr.Button("📊 Check Status")

                with gr.Column(scale=2):
                    tree_output = gr.Textbox(
                        label="Phylogenetic Analysis Results",
                        lines=25,
                        elem_classes=["output-text"]
                    )

            tree_btn.click(
                fn=build_phylogenetic_tree,
                inputs=[tree_seq_input],
                outputs=[tree_output]
            )
            status_btn.click(
                fn=get_model_status,
                outputs=[tree_output]
            )

    with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
        gr.Markdown("""
        # 🧬 Gene Analysis Tool

        Comprehensive gene sequence analysis with machine learning models and phylogenetic analysis.
        """)

        with gr.Tabs():
            gene_analysis_tab()
            phylogenetic_tab()

        # Information footer
        gr.Markdown("""
        ---
        ### Usage Notes:
        - **Input**: Provide DNA sequences with only A, T, C, G characters
        - **FASTA Files**: Upload files with multiple sequences for batch analysis
        - **Phylogenetic Analysis**: Requires MAFFT and IQ-TREE for full functionality
        - **Models**: Uses trained ML models for gene boundary detection and validation
        """)

    return interface
514
 
515
+ # --- Main Application ---
516
if __name__ == "__main__":
    # Make sure the output directory exists before anything tries to use it.
    os.makedirs("output", exist_ok=True)

    # Record at startup which optional resources were loaded.
    logging.info("Starting Gene Analysis Tool")
    for label, resource in (
        ("Boundary model", boundary_model),
        ("Keras model", keras_model),
        ("CSV data", csv_data),
    ):
        logging.info(f"{label} available: {resource is not None}")

    # Bind to all interfaces on port 7860, without a public share link.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    try:
        interface = create_interface()
        interface.launch(**launch_options)
    except Exception as e:
        # A failed launch is fatal: log it and exit non-zero.
        logging.error(f"Failed to launch interface: {e}")
        sys.exit(1)