re-type committed on
Commit
24230e9
·
verified ·
1 Parent(s): 20b71b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -116
app.py CHANGED
@@ -27,22 +27,22 @@ except ImportError:
27
  from huggingface_hub import hf_hub_download
28
 
29
  # --- Global Variables ---
30
- MAFFT_PATH = "mafft/mafftdir/bin/mafft" # Update this path as needed
31
- IQTREE_PATH = "iqtree/bin/iqtree2" # Update this path as needed
32
- CSV_PATH = "f_cleaned.csv" # Updated to match your naming
 
33
 
34
  # --- Logging Setup ---
35
  logging.basicConfig(
36
  level=logging.INFO,
37
  format='%(asctime)s - %(levelname)s - %(message)s',
38
  handlers=[
39
- logging.FileHandler('gene_analysis.log'),
40
  logging.StreamHandler(sys.stdout)
41
  ]
42
  )
43
 
44
- # --- Model Paths and Variables ---
45
- MODEL_REPO = "GGproject10/best_boundary_aware_model"
46
  boundary_model = None
47
  keras_model = None
48
  kmer_to_index = None
@@ -51,7 +51,7 @@ analyzer = None
51
  # --- Load Models ---
52
  def load_models():
53
  global boundary_model, keras_model, kmer_to_index
54
- hf_token = os.getenv("HF_TOKEN")
55
 
56
  # Load boundary model
57
  if GenePredictor:
@@ -59,7 +59,8 @@ def load_models():
59
  boundary_path = hf_hub_download(
60
  repo_id=MODEL_REPO,
61
  filename="best_boundary_aware_model.pth",
62
- token=hf_token
 
63
  )
64
  boundary_model = GenePredictor(boundary_path)
65
  logging.info("Boundary model loaded successfully.")
@@ -75,12 +76,14 @@ def load_models():
75
  keras_path = hf_hub_download(
76
  repo_id=MODEL_REPO,
77
  filename="best_model.keras",
78
- token=hf_token
 
79
  )
80
  kmer_path = hf_hub_download(
81
  repo_id=MODEL_REPO,
82
  filename="kmer_to_index.pkl",
83
- token=hf_token
 
84
  )
85
  keras_model = load_model(keras_path)
86
  with open(kmer_path, "rb") as f:
@@ -101,11 +104,6 @@ def init_tree_analyzer():
101
  analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
102
  if analyzer.load_data(CSV_PATH):
103
  logging.info("Tree analyzer initialized successfully.")
104
- try:
105
- if not analyzer.train_ai_model():
106
- logging.warning("AI model training failed.")
107
- except Exception as e:
108
- logging.warning(f"AI model training failed: {e}")
109
  else:
110
  logging.error("Failed to load CSV data.")
111
  analyzer = None
@@ -118,44 +116,28 @@ def init_tree_analyzer():
118
 
119
  # --- Tool Detection ---
120
  def check_tool_availability():
121
- mafft_candidates = [
122
- MAFFT_PATH, 'mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', 'mafft.bat'
123
- ]
124
- iqtree_candidates = [
125
- IQTREE_PATH, 'iqtree2', 'iqtree', '/usr/bin/iqtree2', '/usr/local/bin/iqtree2',
126
- '/usr/bin/iqtree', '/usr/local/bin/iqtree', 'iqtree2.exe', 'iqtree.exe'
127
- ]
128
-
129
- mafft_cmd = next((cmd for cmd in mafft_candidates if cmd and (os.path.exists(cmd) or shutil.which(cmd))), None)
130
- iqtree_cmd = next((cmd for cmd in iqtree_candidates if cmd and (os.path.exists(cmd) or shutil.which(cmd))), None)
131
-
132
  return bool(mafft_cmd), bool(iqtree_cmd), mafft_cmd, iqtree_cmd
133
 
134
  # --- Installation Guide ---
135
  def install_dependencies_guide():
136
  return """
137
- 🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
138
 
139
- For MAFFT:
140
- - Ubuntu/Debian: sudo apt-get install mafft
141
- - CentOS/RHEL: sudo yum install mafft
142
- - macOS: brew install mafft
143
- - Windows: Download from https://mafft.cbrc.jp/alignment/software/
144
-
145
- For IQ-TREE:
146
- - Ubuntu/Debian: sudo apt-get install iqtree
147
- - CentOS/RHEL: sudo yum install iqtree
148
- - macOS: brew install iqtree
149
- - Windows: Download from http://www.iqtree.org/
150
-
151
- Conda: conda install -c bioconda mafft iqtree
152
  """
153
 
154
  # --- MAFFT and IQ-TREE Functions ---
155
  def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
156
  try:
157
  cmd = [mafft_cmd, '--auto', '--quiet', input_fasta]
158
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
159
  if result.returncode == 0:
160
  with open(output_fasta, 'w') as f:
161
  f.write(result.stdout)
@@ -171,10 +153,10 @@ def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
171
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
172
  try:
173
  cmd = [
174
- iqtree_cmd, '-s', aligned_fasta, '-m', 'MFP', '-bb', '1000',
175
- '-alrt', '1000', '-nt', 'AUTO', '--prefix', output_prefix, '--quiet'
176
  ]
177
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200)
178
  tree_file = f"{output_prefix}.treefile"
179
  if result.returncode == 0 and os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
180
  logging.info(f"IQ-TREE completed: {tree_file}")
@@ -184,26 +166,12 @@ def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
184
  logging.error(f"IQ-TREE failed: {e}")
185
  return False, f"IQ-TREE failed: {str(e)}"
186
 
187
- # --- Fallback Tree Construction ---
188
- def create_simple_tree(sequences_dict):
189
- try:
190
- seq_names = list(sequences_dict.keys())
191
- if len(seq_names) < 2:
192
- return None, "Need at least 2 sequences."
193
- tree_str = f"({','.join([f'{name}:0.1' for name in seq_names[:5]])});"
194
- tree_file = "simple_tree.nwk"
195
- with open(tree_file, 'w') as f:
196
- f.write(tree_str)
197
- return tree_file, "Simple tree created."
198
- except Exception as e:
199
- return None, f"Simple tree creation failed: {str(e)}"
200
-
201
  # --- Create Multi-FASTA ---
202
  def create_multi_fasta(query_sequence, query_id="Query_F_Gene"):
203
  try:
204
- temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
205
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
206
- ref_fasta_path = "f_gene_sequences_aligned.fasta"
207
  if os.path.exists(ref_fasta_path):
208
  with open(ref_fasta_path, 'r') as ref_file:
209
  temp_fasta.write(ref_file.read())
@@ -213,7 +181,7 @@ def create_multi_fasta(query_sequence, query_id="Query_F_Gene"):
213
  if 'sequence' in row and len(str(row['sequence'])) > 50:
214
  temp_fasta.write(f">{row.get('id', f'Ref_{count}')}\n{str(row['sequence']).upper()}\n")
215
  count += 1
216
- if count >= 20:
217
  break
218
  temp_fasta.close()
219
  return temp_fasta.name
@@ -237,26 +205,26 @@ def build_maximum_likelihood_tree(sequence):
237
  guide = install_dependencies_guide()
238
  return False, f"{status_msg}\n❌ Missing tools:\n{guide}", None, None
239
 
240
- os.makedirs("ml_tree_output", exist_ok=True)
241
  multi_fasta = create_multi_fasta(sequence)
242
  if not multi_fasta:
243
  return False, f"{status_msg}\n❌ Failed to create input FASTA.", None, None
244
 
245
- aligned_fasta = "ml_tree_output/aligned_sequences.fasta"
246
  mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
247
  os.unlink(multi_fasta)
248
 
249
  if not mafft_success:
250
  return False, f"{status_msg}\n❌ MAFFT failed: {mafft_result}", None, None
251
 
252
- tree_prefix = "ml_tree_output/ml_tree"
253
  iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
254
  if not iqtree_success:
255
  return False, f"{status_msg}\n❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
256
 
257
  tree_file = iqtree_result
258
- shutil.copy2(aligned_fasta, "f_gene_sequences_aligned.fasta")
259
- shutil.copy2(tree_file, "f_gene_sequences.phy.treefile")
260
 
261
  success_msg = f"{status_msg}\n✅ ML tree built:\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}"
262
  return True, success_msg, aligned_fasta, tree_file
@@ -352,6 +320,49 @@ def build_tree(sequence):
352
  success, message, aligned_fasta, tree_file = build_maximum_likelihood_tree(sequence)
353
  return format_results({"message": message, "tree_file": tree_file}, sequence, "tree")
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  # --- Gradio Interface ---
356
  def create_gradio_interface():
357
  css = """
@@ -385,8 +396,8 @@ def create_gradio_interface():
385
  with gr.Column(scale=2):
386
  output = gr.Textbox(
387
  label="Results",
388
- lines=20,
389
- max_lines=30,
390
  elem_classes=["output-text"]
391
  )
392
 
@@ -408,53 +419,11 @@ def create_gradio_interface():
408
 
409
  return interface
410
 
411
- # --- File Processing ---
412
- def process_fasta_file(file):
413
- try:
414
- if not file:
415
- return "Please upload a FASTA file."
416
-
417
- sequences = {}
418
- current_seq = ""
419
- current_name = ""
420
- with open(file.name, 'r') as f:
421
- for line in f:
422
- line = line.strip()
423
- if line.startswith('>'):
424
- if current_name and current_seq:
425
- sequences[current_name] = current_seq
426
- current_name = line[1:]
427
- current_seq = ""
428
- else:
429
- current_seq += line.upper()
430
- if current_name and current_seq:
431
- sequences[current_name] = current_seq
432
-
433
- if not sequences:
434
- return "No valid sequences in FASTA file."
435
-
436
- results = [f"📁 FASTA FILE ANALYSIS\nFound {len(sequences)} sequences\n{'=' * 50}"]
437
- for i, (name, seq) in enumerate(sequences.items()):
438
- if i >= 5:
439
- results.append(f"\n... and {len(sequences) - 5} more sequences")
440
- break
441
- results.append(f"\n🧬 Sequence: {name}\nLength: {len(seq)} bp")
442
- clean_seq = re.sub(r'[^ATCG]', '', seq)
443
- if len(clean_seq) >= 10:
444
- results.append(analyze_sequence(clean_seq))
445
- else:
446
- results.append("❌ Sequence too short or invalid")
447
- results.append("-" * 40)
448
-
449
- return "\n".join(results)
450
- except Exception as e:
451
- logging.error(f"FASTA processing failed: {e}")
452
- return f"FASTA processing failed: {str(e)}"
453
-
454
  # --- Main ---
455
  if __name__ == "__main__":
456
- os.makedirs("output", exist_ok=True)
457
- os.makedirs("ml_tree_output", exist_ok=True)
 
458
 
459
  load_models()
460
  init_tree_analyzer()
@@ -467,11 +436,9 @@ if __name__ == "__main__":
467
  try:
468
  interface = create_gradio_interface()
469
  interface.launch(
470
- share=False,
471
  server_name="0.0.0.0",
472
  server_port=7860,
473
- show_error=True,
474
- debug=True
475
  )
476
  except Exception as e:
477
  logging.error(f"Interface launch failed: {e}")
 
27
  from huggingface_hub import hf_hub_download
28
 
29
  # --- Global Variables ---
30
+ MAFFT_PATH = "/usr/bin/mafft" # Common path in Hugging Face Spaces
31
+ IQTREE_PATH = "/usr/bin/iqtree2" # Common path in Hugging Face Spaces
32
+ CSV_PATH = "/data/f_cleaned.csv" # Persistent storage in Hugging Face
33
+ MODEL_REPO = "GGproject10/best_boundary_aware_model"
34
 
35
  # --- Logging Setup ---
36
  logging.basicConfig(
37
  level=logging.INFO,
38
  format='%(asctime)s - %(levelname)s - %(message)s',
39
  handlers=[
40
+ logging.FileHandler('/data/gene_analysis.log'),
41
  logging.StreamHandler(sys.stdout)
42
  ]
43
  )
44
 
45
+ # --- Model Variables ---
 
46
  boundary_model = None
47
  keras_model = None
48
  kmer_to_index = None
 
51
  # --- Load Models ---
52
  def load_models():
53
  global boundary_model, keras_model, kmer_to_index
54
+ hf_token = os.getenv("HF_TOKEN", None)
55
 
56
  # Load boundary model
57
  if GenePredictor:
 
59
  boundary_path = hf_hub_download(
60
  repo_id=MODEL_REPO,
61
  filename="best_boundary_aware_model.pth",
62
+ token=hf_token,
63
+ cache_dir="/data/models"
64
  )
65
  boundary_model = GenePredictor(boundary_path)
66
  logging.info("Boundary model loaded successfully.")
 
76
  keras_path = hf_hub_download(
77
  repo_id=MODEL_REPO,
78
  filename="best_model.keras",
79
+ token=hf_token,
80
+ cache_dir="/data/models"
81
  )
82
  kmer_path = hf_hub_download(
83
  repo_id=MODEL_REPO,
84
  filename="kmer_to_index.pkl",
85
+ token=hf_token,
86
+ cache_dir="/data/models"
87
  )
88
  keras_model = load_model(keras_path)
89
  with open(kmer_path, "rb") as f:
 
104
  analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
105
  if analyzer.load_data(CSV_PATH):
106
  logging.info("Tree analyzer initialized successfully.")
 
 
 
 
 
107
  else:
108
  logging.error("Failed to load CSV data.")
109
  analyzer = None
 
116
 
117
  # --- Tool Detection ---
118
  def check_tool_availability():
119
+ mafft_cmd = shutil.which(MAFFT_PATH) or shutil.which("mafft")
120
+ iqtree_cmd = shutil.which(IQTREE_PATH) or shutil.which("iqtree2")
 
 
 
 
 
 
 
 
 
121
  return bool(mafft_cmd), bool(iqtree_cmd), mafft_cmd, iqtree_cmd
122
 
123
  # --- Installation Guide ---
124
  def install_dependencies_guide():
125
  return """
126
+ 🔧 DEPENDENCY SETUP FOR HUGGING FACE SPACES:
127
 
128
+ 1. Add to requirements.txt:
129
+ - mafft
130
+ - iqtree
131
+ 2. Place f_cleaned.csv in the repository root.
132
+ 3. Ensure HF_TOKEN is set in Space secrets for model downloads.
133
+ 4. If dependencies fail, contact Hugging Face support or use a custom Docker image.
 
 
 
 
 
 
 
134
  """
135
 
136
  # --- MAFFT and IQ-TREE Functions ---
137
  def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
138
  try:
139
  cmd = [mafft_cmd, '--auto', '--quiet', input_fasta]
140
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) # Reduced timeout for HF
141
  if result.returncode == 0:
142
  with open(output_fasta, 'w') as f:
143
  f.write(result.stdout)
 
153
  def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
154
  try:
155
  cmd = [
156
+ iqtree_cmd, '-s', aligned_fasta, '-m', 'GTR', '-nt', '1', # Simplified for HF resources
157
+ '--prefix', output_prefix, '--quiet'
158
  ]
159
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=600) # Reduced timeout
160
  tree_file = f"{output_prefix}.treefile"
161
  if result.returncode == 0 and os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
162
  logging.info(f"IQ-TREE completed: {tree_file}")
 
166
  logging.error(f"IQ-TREE failed: {e}")
167
  return False, f"IQ-TREE failed: {str(e)}"
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  # --- Create Multi-FASTA ---
170
  def create_multi_fasta(query_sequence, query_id="Query_F_Gene"):
171
  try:
172
+ temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False, dir="/data")
173
  temp_fasta.write(f">{query_id}\n{query_sequence}\n")
174
+ ref_fasta_path = "/data/f_gene_sequences_aligned.fasta"
175
  if os.path.exists(ref_fasta_path):
176
  with open(ref_fasta_path, 'r') as ref_file:
177
  temp_fasta.write(ref_file.read())
 
181
  if 'sequence' in row and len(str(row['sequence'])) > 50:
182
  temp_fasta.write(f">{row.get('id', f'Ref_{count}')}\n{str(row['sequence']).upper()}\n")
183
  count += 1
184
+ if count >= 10: # Reduced for HF
185
  break
186
  temp_fasta.close()
187
  return temp_fasta.name
 
205
  guide = install_dependencies_guide()
206
  return False, f"{status_msg}\n❌ Missing tools:\n{guide}", None, None
207
 
208
+ os.makedirs("/data/ml_tree_output", exist_ok=True)
209
  multi_fasta = create_multi_fasta(sequence)
210
  if not multi_fasta:
211
  return False, f"{status_msg}\n❌ Failed to create input FASTA.", None, None
212
 
213
+ aligned_fasta = "/data/ml_tree_output/aligned_sequences.fasta"
214
  mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
215
  os.unlink(multi_fasta)
216
 
217
  if not mafft_success:
218
  return False, f"{status_msg}\n❌ MAFFT failed: {mafft_result}", None, None
219
 
220
+ tree_prefix = "/data/ml_tree_output/ml_tree"
221
  iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
222
  if not iqtree_success:
223
  return False, f"{status_msg}\n❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
224
 
225
  tree_file = iqtree_result
226
+ shutil.copy2(aligned_fasta, "/data/f_gene_sequences_aligned.fasta")
227
+ shutil.copy2(tree_file, "/data/f_gene_sequences.phy.treefile")
228
 
229
  success_msg = f"{status_msg}\n✅ ML tree built:\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}"
230
  return True, success_msg, aligned_fasta, tree_file
 
320
  success, message, aligned_fasta, tree_file = build_maximum_likelihood_tree(sequence)
321
  return format_results({"message": message, "tree_file": tree_file}, sequence, "tree")
322
 
323
+ # --- File Processing ---
324
+ def process_fasta_file(file):
325
+ try:
326
+ if not file:
327
+ return "Please upload a FASTA file."
328
+
329
+ sequences = {}
330
+ current_seq = ""
331
+ current_name = ""
332
+ with open(file.name, 'r') as f:
333
+ for line in f:
334
+ line = line.strip()
335
+ if line.startswith('>'):
336
+ if current_name and current_seq:
337
+ sequences[current_name] = current_seq
338
+ current_name = line[1:]
339
+ current_seq = ""
340
+ else:
341
+ current_seq += line.upper()
342
+ if current_name and current_seq:
343
+ sequences[current_name] = current_seq
344
+
345
+ if not sequences:
346
+ return "No valid sequences in FASTA file."
347
+
348
+ results = [f"📁 FASTA FILE ANALYSIS\nFound {len(sequences)} sequences\n{'=' * 50}"]
349
+ for i, (name, seq) in enumerate(sequences.items()):
350
+ if i >= 3: # Reduced for HF
351
+ results.append(f"\n... and {len(sequences) - 3} more sequences")
352
+ break
353
+ results.append(f"\n🧬 Sequence: {name}\nLength: {len(seq)} bp")
354
+ clean_seq = re.sub(r'[^ATCG]', '', seq)
355
+ if len(clean_seq) >= 10:
356
+ results.append(analyze_sequence(clean_seq))
357
+ else:
358
+ results.append("❌ Sequence too short or invalid")
359
+ results.append("-" * 40)
360
+
361
+ return "\n".join(results)
362
+ except Exception as e:
363
+ logging.error(f"FASTA processing failed: {e}")
364
+ return f"FASTA processing failed: {str(e)}"
365
+
366
  # --- Gradio Interface ---
367
  def create_gradio_interface():
368
  css = """
 
396
  with gr.Column(scale=2):
397
  output = gr.Textbox(
398
  label="Results",
399
+ lines=15,
400
+ max_lines=20,
401
  elem_classes=["output-text"]
402
  )
403
 
 
419
 
420
  return interface
421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  # --- Main ---
423
  if __name__ == "__main__":
424
+ os.makedirs("/data", exist_ok=True)
425
+ os.makedirs("/data/ml_tree_output", exist_ok=True)
426
+ os.makedirs("/data/models", exist_ok=True)
427
 
428
  load_models()
429
  init_tree_analyzer()
 
436
  try:
437
  interface = create_gradio_interface()
438
  interface.launch(
 
439
  server_name="0.0.0.0",
440
  server_port=7860,
441
+ share=False # Managed by Hugging Face
 
442
  )
443
  except Exception as e:
444
  logging.error(f"Interface launch failed: {e}")