re-type committed on
Commit
e52106a
·
verified ·
1 Parent(s): b17b3c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +359 -310
app.py CHANGED
@@ -20,10 +20,11 @@ from huggingface_hub import hf_hub_download
20
  from Bio import SeqIO
21
  from Bio.Seq import Seq
22
  from Bio.SeqRecord import SeqRecord
 
23
 
24
  # --- Global Variables ---
25
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
26
- MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft.bat") # Windows path
27
  IQTREE_PATH = os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
28
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
29
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
@@ -127,24 +128,41 @@ except Exception as e:
127
  logging.error(f"Failed to initialize tree analyzer: {e}")
128
  analyzer = None
129
 
130
- # --- Enhanced Tool Detection ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  def check_tool_availability():
132
  """Enhanced check for MAFFT and IQ-TREE availability with improved path validation"""
133
 
 
 
 
134
  # Check MAFFT
135
  mafft_available = False
136
  mafft_cmd = None
137
 
138
- # Try multiple MAFFT locations with actual execution test
139
  mafft_candidates = [
140
- # Your specific path from the error message
141
- os.path.join(BASE_DIR, "mafft", "mafftdir", "bin", "mafft"),
142
- # Other standard locations
143
- MAFFT_PATH,
144
  'mafft',
145
  '/usr/bin/mafft',
146
  '/usr/local/bin/mafft',
147
- 'mafft.bat', # Windows
148
  os.path.join(BASE_DIR, "binaries", "mafft", "mafftdir", "bin", "mafft"),
149
  # Add potential conda/miniconda paths
150
  os.path.expanduser("~/anaconda3/bin/mafft"),
@@ -181,9 +199,9 @@ def check_tool_availability():
181
  iqtree_available = False
182
  iqtree_cmd = None
183
 
184
- # Try multiple IQ-TREE locations and names with execution test
185
  iqtree_candidates = [
186
- IQTREE_PATH,
187
  'iqtree2',
188
  'iqtree',
189
  'iqtree3',
@@ -194,7 +212,7 @@ def check_tool_availability():
194
  'iqtree2.exe', # Windows
195
  'iqtree.exe', # Windows
196
  'iqtree3.exe', # Windows
197
- os.path.join(BASE_DIR, "iqtree", "bin", "iqtree2"),
198
  # Add potential conda paths
199
  os.path.expanduser("~/anaconda3/bin/iqtree2"),
200
  os.path.expanduser("~/miniconda3/bin/iqtree2"),
@@ -257,12 +275,13 @@ If tools are installed but not detected, try:
257
  1. Add installation directory to PATH
258
  2. Use absolute paths in the configuration
259
  3. Check permissions on executable files
 
260
  """
261
  return guide
262
 
263
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
264
  """
265
- Perform phylogenetic placement using MAFFT + IQ-TREE approach.
266
  This adds the query sequence to a reference alignment and tree.
267
  """
268
  try:
@@ -283,35 +302,21 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
283
  if not os.path.exists(TREE_PATH):
284
  return False, f"Reference tree not found: {TREE_PATH}", None, None
285
 
286
- # Save query sequence as FASTA
287
  try:
288
- query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="Query sequence")
289
  SeqIO.write([query_record], query_fasta, "fasta")
290
  logging.info(f"Query sequence saved: {query_fasta}")
291
  except Exception as e:
292
- return False, f"Failed to save query sequence: {str(e)}", None, None
293
 
294
- # Step 1: Add query sequence to reference alignment using MAFFT
295
  logging.info("Adding query sequence to reference alignment...")
296
  try:
297
  with open(aligned_with_query, "w") as output_file:
298
- mafft_cmd_full = [
299
- mafft_cmd,
300
- "--add", query_fasta,
301
- "--reorder",
302
- ALIGNMENT_PATH
303
- ]
304
-
305
- logging.info(f"Running MAFFT: {' '.join(mafft_cmd_full)}")
306
-
307
- result = subprocess.run(
308
- mafft_cmd_full,
309
- stdout=output_file,
310
- stderr=subprocess.PIPE,
311
- text=True,
312
- timeout=600, # 10 minute timeout
313
- check=True
314
- )
315
 
316
  # Verify alignment file was created and is not empty
317
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
@@ -320,37 +325,22 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
320
  logging.info(f"MAFFT alignment completed: {aligned_with_query}")
321
 
322
  except subprocess.CalledProcessError as e:
323
- error_msg = e.stderr.decode() if e.stderr else "Unknown MAFFT error"
324
  return False, f"MAFFT alignment failed: {error_msg}", None, None
325
  except subprocess.TimeoutExpired:
326
  return False, "MAFFT alignment timeout (>10 minutes)", None, None
327
- except FileNotFoundError as e:
328
- return False, f"MAFFT executable not found: {mafft_cmd}. Please check installation.", None, None
329
  except Exception as e:
330
- return False, f"MAFFT alignment error: {str(e)}", None, None
331
 
332
- # Step 2: Place sequence in phylogenetic tree using IQ-TREE
333
  logging.info("Placing sequence in phylogenetic tree...")
334
  try:
335
- iqtree_cmd_full = [
336
- iqtree_cmd,
337
- "-s", aligned_with_query,
338
- "-g", TREE_PATH, # Constraint tree (reference tree)
339
- "-m", "GTR+G", # Substitution model
340
- "-pre", output_prefix,
341
- "-redo", # Overwrite existing files
342
- "--quiet" # Reduce verbosity
343
- ]
344
-
345
- logging.info(f"Running IQ-TREE: {' '.join(iqtree_cmd_full)}")
346
-
347
- result = subprocess.run(
348
- iqtree_cmd_full,
349
- capture_output=True,
350
- text=True,
351
- timeout=1200, # 20 minute timeout
352
- check=True
353
- )
354
 
355
  # Check if treefile was generated
356
  treefile = f"{output_prefix}.treefile"
@@ -385,10 +375,10 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
385
  return False, f"IQ-TREE placement failed: {error_msg}", aligned_with_query, None
386
  except subprocess.TimeoutExpired:
387
  return False, "IQ-TREE placement timeout (>20 minutes)", aligned_with_query, None
388
- except FileNotFoundError as e:
389
- return False, f"IQ-TREE executable not found: {iqtree_cmd}. Please check installation.", aligned_with_query, None
390
  except Exception as e:
391
- return False, f"IQ-TREE placement error: {str(e)}", aligned_with_query, None
392
 
393
  except Exception as e:
394
  logging.error(f"Phylogenetic placement failed: {e}")
@@ -403,8 +393,7 @@ def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
403
 
404
  def build_maximum_likelihood_tree(f_gene_sequence):
405
  """
406
- Build maximum likelihood phylogenetic tree using phylogenetic placement approach.
407
- This replaces the previous de novo tree building with placement-based analysis.
408
  """
409
  try:
410
  # Check tool availability with enhanced detection
@@ -444,7 +433,7 @@ def build_maximum_likelihood_tree(f_gene_sequence):
444
  status_msg += "Please ensure f_gene_sequences_aligned.fasta and f_gene_sequences.phy.treefile are available."
445
  return False, status_msg, None, None
446
 
447
- # Perform phylogenetic placement
448
  logging.info("Starting phylogenetic placement...")
449
  placement_success, placement_message, aligned_file, tree_file = phylogenetic_placement(
450
  f_gene_sequence, mafft_cmd, iqtree_cmd
@@ -640,7 +629,7 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
640
  else:
641
  keras_output = "Skipped: sequence too short for F gene validation"
642
 
643
- # Step 3: Maximum Likelihood Tree (Phylogenetic Placement)
644
  aligned_file = None
645
  phy_file = None
646
  ml_tree_output = ""
@@ -665,7 +654,7 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
665
  else:
666
  ml_tree_output = "Phylogenetic placement skipped (not requested)"
667
 
668
- # Step 4: NEW Simplified Tree Analysis (using the new analyzer API)
669
  html_file = None
670
  tree_html_content = "No tree generated"
671
  simplified_ml_output = ""
@@ -683,304 +672,364 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
683
  os.makedirs(output_dir, exist_ok=True)
684
 
685
  # Create a safe filename
686
- safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', processed_sequence[:20])
687
- html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{uuid.uuid4().hex[:8]}.html")
 
 
688
 
689
  # Copy the HTML file
690
- shutil.copy2(html_path, html_filename)
691
- html_file = html_filename
692
 
693
  # Read HTML content for display
694
- with open(html_filename, 'r', encoding='utf-8') as f:
695
  tree_html_content = f.read()
696
 
697
  simplified_ml_output = tree_result
698
- logging.info(f"Tree analysis completed successfully. HTML saved to: {html_filename}")
 
 
 
 
 
 
 
699
  else:
700
- simplified_ml_output = tree_result or "❌ Failed to generate tree visualization"
701
- logging.warning("Tree analysis failed or no HTML output generated")
702
 
703
  except Exception as e:
704
- simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
705
- logging.error(f"Simplified ML tree analysis failed: {e}")
706
- elif not analyzer:
707
- simplified_ml_output = "Tree analyzer not available (CSV data not loaded)"
708
- elif len(processed_sequence) < 10:
709
- simplified_ml_output = "❌ Sequence too short for tree analysis (minimum 10 bp)"
710
  else:
711
- simplified_ml_output = "Tree analysis skipped"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
712
 
713
- # Prepare final outputs
714
- final_boundary_output = f"F gene sequence ({len(processed_sequence)} bp):\n{boundary_output}"
715
- final_keras_output = f"F gene validation: {keras_output}"
716
-
717
- logging.info("Pipeline completed successfully")
718
-
719
  return (
720
- final_boundary_output, # Boundary prediction output
721
- final_keras_output, # Keras prediction output
722
- ml_tree_output, # ML tree output
723
- simplified_ml_output, # Simplified tree output
724
- processed_sequence, # The extracted F gene sequence
725
- aligned_file, # Alignment file path
726
- phy_file, # Tree file path
727
- html_file, # HTML tree file path
728
- tree_html_content # HTML content for display
729
  )
730
 
731
  except Exception as e:
732
  error_msg = f"Pipeline error: {str(e)}"
733
  logging.error(error_msg)
734
- return error_msg, "", "", "", "", None, None, None, "Pipeline failed"
 
 
 
 
 
735
 
736
  # --- Gradio Interface ---
737
- def create_gradio_interface():
738
  """Create and configure the Gradio interface"""
739
 
740
- with gr.Blocks(
741
- title="F Gene Analysis Pipeline",
742
- theme=gr.themes.Soft(),
743
- css="""
744
- .gradio-container {
745
- max-width: 1200px !important;
746
- }
747
- .output-html {
748
- height: 600px;
749
- overflow: auto;
750
- }
751
- """
752
- ) as demo:
753
-
754
- gr.Markdown("""
755
- # 🧬 F Gene Analysis Pipeline
756
-
757
- This tool performs comprehensive analysis of F gene sequences including:
758
- - **Gene Boundary Prediction**: Extract F gene regions from input sequences
759
- - **F Gene Validation**: Validate sequences using ML models
760
- - **Phylogenetic Analysis**: Build phylogenetic trees and perform sequence placement
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  """)
762
 
763
- with gr.Tab("Sequence Analysis"):
764
- with gr.Row():
765
- with gr.Column(scale=1):
766
- gr.Markdown("### Input")
767
-
768
- # Input options
769
- input_method = gr.Radio(
770
- choices=["Text Input", "File Upload"],
771
- value="Text Input",
772
- label="Input Method"
773
- )
774
-
775
- # Text input
776
- dna_sequence = gr.Textbox(
777
- label="DNA Sequence",
778
- placeholder="Enter your DNA sequence here (FASTA format or raw sequence)",
779
- lines=8,
780
- visible=True
781
- )
782
-
783
- # File upload
784
- fasta_file = gr.File(
785
- label="Upload FASTA File",
786
- file_types=[".fasta", ".fa", ".txt"],
787
- visible=False
788
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
 
790
- # Parameters
791
- gr.Markdown("### Parameters")
792
- similarity_threshold = gr.Slider(
793
- minimum=50,
794
- maximum=99,
795
- value=95,
796
- step=1,
797
- label="Similarity Threshold (%)",
798
- info="For phylogenetic tree analysis"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
799
  )
800
-
801
- build_ml_tree_checkbox = gr.Checkbox(
802
- label="Build Maximum Likelihood Tree",
803
- value=False,
804
- info="Requires MAFFT and IQ-TREE (slower but more comprehensive)"
805
  )
806
-
807
- # Action buttons
808
- analyze_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
809
- clear_btn = gr.Button("🗑️ Clear", variant="secondary")
810
-
811
- with gr.Column(scale=2):
812
- gr.Markdown("### Results")
813
-
814
- # Output tabs
815
- with gr.Tabs():
816
- with gr.Tab("Gene Prediction"):
817
- boundary_output = gr.Textbox(
818
- label="F Gene Boundary Prediction",
819
- lines=8,
820
- interactive=False
821
- )
822
-
823
- keras_output = gr.Textbox(
824
- label="F Gene Validation",
825
- lines=3,
826
- interactive=False
827
- )
828
-
829
- with gr.Tab("Phylogenetic Tree"):
830
- ml_tree_output = gr.Textbox(
831
- label="Maximum Likelihood Tree Status",
832
- lines=8,
833
- interactive=False
834
- )
835
-
836
- tree_analysis_output = gr.Textbox(
837
- label="Tree Analysis Status",
838
- lines=4,
839
- interactive=False
840
- )
841
-
842
- # Tree visualization
843
- tree_html = gr.HTML(
844
- label="Interactive Phylogenetic Tree",
845
- elem_classes=["output-html"]
846
- )
847
-
848
- with gr.Tab("Sequence Output"):
849
- extracted_sequence = gr.Textbox(
850
- label="Extracted F Gene Sequence",
851
- lines=10,
852
- interactive=False,
853
- info="The F gene sequence extracted by the boundary model"
854
- )
855
-
856
- with gr.Tab("Download Files"):
857
- alignment_file = gr.File(
858
- label="Alignment File",
859
- interactive=False
860
- )
861
-
862
- tree_file = gr.File(
863
- label="Tree File",
864
- interactive=False
865
- )
866
-
867
- html_tree_file = gr.File(
868
- label="Interactive Tree (HTML)",
869
- interactive=False
870
- )
871
 
 
 
 
872
 
873
- # Event handlers
874
- def toggle_input_method(method):
875
- if method == "Text Input":
876
- return gr.update(visible=True), gr.update(visible=False)
877
- else:
878
- return gr.update(visible=False), gr.update(visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
879
 
880
- input_method.change(
881
- toggle_input_method,
882
- inputs=[input_method],
883
- outputs=[dna_sequence, fasta_file]
884
- )
 
 
 
 
 
 
 
 
 
 
885
 
886
- def run_analysis(input_method, dna_seq, file_obj, similarity, build_ml):
887
- """Main analysis function"""
888
- try:
889
- if input_method == "File Upload" and file_obj is not None:
890
- # Process file upload
891
- results = run_pipeline_from_file(file_obj, similarity, build_ml)
892
- else:
893
- # Process text input
894
- if not dna_seq or not dna_seq.strip():
895
- return ("❌ Please provide a DNA sequence", "", "", "", "", None, None, None, "No input provided")
896
- results = run_pipeline(dna_seq, similarity, build_ml)
897
-
898
- return results
899
-
900
- except Exception as e:
901
- error_msg = f"Analysis failed: {str(e)}"
902
- logging.error(error_msg)
903
- return (error_msg, "", "", "", "", None, None, None, "Analysis failed")
904
-
905
- def clear_all():
906
- """Clear all inputs and outputs"""
907
- return (
908
- "", # dna_sequence
909
- None, # fasta_file
910
- 95, # similarity_threshold
911
- False, # build_ml_tree_checkbox
912
- "", # boundary_output
913
- "", # keras_output
914
- "", # ml_tree_output
915
- "", # tree_analysis_output
916
- "", # extracted_sequence
917
- "", # tree_html
918
- None, # alignment_file
919
- None, # tree_file
920
- None # html_tree_file
921
- )
922
-
923
- # Connect the analyze button
924
- analyze_btn.click(
925
- fn=run_analysis,
926
- inputs=[
927
- input_method,
928
- dna_sequence,
929
- fasta_file,
930
- similarity_threshold,
931
- build_ml_tree_checkbox
932
- ],
933
  outputs=[
934
- boundary_output,
935
  keras_output,
936
  ml_tree_output,
937
  tree_analysis_output,
938
- extracted_sequence,
939
  alignment_file,
940
  tree_file,
941
  html_tree_file,
942
- tree_html
943
  ]
944
  )
945
 
946
- # Connect the clear button
947
- clear_btn.click(
948
- fn=clear_all,
949
  outputs=[
950
- dna_sequence,
951
- fasta_file,
952
- similarity_threshold,
953
- build_ml_tree_checkbox,
954
- boundary_output,
955
  keras_output,
956
  ml_tree_output,
957
  tree_analysis_output,
958
- extracted_sequence,
959
- tree_html,
960
  alignment_file,
961
  tree_file,
962
- html_tree_file
 
963
  ]
964
  )
965
 
966
- return demo
967
 
968
- # --- Main Application ---
969
  if __name__ == "__main__":
970
  try:
971
- # Initialize the Gradio interface
972
- demo = create_gradio_interface()
973
-
974
- # Launch the application
975
- demo.queue(max_size=10) # Enable queuing for long-running tasks
976
- demo.launch(
977
- server_name="0.0.0.0", # Allow external connections
978
- server_port=7860, # Default port
979
- share=False, # Set to True to create public link
980
- debug=True, # Enable debug mode
981
- show_error=True # Show detailed error messages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
982
  )
983
 
984
  except Exception as e:
985
- logging.error(f"Failed to launch application: {e}")
986
- print(f"❌ Application failed to start: {e}")
 
 
 
 
20
  from Bio import SeqIO
21
  from Bio.Seq import Seq
22
  from Bio.SeqRecord import SeqRecord
23
+ import stat
24
 
25
  # --- Global Variables ---
26
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
27
+ MAFFT_PATH = os.path.join(BASE_DIR, "binaries", "mafft", "mafft") # Updated path
28
  IQTREE_PATH = os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
29
  ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
30
  TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
 
128
  logging.error(f"Failed to initialize tree analyzer: {e}")
129
  analyzer = None
130
 
131
+ # --- Enhanced Tool Detection with Binary Permission Setup ---
132
def setup_binary_permissions(binaries=None):
    """Ensure the MAFFT and IQ-TREE binaries carry the owner-execute bit.

    Args:
        binaries: Optional iterable of filesystem paths to process. When
            omitted (the way existing callers invoke this function), the
            module-level ``MAFFT_PATH`` and ``IQTREE_PATH`` are used.

    Returns:
        None. This is a best-effort helper: every problem is logged as a
        warning and never raised, so tool detection can still fall back to
        other candidate locations.
    """
    if binaries is None:
        binaries = [MAFFT_PATH, IQTREE_PATH]

    for binary in binaries:
        # EAFP: stat the path directly instead of exists()-then-stat(), which
        # is race-prone and costs a second filesystem lookup.
        try:
            current_mode = os.stat(binary).st_mode
        except (FileNotFoundError, NotADirectoryError):
            logging.warning(f"Binary not found: {binary}")
            continue
        except OSError as e:
            logging.warning(f"Failed to set executable permission on {binary}: {e}")
            continue

        # Nothing to do when the bit is already set; skipping chmod also
        # avoids a spurious warning when the file cannot be modified.
        if current_mode & stat.S_IEXEC:
            continue

        try:
            os.chmod(binary, current_mode | stat.S_IEXEC)
            logging.info(f"Set executable permission on {binary}")
        except OSError as e:
            logging.warning(f"Failed to set executable permission on {binary}: {e}")
147
+
148
  def check_tool_availability():
149
  """Enhanced check for MAFFT and IQ-TREE availability with improved path validation"""
150
 
151
+ # First, ensure binaries have executable permissions
152
+ setup_binary_permissions()
153
+
154
  # Check MAFFT
155
  mafft_available = False
156
  mafft_cmd = None
157
 
158
+ # Updated MAFFT candidates list based on your new API
159
  mafft_candidates = [
160
+ MAFFT_PATH, # Primary path from your new API
161
+ os.path.join(BASE_DIR, "binaries", "mafft", "mafft"),
162
+ os.path.join(BASE_DIR, "binaries", "mafft", "mafft.bat"), # Windows fallback
 
163
  'mafft',
164
  '/usr/bin/mafft',
165
  '/usr/local/bin/mafft',
 
166
  os.path.join(BASE_DIR, "binaries", "mafft", "mafftdir", "bin", "mafft"),
167
  # Add potential conda/miniconda paths
168
  os.path.expanduser("~/anaconda3/bin/mafft"),
 
199
  iqtree_available = False
200
  iqtree_cmd = None
201
 
202
+ # Updated IQ-TREE candidates list
203
  iqtree_candidates = [
204
+ IQTREE_PATH, # Primary path from your new API
205
  'iqtree2',
206
  'iqtree',
207
  'iqtree3',
 
212
  'iqtree2.exe', # Windows
213
  'iqtree.exe', # Windows
214
  'iqtree3.exe', # Windows
215
+ os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree2"),
216
  # Add potential conda paths
217
  os.path.expanduser("~/anaconda3/bin/iqtree2"),
218
  os.path.expanduser("~/miniconda3/bin/iqtree2"),
 
275
  1. Add installation directory to PATH
276
  2. Use absolute paths in the configuration
277
  3. Check permissions on executable files
278
+ 4. Ensure binaries have executable permissions (chmod +x)
279
  """
280
  return guide
281
 
282
  def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
283
  """
284
+ Improved phylogenetic placement using the new API approach.
285
  This adds the query sequence to a reference alignment and tree.
286
  """
287
  try:
 
302
  if not os.path.exists(TREE_PATH):
303
  return False, f"Reference tree not found: {TREE_PATH}", None, None
304
 
305
+ # Save query sequence as FASTA (improved error handling)
306
  try:
307
+ query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
308
  SeqIO.write([query_record], query_fasta, "fasta")
309
  logging.info(f"Query sequence saved: {query_fasta}")
310
  except Exception as e:
311
+ return False, f"Error writing query sequence: {e}", None, None
312
 
313
+ # Step 1: Add query sequence to reference alignment using MAFFT (improved approach)
314
  logging.info("Adding query sequence to reference alignment...")
315
  try:
316
  with open(aligned_with_query, "w") as output_file:
317
+ mafft_result = subprocess.run([
318
+ mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
319
+ ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
  # Verify alignment file was created and is not empty
322
  if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
 
325
  logging.info(f"MAFFT alignment completed: {aligned_with_query}")
326
 
327
  except subprocess.CalledProcessError as e:
328
+ error_msg = e.stderr if e.stderr else "Unknown MAFFT error"
329
  return False, f"MAFFT alignment failed: {error_msg}", None, None
330
  except subprocess.TimeoutExpired:
331
  return False, "MAFFT alignment timeout (>10 minutes)", None, None
332
+ except FileNotFoundError:
333
+ return False, f"MAFFT executable not found: {mafft_cmd}", None, None
334
  except Exception as e:
335
+ return False, f"MAFFT execution error: {e}", None, None
336
 
337
+ # Step 2: Place sequence in phylogenetic tree using IQ-TREE (improved approach)
338
  logging.info("Placing sequence in phylogenetic tree...")
339
  try:
340
+ iqtree_result = subprocess.run([
341
+ iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
342
+ "-m", "GTR+G", "-pre", output_prefix, "-redo"
343
+ ], capture_output=True, text=True, timeout=1200, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  # Check if treefile was generated
346
  treefile = f"{output_prefix}.treefile"
 
375
  return False, f"IQ-TREE placement failed: {error_msg}", aligned_with_query, None
376
  except subprocess.TimeoutExpired:
377
  return False, "IQ-TREE placement timeout (>20 minutes)", aligned_with_query, None
378
+ except FileNotFoundError:
379
+ return False, f"IQ-TREE executable not found: {iqtree_cmd}", aligned_with_query, None
380
  except Exception as e:
381
+ return False, f"IQ-TREE execution error: {e}", aligned_with_query, None
382
 
383
  except Exception as e:
384
  logging.error(f"Phylogenetic placement failed: {e}")
 
393
 
394
  def build_maximum_likelihood_tree(f_gene_sequence):
395
  """
396
+ Build maximum likelihood phylogenetic tree using the improved phylogenetic placement approach.
 
397
  """
398
  try:
399
  # Check tool availability with enhanced detection
 
433
  status_msg += "Please ensure f_gene_sequences_aligned.fasta and f_gene_sequences.phy.treefile are available."
434
  return False, status_msg, None, None
435
 
436
+ # Perform phylogenetic placement using improved method
437
  logging.info("Starting phylogenetic placement...")
438
  placement_success, placement_message, aligned_file, tree_file = phylogenetic_placement(
439
  f_gene_sequence, mafft_cmd, iqtree_cmd
 
629
  else:
630
  keras_output = "Skipped: sequence too short for F gene validation"
631
 
632
+ # Step 3: Maximum Likelihood Tree (Phylogenetic Placement) - Using improved API
633
  aligned_file = None
634
  phy_file = None
635
  ml_tree_output = ""
 
654
  else:
655
  ml_tree_output = "Phylogenetic placement skipped (not requested)"
656
 
657
+ # Step 4: NEW Simplified Tree Analysis (using the new analyzer API)
658
  html_file = None
659
  tree_html_content = "No tree generated"
660
  simplified_ml_output = ""
 
672
  os.makedirs(output_dir, exist_ok=True)
673
 
674
  # Create a safe filename
675
+ safe_seq_name = re.sub(r'[^a-zA-Z0-9_-]', '', processed_sequence[:20])
676
+ timestamp = str(int(time.time()))
677
+ html_filename = f"tree_{safe_seq_name}_{timestamp}.html"
678
+ final_html_path = os.path.join(output_dir, html_filename)
679
 
680
  # Copy the HTML file
681
+ shutil.copy2(html_path, final_html_path)
682
+ html_file = final_html_path
683
 
684
  # Read HTML content for display
685
+ with open(html_path, 'r', encoding='utf-8') as f:
686
  tree_html_content = f.read()
687
 
688
  simplified_ml_output = tree_result
689
+ logging.info(f"Tree analysis completed successfully: {html_filename}")
690
+
691
+ # Clean up temporary file
692
+ try:
693
+ os.unlink(html_path)
694
+ except:
695
+ pass
696
+
697
  else:
698
+ simplified_ml_output = tree_result # Error message
699
+ tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
700
 
701
  except Exception as e:
702
+ error_msg = f"❌ Tree analysis failed: {str(e)}"
703
+ simplified_ml_output = error_msg
704
+ tree_html_content = f"<div style='color: red;'>{error_msg}</div>"
705
+ logging.error(f"Tree analysis failed: {e}")
 
 
706
  else:
707
+ if not analyzer:
708
+ simplified_ml_output = "❌ Tree analyzer not available (CSV data not loaded)"
709
+ elif len(processed_sequence) < 10:
710
+ simplified_ml_output = "❌ F gene sequence too short for tree analysis (minimum 10 bp)"
711
+ else:
712
+ simplified_ml_output = "❌ No processed sequence available for tree analysis"
713
+
714
+ tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
715
+
716
+ # Final summary
717
+ summary_output = f"""
718
+ 🧬 ANALYSIS SUMMARY:
719
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
720
+ 📊 INPUT: {len(dna_input)} bp DNA sequence
721
+ 🎯 F GENE EXTRACTED: {len(processed_sequence)} bp
722
+ ✅ F GENE VALIDATION: {keras_output}
723
+ 🌳 PHYLOGENETIC PLACEMENT: {'✅ Completed' if 'successfully' in ml_tree_output else '❌ ' + ('Skipped' if 'skipped' in ml_tree_output else 'Failed')}
724
+ 🔬 TREE ANALYSIS: {'✅ Completed' if '✅' in simplified_ml_output else '❌ ' + ('Not available' if 'not available' in simplified_ml_output else 'Failed')}
725
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
726
+ """
727
 
 
 
 
 
 
 
728
  return (
729
+ boundary_output, # F gene sequence
730
+ keras_output, # F gene validation
731
+ ml_tree_output, # Phylogenetic placement
732
+ simplified_ml_output, # Tree analysis
733
+ summary_output, # Summary
734
+ aligned_file, # Alignment file
735
+ phy_file, # Tree file
736
+ html_file, # HTML tree file
737
+ tree_html_content # HTML content for display
738
  )
739
 
740
  except Exception as e:
741
  error_msg = f"Pipeline error: {str(e)}"
742
  logging.error(error_msg)
743
+ import traceback
744
+ logging.error(f"Full traceback: {traceback.format_exc()}")
745
+ return error_msg, "", "", "", "", None, None, None, error_msg
746
+
747
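+ # run_pipeline() calls time.time() to timestamp output filenames, so `time` must be imported (ideally alongside the other imports at the top of the file)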
748
+ import time
749
 
750
  # --- Gradio Interface ---
751
+ def create_interface():
752
  """Create and configure the Gradio interface"""
753
 
754
+ # Custom CSS for better styling
755
+ custom_css = """
756
+ .gradio-container {
757
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
758
+ }
759
+ .gr-button-primary {
760
+ background: linear-gradient(45deg, #1e3a8a, #3b82f6);
761
+ border: none;
762
+ border-radius: 8px;
763
+ font-weight: 600;
764
+ }
765
+ .gr-button-primary:hover {
766
+ background: linear-gradient(45deg, #1e40af, #2563eb);
767
+ transform: translateY(-1px);
768
+ box-shadow: 0 4px 12px rgba(59, 130, 246, 0.4);
769
+ }
770
+ .gr-textbox, .gr-textarea {
771
+ border-radius: 8px;
772
+ border: 2px solid #e5e7eb;
773
+ }
774
+ .gr-textbox:focus, .gr-textarea:focus {
775
+ border-color: #3b82f6;
776
+ box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1);
777
+ }
778
+ .warning-box {
779
+ background: linear-gradient(135deg, #fef3c7, #fbbf24);
780
+ border: 1px solid #f59e0b;
781
+ border-radius: 8px;
782
+ padding: 12px;
783
+ margin: 8px 0;
784
+ }
785
+ .success-box {
786
+ background: linear-gradient(135deg, #d1fae5, #10b981);
787
+ border: 1px solid #059669;
788
+ border-radius: 8px;
789
+ padding: 12px;
790
+ margin: 8px 0;
791
+ }
792
+ .error-box {
793
+ background: linear-gradient(135deg, #fee2e2, #ef4444);
794
+ border: 1px solid #dc2626;
795
+ border-radius: 8px;
796
+ padding: 12px;
797
+ margin: 8px 0;
798
+ }
799
+ """
800
+
801
+ with gr.Blocks(css=custom_css, title="🧬 Advanced Gene Analysis Pipeline", theme=gr.themes.Soft()) as iface:
802
+
803
+ # Header
804
+ gr.HTML("""
805
+ <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 20px;">
806
+ <h1 style="color: white; margin: 0; font-size: 2.5em; font-weight: 700;">🧬 Advanced Gene Analysis Pipeline</h1>
807
+ <p style="color: rgba(255,255,255,0.9); margin: 10px 0 0 0; font-size: 1.2em;">F Gene Boundary Detection • Validation • Phylogenetic Analysis</p>
808
+ </div>
809
  """)
810
 
811
+ # Instructions
812
+ with gr.Accordion("📋 Instructions & Information", open=False):
813
+ gr.HTML("""
814
+ <div style="background: #f8fafc; padding: 20px; border-radius: 10px; border-left: 4px solid #3b82f6;">
815
+ <h3 style="color: #1e40af; margin-top: 0;">🔬 Pipeline Overview</h3>
816
+ <ol style="line-height: 1.6;">
817
+ <li><strong>F Gene Extraction:</strong> Uses boundary-aware model to identify and extract F gene regions</li>
818
+ <li><strong>Gene Validation:</strong> Validates extracted sequence as F gene using deep learning</li>
819
+ <li><strong>Phylogenetic Placement:</strong> Places sequence in reference phylogenetic tree (MAFFT + IQ-TREE)</li>
820
+ <li><strong>Interactive Tree Analysis:</strong> Creates interactive phylogenetic tree with similar sequences</li>
821
+ </ol>
822
+
823
+ <h3 style="color: #1e40af;">📁 Input Requirements</h3>
824
+ <ul style="line-height: 1.6;">
825
+ <li><strong>DNA Sequence:</strong> Minimum 100 bp for phylogenetic analysis</li>
826
+ <li><strong>FASTA Format:</strong> Supported for file uploads</li>
827
+ <li><strong>Similarity Score:</strong> 70-99% (default: 95%)</li>
828
+ </ul>
829
+
830
+ <h3 style="color: #1e40af;">⚙️ Dependencies</h3>
831
+ <p style="background: #fef3c7; padding: 10px; border-radius: 5px; border-left: 3px solid #f59e0b;">
832
+ <strong>Required:</strong> MAFFT and IQ-TREE must be installed for phylogenetic analysis.<br>
833
+ <strong>Installation:</strong> <code>conda install -c bioconda mafft iqtree</code>
834
+ </p>
835
+ </div>
836
+ """)
837
+
838
+ # Main input section
839
+ with gr.Row():
840
+ with gr.Column(scale=2):
841
+ gr.HTML("<h3 style='color: #1e40af; margin-bottom: 10px;'>📝 Sequence Input</h3>")
842
+
843
+ # Input tabs
844
+ with gr.Tabs():
845
+ with gr.TabItem("✍️ Text Input"):
846
+ dna_input = gr.Textbox(
847
+ label="DNA Sequence",
848
+ placeholder="Enter your DNA sequence here (A, T, C, G, N)...",
849
+ lines=6,
850
+ value="",
851
+ info="Paste your DNA sequence or enter it manually"
852
+ )
853
 
854
+ with gr.TabItem("📁 File Upload"):
855
+ fasta_file = gr.File(
856
+ label="Upload FASTA File",
857
+ file_types=[".fasta", ".fa", ".fas", ".txt"],
858
+ type="filepath"
859
+ )
860
+
861
+ with gr.Column(scale=1):
862
+ gr.HTML("<h3 style='color: #1e40af; margin-bottom: 10px;'>⚙️ Analysis Settings</h3>")
863
+
864
+ similarity_score = gr.Slider(
865
+ minimum=70.0,
866
+ maximum=99.0,
867
+ value=95.0,
868
+ step=1.0,
869
+ label="Similarity Threshold (%)",
870
+ info="Minimum similarity for tree analysis"
871
+ )
872
+
873
+ build_ml_tree = gr.Checkbox(
874
+ label="🌳 Enable Phylogenetic Placement",
875
+ value=False,
876
+ info="Requires MAFFT and IQ-TREE (slower but more accurate)"
877
+ )
878
+
879
+ # Action buttons
880
+ with gr.Row():
881
+ analyze_text_btn = gr.Button(
882
+ "🚀 Analyze Text Input",
883
+ variant="primary",
884
+ size="lg"
885
  )
886
+ analyze_file_btn = gr.Button(
887
+ "📁 Analyze File",
888
+ variant="secondary",
889
+ size="lg"
 
890
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
 
892
+ # Results section
893
+ gr.HTML("<hr style='margin: 30px 0; border: none; height: 2px; background: linear-gradient(to right, #3b82f6, #8b5cf6);'>")
894
+ gr.HTML("<h2 style='color: #1e40af; text-align: center; margin-bottom: 20px;'>📊 Analysis Results</h2>")
895
 
896
+ # Output tabs
897
+ with gr.Tabs():
898
+ with gr.TabItem("🎯 F Gene Extraction"):
899
+ f_gene_output = gr.Textbox(
900
+ label="Extracted F Gene Sequence",
901
+ lines=8,
902
+ info="Boundary-detected F gene region"
903
+ )
904
+
905
+ with gr.TabItem("✅ Gene Validation"):
906
+ keras_output = gr.Textbox(
907
+ label="F Gene Validation Result",
908
+ lines=3,
909
+ info="Deep learning validation of F gene"
910
+ )
911
+
912
+ with gr.TabItem("🌳 Phylogenetic Placement"):
913
+ ml_tree_output = gr.Textbox(
914
+ label="Phylogenetic Placement Results",
915
+ lines=10,
916
+ info="MAFFT alignment + IQ-TREE placement results"
917
+ )
918
+
919
+ with gr.TabItem("🔬 Interactive Tree"):
920
+ tree_analysis_output = gr.Textbox(
921
+ label="Tree Analysis Status",
922
+ lines=5,
923
+ info="Interactive phylogenetic tree generation"
924
+ )
925
+ tree_html_display = gr.HTML(
926
+ label="Interactive Phylogenetic Tree",
927
+ value="<div style='text-align: center; color: #6b7280; padding: 40px;'>No tree generated yet. Run analysis to create interactive tree.</div>"
928
+ )
929
+
930
+ with gr.TabItem("📋 Summary"):
931
+ summary_output = gr.Textbox(
932
+ label="Analysis Summary",
933
+ lines=12,
934
+ info="Complete pipeline summary"
935
+ )
936
 
937
+ # Download section
938
+ with gr.Accordion("💾 Download Results", open=False):
939
+ with gr.Row():
940
+ alignment_file = gr.File(
941
+ label="📄 Download Alignment",
942
+ visible=True
943
+ )
944
+ tree_file = gr.File(
945
+ label="🌳 Download Tree",
946
+ visible=True
947
+ )
948
+ html_tree_file = gr.File(
949
+ label="🌐 Download Interactive Tree (HTML)",
950
+ visible=True
951
+ )
952
 
953
+ # Footer
954
+ gr.HTML("""
955
+ <div style="text-align: center; padding: 20px; margin-top: 30px; border-top: 2px solid #e5e7eb; color: #6b7280;">
956
+ <p style="margin: 0;">🧬 Advanced Gene Analysis Pipeline | Powered by Deep Learning & Phylogenetics</p>
957
+ <p style="margin: 5px 0 0 0; font-size: 0.9em;">Built with Gradio • MAFFT • IQ-TREE • TensorFlow</p>
958
+ </div>
959
+ """)
960
+
961
+ # Event handlers
962
+ analyze_text_btn.click(
963
+ fn=run_pipeline,
964
+ inputs=[dna_input, similarity_score, build_ml_tree],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
965
  outputs=[
966
+ f_gene_output,
967
  keras_output,
968
  ml_tree_output,
969
  tree_analysis_output,
970
+ summary_output,
971
  alignment_file,
972
  tree_file,
973
  html_tree_file,
974
+ tree_html_display
975
  ]
976
  )
977
 
978
+ analyze_file_btn.click(
979
+ fn=run_pipeline_from_file,
980
+ inputs=[fasta_file, similarity_score, build_ml_tree],
981
  outputs=[
982
+ f_gene_output,
 
 
 
 
983
  keras_output,
984
  ml_tree_output,
985
  tree_analysis_output,
986
+ summary_output,
 
987
  alignment_file,
988
  tree_file,
989
+ html_tree_file,
990
+ tree_html_display
991
  ]
992
  )
993
 
994
+ return iface
995
 
996
+ # --- Main Execution ---
997
if __name__ == "__main__":
    # Script entry point: print a startup report (which models and external
    # tools are available), then build and serve the Gradio interface.
    # Any failure during startup is logged, echoed to the console together
    # with its traceback, and terminates the process with exit status 1.
    try:
        # Print startup information
        print("🧬 Advanced Gene Analysis Pipeline")
        print("=" * 50)
        print(f"Base Directory: {BASE_DIR}")
        print(f"Boundary Model: {'✅ Loaded' if boundary_model else '❌ Not Available'}")
        print(f"Keras Model: {'✅ Loaded' if keras_model else '❌ Not Available'}")
        print(f"Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Not Available'}")

        # Check tool availability
        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
        print(f"MAFFT: {'✅ Available' if mafft_available else '❌ Not Found'}")
        print(f"IQ-TREE: {'✅ Available' if iqtree_available else '❌ Not Found'}")

        # Missing tools are only a warning: the interface is still started.
        if not mafft_available or not iqtree_available:
            print("\n⚠️ Warning: Some phylogenetic tools are missing!")
            print("Install with: conda install -c bioconda mafft iqtree")

        print("\n🚀 Starting Gradio interface...")

        # Create and launch interface
        iface = create_interface()

        # Queuing for long-running tasks is enabled through queue() rather
        # than the `enable_queue` launch keyword. `enable_queue` and
        # `show_tips` were removed from launch() in Gradio 4 (the API this
        # file targets, e.g. gr.File(type="filepath")); passing them raises
        # TypeError and prevents the app from starting.
        iface.queue()
        iface.launch(
            share=False,  # Set to True if you want to create a public link
            server_name="0.0.0.0",  # Allow connections from any IP
            server_port=7860,  # Default Gradio port
            show_error=True,
            max_threads=4,  # Limit concurrent processing
        )

    except Exception as e:
        # Top-level boundary: record the failure (with traceback) in the log,
        # mirror it on the console, and exit non-zero so the host sees the
        # startup failure.
        import traceback

        logging.exception(f"Failed to start application: {e}")
        print(f"Error: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        sys.exit(1)