re-type committed on
Commit
b5a86a2
·
verified ·
1 Parent(s): 57d61f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +377 -256
app.py CHANGED
@@ -712,383 +712,504 @@ def predict_with_keras(sequence):
712
  input_arr = np.array([indices])
713
  prediction = keras_model.predict(input_arr, verbose=0)[0]
714
 
715
- # Format prediction as probabilities/scores (not a sequence)
716
- result = ''.join([str(round(p, 3)) for p in prediction])
 
 
 
 
 
 
 
 
 
 
717
  return result
718
  except Exception as e:
719
- logging.error(f"Keras prediction failed: {e}")
720
  return f"Keras prediction failed: {str(e)}"
721
 
722
- # --- Boundary Model Prediction ---
723
- def predict_with_boundary_model(sequence):
724
  try:
725
  if not boundary_model:
726
  return f"Boundary model not available. Input sequence: {sequence[:100]}..."
727
 
 
728
  predictions, probabilities, confidence = boundary_model.predict(sequence)
729
 
730
  # Extract gene regions
731
  regions = boundary_model.extract_gene_regions(predictions, sequence)
732
 
 
 
 
 
733
  if regions:
734
- result = f"Confidence: {confidence:.3f}\n"
735
- result += f"Regions found: {len(regions)}\n"
736
  for i, region in enumerate(regions[:3]): # Show first 3 regions
737
- result += f"Region {i+1}: {region['start']}-{region['end']} ({len(region['sequence'])} bp)\n"
738
- if len(regions) > 3:
739
- result += f"... and {len(regions) - 3} more regions\n"
740
-
741
- # Return the longest region's sequence
742
- longest_region = max(regions, key=lambda x: len(x['sequence']))
743
- return longest_region['sequence']
744
- else:
745
- return f"No gene regions found. Confidence: {confidence:.3f}"
746
-
747
  except Exception as e:
748
- logging.error(f"Boundary model prediction failed: {e}")
749
- return f"Boundary model prediction failed: {str(e)}"
750
 
751
- # --- Combined Prediction ---
752
- def predict_f_gene(sequence):
753
- """Main prediction function that combines all models"""
754
  try:
755
- # Clean sequence
756
- sequence = sequence.upper().strip()
757
- sequence = re.sub(r'[^ATCG]', '', sequence)
 
 
758
 
759
  if len(sequence) < 10:
760
- return "Error: Sequence too short (minimum 10 nucleotides required)."
761
 
762
  results = []
 
763
  results.append(f"Input sequence length: {len(sequence)} bp\n")
 
764
 
765
- # Try boundary model first
766
  if boundary_model:
767
- results.append("🔍 BOUNDARY MODEL PREDICTION:")
768
- boundary_result = predict_with_boundary_model(sequence)
769
  results.append(boundary_result)
770
- results.append("")
 
771
 
772
- # Try Keras model
773
- if keras_model and kmer_to_index:
774
- results.append("🧠 KERAS MODEL PREDICTION:")
775
  keras_result = predict_with_keras(sequence)
776
  results.append(keras_result)
777
- results.append("")
778
-
779
- # Run verification pipeline
780
- verification_results = run_verification_pipeline(sequence)
781
- if verification_results:
782
- results.append(format_verification_results(verification_results))
783
 
784
- # If no models available
785
- if not boundary_model and not keras_model:
786
- results.append("⚠️ No prediction models available.")
787
- results.append("Models are being loaded from Hugging Face Hub...")
 
 
788
 
789
  return "\n".join(results)
790
 
791
  except Exception as e:
792
- logging.error(f"F-gene prediction failed: {e}")
793
- return f"Prediction failed: {str(e)}"
794
 
795
  # --- File Processing Functions ---
796
- def process_fasta_file(file_path):
797
  """Process uploaded FASTA file"""
798
  try:
 
 
 
 
 
 
 
 
799
  sequences = {}
800
  current_seq = ""
801
  current_name = ""
802
 
803
- with open(file_path, 'r') as f:
804
- for line in f:
805
- line = line.strip()
806
- if line.startswith('>'):
807
- if current_name and current_seq:
808
- sequences[current_name] = current_seq
809
- current_name = line[1:]
810
- current_seq = ""
811
- else:
812
- current_seq += line.upper()
813
-
814
- # Add last sequence
815
- if current_name and current_seq:
816
- sequences[current_name] = current_seq
817
 
818
- return sequences
819
- except Exception as e:
820
- logging.error(f"FASTA processing failed: {e}")
821
- return None
822
-
823
- def batch_predict_fasta(file_path):
824
- """Batch prediction for FASTA file"""
825
- try:
826
- sequences = process_fasta_file(file_path)
827
  if not sequences:
828
- return "Error: Could not process FASTA file."
829
 
 
830
  results = []
831
- results.append(f"Processing {len(sequences)} sequences from FASTA file:\n")
 
 
832
 
833
- for i, (name, sequence) in enumerate(sequences.items()):
834
- if i >= 10: # Limit to first 10 sequences
835
- results.append(f"... and {len(sequences) - 10} more sequences (showing first 10)")
836
  break
 
 
 
837
 
838
- results.append(f"📄 SEQUENCE: {name}")
839
- results.append(f"Length: {len(sequence)} bp")
 
 
 
 
 
 
840
 
841
- # Predict
842
- prediction = predict_f_gene(sequence)
843
- results.append(prediction)
844
- results.append("-" * 50)
845
 
846
  return "\n".join(results)
847
 
848
  except Exception as e:
849
- logging.error(f"Batch FASTA prediction failed: {e}")
850
- return f"Batch prediction failed: {str(e)}"
851
 
852
- # --- Gradio Interface Functions ---
853
- def predict_sequence(sequence, file_upload=None):
854
- """Main interface function for sequence prediction"""
855
  try:
856
- # Handle file upload
857
- if file_upload is not None:
858
- return batch_predict_fasta(file_upload.name)
 
 
 
 
 
859
 
860
- # Handle text input
861
- if not sequence or len(sequence.strip()) < 10:
862
- return "Please enter a DNA sequence (minimum 10 nucleotides) or upload a FASTA file."
863
 
864
- return predict_f_gene(sequence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865
 
866
  except Exception as e:
867
- logging.error(f"Interface prediction failed: {e}")
868
- return f"Prediction interface error: {str(e)}"
869
 
870
- def build_tree_interface(sequence, matching_percentage=85):
871
- """Interface function for phylogenetic tree building"""
872
  try:
873
- if not sequence or len(sequence.strip()) < 10:
874
- return "Please enter a DNA sequence (minimum 10 nucleotides).", "", ""
875
 
876
- # Try ML tree first
877
- ml_success, ml_message, aligned_file, tree_file = build_maximum_likelihood_tree(sequence)
878
 
879
- if ml_success and tree_file:
880
- # ML tree successful
881
- tree_info = f"🌳 Maximum Likelihood Tree Built Successfully!\n\n{ml_message}"
882
-
883
- # Also try simplified tree analysis
884
- html_content, html_file, simple_message = analyze_sequence_for_tree(sequence, matching_percentage)
 
 
 
 
 
 
 
 
 
 
 
 
885
 
886
- if html_content:
887
- return tree_info, html_content, f"{tree_info}\n\n{simple_message}"
888
- else:
889
- return tree_info, "", tree_info
890
  else:
891
- # ML failed, try simplified tree
892
- html_content, html_file, simple_message = analyze_sequence_for_tree(sequence, matching_percentage)
893
 
894
- if html_content:
895
- fallback_msg = f"⚠️ ML Tree Construction Issues:\n{ml_message}\n\n"
896
- fallback_msg += "📊 Simplified Tree Analysis:\n"
897
- return fallback_msg, html_content, f"{fallback_msg}{simple_message}"
898
- else:
899
- return f"❌ Tree construction failed:\n{ml_message}\n\nSimplified analysis: {simple_message}", "", ""
900
-
901
- except Exception as e:
902
- error_msg = f"Tree building interface error: {str(e)}"
903
- logging.error(error_msg)
904
- return error_msg, "", ""
905
-
906
- def get_system_status():
907
- """Get system status for debugging"""
908
- try:
909
- status = []
910
- status.append("🔧 SYSTEM STATUS:")
911
- status.append("")
912
-
913
- # Model status
914
- status.append("📊 MODELS:")
915
- status.append(f" - Boundary Model: {'✅ Loaded' if boundary_model else '❌ Not loaded'}")
916
- status.append(f" - Keras Model: {'✅ Loaded' if keras_model else '❌ Not loaded'}")
917
- status.append(f" - Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Not loaded'}")
918
- status.append(f" - Verification Models: {len(verification_models)} loaded")
919
- status.append("")
920
-
921
- # Tool availability
922
- mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
923
- status.append("🛠️ PHYLOGENETIC TOOLS:")
924
- status.append(f" - MAFFT: {'✅ Available' if mafft_available else '❌ Not available'}")
925
- if mafft_available:
926
- status.append(f" Path: {mafft_cmd}")
927
- status.append(f" - IQ-TREE: {'✅ Available' if iqtree_available else '❌ Not available'}")
928
- if iqtree_available:
929
- status.append(f" Path: {iqtree_cmd}")
930
- status.append("")
931
-
932
- # File system
933
- status.append("📁 FILES:")
934
- status.append(f" - CSV Data: {'✅ Found' if os.path.exists(csv_path) else '❌ Not found'}")
935
- status.append(f" - Models Directory: {'✅ Found' if os.path.exists('models') else '❌ Not found'}")
936
- status.append(f" - Output Directory: {'✅ Created' if os.path.exists('output') else '📝 Will be created'}")
937
-
938
- return "\n".join(status)
939
-
940
  except Exception as e:
941
- return f"Status check failed: {str(e)}"
 
942
 
943
  # --- Gradio Interface ---
944
  def create_gradio_interface():
945
- """Create the main Gradio interface"""
946
 
947
- # Custom CSS
948
  css = """
949
  .gradio-container {
950
- max-width: 1200px;
951
- margin: auto;
952
- }
953
- .tab-nav button {
954
- font-size: 16px;
955
- font-weight: bold;
956
  }
957
  .output-text {
958
  font-family: 'Courier New', monospace;
959
- font-size: 14px;
 
 
 
 
960
  }
961
  """
962
 
963
- with gr.Blocks(css=css, title="F-Gene Prediction & Phylogenetic Analysis") as demo:
964
- gr.Markdown("# 🧬 F-Gene Prediction & Phylogenetic Analysis Platform")
965
- gr.Markdown("Advanced bioinformatics pipeline for F-gene prediction and phylogenetic tree construction")
 
 
 
 
 
 
 
966
 
967
  with gr.Tabs():
968
  # Tab 1: Gene Prediction
969
- with gr.TabItem("🔬 Gene Prediction"):
970
- gr.Markdown("## F-Gene Sequence Prediction")
971
- gr.Markdown("Enter a DNA sequence or upload a FASTA file for F-gene prediction using multiple ML models.")
972
 
973
  with gr.Row():
974
  with gr.Column(scale=2):
975
- sequence_input = gr.Textbox(
976
  label="DNA Sequence",
977
- placeholder="Enter DNA sequence (ATCG format)...",
978
  lines=5,
979
  max_lines=10
980
  )
981
- file_input = gr.File(
982
- label="Or Upload FASTA File",
983
- file_types=[".fasta", ".fa", ".fas", ".fna"]
984
- )
985
- predict_btn = gr.Button("🔍 Predict F-Gene", variant="primary")
986
 
987
  with gr.Column(scale=3):
988
  prediction_output = gr.Textbox(
989
- label="Prediction Results",
990
  lines=20,
991
  max_lines=30,
992
  elem_classes=["output-text"]
993
  )
994
 
995
  predict_btn.click(
996
- fn=predict_sequence,
997
- inputs=[sequence_input, file_input],
998
- outputs=prediction_output
999
  )
1000
 
1001
- # Tab 2: Phylogenetic Analysis
1002
- with gr.TabItem("🌳 Phylogenetic Tree"):
1003
- gr.Markdown("## Phylogenetic Tree Construction")
1004
- gr.Markdown("Build maximum likelihood phylogenetic trees and perform sequence similarity analysis.")
1005
 
1006
  with gr.Row():
1007
  with gr.Column(scale=1):
1008
- tree_sequence_input = gr.Textbox(
1009
- label="DNA Sequence for Tree Analysis",
1010
- placeholder="Enter DNA sequence...",
1011
- lines=5
1012
- )
1013
- similarity_slider = gr.Slider(
1014
- minimum=70,
1015
- maximum=99,
1016
- value=85,
1017
- step=1,
1018
- label="Similarity Threshold (%)"
1019
- )
1020
- tree_btn = gr.Button("🌳 Build Tree", variant="primary")
1021
-
1022
- tree_status = gr.Textbox(
1023
- label="Tree Construction Status",
1024
- lines=8,
1025
- elem_classes=["output-text"]
1026
  )
 
1027
 
1028
  with gr.Column(scale=2):
1029
- tree_output = gr.HTML(
1030
- label="Interactive Phylogenetic Tree",
1031
- height=600
1032
- )
1033
-
1034
- tree_info = gr.Textbox(
1035
- label="Tree Information",
1036
- lines=5,
1037
  elem_classes=["output-text"]
1038
  )
1039
 
1040
- tree_btn.click(
1041
- fn=build_tree_interface,
1042
- inputs=[tree_sequence_input, similarity_slider],
1043
- outputs=[tree_status, tree_output, tree_info]
1044
  )
1045
 
1046
- # Tab 3: System Status
1047
- with gr.TabItem("⚙️ System Status"):
1048
- gr.Markdown("## System Status & Diagnostics")
1049
 
1050
- status_btn = gr.Button("🔄 Refresh Status", variant="secondary")
1051
- status_output = gr.Textbox(
1052
- label="System Status",
1053
- lines=20,
1054
- elem_classes=["output-text"]
1055
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1056
 
1057
- status_btn.click(
1058
- fn=get_system_status,
1059
- outputs=status_output
1060
- )
1061
 
1062
- # Load initial status
1063
- demo.load(
1064
- fn=get_system_status,
1065
- outputs=status_output
1066
- )
1067
-
1068
- # Footer
1069
- gr.Markdown("---")
1070
- gr.Markdown("🔬 **Powered by**: PyTorch, TensorFlow, MAFFT, IQ-TREE, and Plotly | 🧬 **Bioinformatics Pipeline v2.0**")
1071
-
1072
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1073
 
1074
  # --- Main Application ---
1075
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1076
  try:
1077
- logging.info("Starting F-Gene Prediction & Phylogenetic Analysis Platform...")
1078
 
1079
- # Create Gradio interface
1080
- demo = create_gradio_interface()
1081
-
1082
- # Launch the interface
1083
- demo.launch(
1084
- server_name="0.0.0.0",
1085
  server_port=7860,
1086
- share=True,
1087
- debug=True,
1088
- show_error=True
1089
  )
1090
 
1091
  except Exception as e:
1092
- logging.error(f"Failed to start application: {e}")
1093
- print(f"Error: {e}")
1094
- print("Please check the logs and ensure all dependencies are installed.")
 
 
712
  input_arr = np.array([indices])
713
  prediction = keras_model.predict(input_arr, verbose=0)[0]
714
 
715
+ # Format prediction as probabilities/scores
716
+ mean_score = np.mean(prediction)
717
+ max_score = np.max(prediction)
718
+ min_score = np.min(prediction)
719
+
720
+ result = f"Keras Model Prediction Results:\n"
721
+ result += f"- Mean Score: {mean_score:.4f}\n"
722
+ result += f"- Max Score: {max_score:.4f}\n"
723
+ result += f"- Min Score: {min_score:.4f}\n"
724
+ result += f"- Total K-mers: {len(kmers)}\n"
725
+ result += f"- Sequence Length: {len(sequence)} bp"
726
+
727
  return result
728
  except Exception as e:
729
+ logging.error(f"Keras prediction error: {e}")
730
  return f"Keras prediction failed: {str(e)}"
731
 
732
+ # --- Boundary Prediction ---
733
+ def predict_with_boundary(sequence):
734
  try:
735
  if not boundary_model:
736
  return f"Boundary model not available. Input sequence: {sequence[:100]}..."
737
 
738
+ # Get predictions from boundary model
739
  predictions, probabilities, confidence = boundary_model.predict(sequence)
740
 
741
  # Extract gene regions
742
  regions = boundary_model.extract_gene_regions(predictions, sequence)
743
 
744
+ result = f"Boundary Model Prediction Results:\n"
745
+ result += f"- Overall Confidence: {confidence:.4f}\n"
746
+ result += f"- Regions Detected: {len(regions) if regions else 0}\n"
747
+
748
  if regions:
 
 
749
  for i, region in enumerate(regions[:3]): # Show first 3 regions
750
+ result += f"\nRegion {i+1}:\n"
751
+ result += f" - Start: {region['start']}\n"
752
+ result += f" - End: {region['end']}\n"
753
+ result += f" - Length: {len(region['sequence'])} bp\n"
754
+ result += f" - Confidence: {region.get('confidence', 'N/A'):.4f}\n"
755
+
756
+ return result
 
 
 
757
  except Exception as e:
758
+ logging.error(f"Boundary prediction error: {e}")
759
+ return f"Boundary prediction failed: {str(e)}"
760
 
761
+ # --- Combined Prediction Function ---
762
+ def predict_gene_sequence(sequence):
763
+ """Combined prediction using both models"""
764
  try:
765
+ if not sequence or len(sequence.strip()) == 0:
766
+ return "Please provide a DNA sequence."
767
+
768
+ # Clean and validate sequence
769
+ sequence = re.sub(r'[^ATCG]', '', sequence.upper())
770
 
771
  if len(sequence) < 10:
772
+ return "Sequence too short. Please provide at least 10 nucleotides."
773
 
774
  results = []
775
+ results.append(f"🧬 GENE SEQUENCE ANALYSIS\n")
776
  results.append(f"Input sequence length: {len(sequence)} bp\n")
777
+ results.append("=" * 50)
778
 
779
+ # Boundary model prediction
780
  if boundary_model:
781
+ results.append("\n🎯 BOUNDARY DETECTION:")
782
+ boundary_result = predict_with_boundary(sequence)
783
  results.append(boundary_result)
784
+ else:
785
+ results.append("\n❌ Boundary model not available")
786
 
787
+ # Keras model prediction
788
+ if keras_model:
789
+ results.append("\n🔍 KERAS MODEL ANALYSIS:")
790
  keras_result = predict_with_keras(sequence)
791
  results.append(keras_result)
792
+ else:
793
+ results.append("\n❌ Keras model not available")
 
 
 
 
794
 
795
+ # Verification models
796
+ if verification_models:
797
+ results.append("\n🔬 VERIFICATION ANALYSIS:")
798
+ verification_result = run_verification_pipeline(sequence)
799
+ formatted_verification = format_verification_results(verification_result)
800
+ results.append(formatted_verification)
801
 
802
  return "\n".join(results)
803
 
804
  except Exception as e:
805
+ logging.error(f"Gene prediction error: {e}")
806
+ return f"Gene prediction failed: {str(e)}"
807
 
808
  # --- File Processing Functions ---
809
+ def process_fasta_file(file):
810
  """Process uploaded FASTA file"""
811
  try:
812
+ if file is None:
813
+ return "Please upload a FASTA file."
814
+
815
+ # Read file content
816
+ with open(file.name, 'r') as f:
817
+ content = f.read()
818
+
819
+ # Parse FASTA
820
  sequences = {}
821
  current_seq = ""
822
  current_name = ""
823
 
824
+ lines = content.strip().split('\n')
825
+ for line in lines:
826
+ line = line.strip()
827
+ if line.startswith('>'):
828
+ if current_name and current_seq:
829
+ sequences[current_name] = current_seq
830
+ current_name = line[1:] # Remove '>'
831
+ current_seq = ""
832
+ else:
833
+ current_seq += line.upper()
834
+
835
+ # Add last sequence
836
+ if current_name and current_seq:
837
+ sequences[current_name] = current_seq
838
 
 
 
 
 
 
 
 
 
 
839
  if not sequences:
840
+ return "No valid sequences found in FASTA file."
841
 
842
+ # Process each sequence
843
  results = []
844
+ results.append(f"📁 FASTA FILE ANALYSIS")
845
+ results.append(f"Found {len(sequences)} sequences\n")
846
+ results.append("=" * 60)
847
 
848
+ for i, (name, seq) in enumerate(sequences.items()):
849
+ if i >= 5: # Limit to first 5 sequences
850
+ results.append(f"\n... and {len(sequences) - 5} more sequences")
851
  break
852
+
853
+ results.append(f"\n🧬 Sequence: {name}")
854
+ results.append(f"Length: {len(seq)} bp")
855
 
856
+ # Clean sequence
857
+ clean_seq = re.sub(r'[^ATCG]', '', seq)
858
+ if len(clean_seq) >= 10:
859
+ # Run prediction on cleaned sequence
860
+ prediction = predict_gene_sequence(clean_seq)
861
+ results.append(prediction)
862
+ else:
863
+ results.append("❌ Sequence too short or invalid")
864
 
865
+ results.append("-" * 40)
 
 
 
866
 
867
  return "\n".join(results)
868
 
869
  except Exception as e:
870
+ logging.error(f"FASTA processing error: {e}")
871
+ return f"FASTA processing failed: {str(e)}"
872
 
873
+ # --- Tree Building Interface Functions ---
874
+ def build_tree_interface(sequence):
875
+ """Interface function for building phylogenetic trees"""
876
  try:
877
+ if not sequence or len(sequence.strip()) == 0:
878
+ return "Please provide a DNA sequence for tree construction."
879
+
880
+ # Clean sequence
881
+ clean_seq = re.sub(r'[^ATCG]', '', sequence.upper())
882
+
883
+ if len(clean_seq) < 50:
884
+ return "Sequence too short for phylogenetic analysis (minimum 50 bp required)."
885
 
886
+ # Try ML tree construction first
887
+ success, message, aligned_file, tree_file = build_maximum_likelihood_tree(clean_seq)
 
888
 
889
+ result = f"🌳 PHYLOGENETIC TREE CONSTRUCTION\n"
890
+ result += f"Input sequence length: {len(clean_seq)} bp\n"
891
+ result += "=" * 50 + "\n\n"
892
+ result += message
893
+
894
+ if success and tree_file:
895
+ # Try to read and display tree
896
+ try:
897
+ with open(tree_file, 'r') as f:
898
+ tree_content = f.read().strip()
899
+
900
+ result += f"\n\n📄 Tree file content:\n"
901
+ result += f"File: {os.path.basename(tree_file)}\n"
902
+ result += f"Size: {len(tree_content)} characters\n"
903
+
904
+ # Show first part of tree if it's very long
905
+ if len(tree_content) > 500:
906
+ result += f"Preview: {tree_content[:500]}...\n"
907
+ else:
908
+ result += f"Content: {tree_content}\n"
909
+
910
+ except Exception as e:
911
+ result += f"\n⚠️ Could not read tree file: {e}"
912
+
913
+ return result
914
 
915
  except Exception as e:
916
+ logging.error(f"Tree building interface error: {e}")
917
+ return f"Tree construction failed: {str(e)}"
918
 
919
+ def analyze_tree_interface(sequence, similarity_threshold):
920
+ """Interface function for tree analysis with similarity threshold"""
921
  try:
922
+ if not sequence or len(sequence.strip()) == 0:
923
+ return "Please provide a DNA sequence.", None
924
 
925
+ # Clean sequence
926
+ clean_seq = re.sub(r'[^ATCG]', '', sequence.upper())
927
 
928
+ if len(clean_seq) < 20:
929
+ return "Sequence too short for analysis (minimum 20 bp required).", None
930
+
931
+ # Validate similarity threshold
932
+ if not (1 <= similarity_threshold <= 99):
933
+ return "Similarity threshold must be between 1 and 99%.", None
934
+
935
+ # Run tree analysis
936
+ html_content, html_file, success_msg = analyze_sequence_for_tree(
937
+ clean_seq, similarity_threshold
938
+ )
939
+
940
+ if html_content:
941
+ result = f"🌳 PHYLOGENETIC TREE ANALYSIS\n"
942
+ result += f"Input sequence length: {len(clean_seq)} bp\n"
943
+ result += f"Similarity threshold: {similarity_threshold}%\n"
944
+ result += "=" * 50 + "\n\n"
945
+ result += success_msg
946
 
947
+ return result, html_file
 
 
 
948
  else:
949
+ return success_msg or "Tree analysis failed.", None
 
950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
951
  except Exception as e:
952
+ logging.error(f"Tree analysis interface error: {e}")
953
+ return f"Tree analysis failed: {str(e)}", None
954
 
955
  # --- Gradio Interface ---
956
  def create_gradio_interface():
957
+ """Create the Gradio interface"""
958
 
959
+ # Custom CSS for better styling
960
  css = """
961
  .gradio-container {
962
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 
 
 
 
 
963
  }
964
  .output-text {
965
  font-family: 'Courier New', monospace;
966
+ font-size: 12px;
967
+ line-height: 1.4;
968
+ }
969
+ .tab-nav {
970
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
971
  }
972
  """
973
 
974
+ with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
975
+ gr.Markdown("""
976
+ # 🧬 Advanced Gene Analysis Tool
977
+
978
+ This tool provides comprehensive gene sequence analysis including:
979
+ - **Gene Prediction**: Boundary detection and validation
980
+ - **Phylogenetic Analysis**: Tree construction and similarity analysis
981
+ - **File Processing**: Batch analysis of FASTA files
982
+ - **Model Verification**: Multi-model validation pipeline
983
+ """)
984
 
985
  with gr.Tabs():
986
  # Tab 1: Gene Prediction
987
+ with gr.Tab("🔬 Gene Prediction"):
988
+ gr.Markdown("### Predict gene sequences using trained models")
 
989
 
990
  with gr.Row():
991
  with gr.Column(scale=2):
992
+ seq_input = gr.Textbox(
993
  label="DNA Sequence",
994
+ placeholder="Enter DNA sequence (A, T, C, G only)...",
995
  lines=5,
996
  max_lines=10
997
  )
998
+ predict_btn = gr.Button("🚀 Analyze Sequence", variant="primary")
 
 
 
 
999
 
1000
  with gr.Column(scale=3):
1001
  prediction_output = gr.Textbox(
1002
+ label="Analysis Results",
1003
  lines=20,
1004
  max_lines=30,
1005
  elem_classes=["output-text"]
1006
  )
1007
 
1008
  predict_btn.click(
1009
+ fn=predict_gene_sequence,
1010
+ inputs=[seq_input],
1011
+ outputs=[prediction_output]
1012
  )
1013
 
1014
+ # Tab 2: File Processing
1015
+ with gr.Tab("📁 File Processing"):
1016
+ gr.Markdown("### Upload and analyze FASTA files")
 
1017
 
1018
  with gr.Row():
1019
  with gr.Column(scale=1):
1020
+ file_input = gr.File(
1021
+ label="Upload FASTA File",
1022
+ file_types=[".fasta", ".fa", ".fas", ".txt"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1023
  )
1024
+ process_btn = gr.Button("📊 Process File", variant="primary")
1025
 
1026
  with gr.Column(scale=2):
1027
+ file_output = gr.Textbox(
1028
+ label="Processing Results",
1029
+ lines=25,
1030
+ max_lines=35,
 
 
 
 
1031
  elem_classes=["output-text"]
1032
  )
1033
 
1034
+ process_btn.click(
1035
+ fn=process_fasta_file,
1036
+ inputs=[file_input],
1037
+ outputs=[file_output]
1038
  )
1039
 
1040
+ # Tab 3: Phylogenetic Trees
1041
+ with gr.Tab("🌳 Phylogenetic Trees"):
1042
+ gr.Markdown("### Build and analyze phylogenetic trees")
1043
 
1044
+ with gr.Tabs():
1045
+ # Subtab: ML Tree Construction
1046
+ with gr.Tab("Maximum Likelihood Tree"):
1047
+ gr.Markdown("**Build ML tree using MAFFT + IQ-TREE**")
1048
+
1049
+ with gr.Row():
1050
+ with gr.Column(scale=1):
1051
+ ml_seq_input = gr.Textbox(
1052
+ label="DNA Sequence",
1053
+ placeholder="Enter sequence for ML tree construction...",
1054
+ lines=4
1055
+ )
1056
+ ml_tree_btn = gr.Button("🌳 Build ML Tree", variant="primary")
1057
+
1058
+ with gr.Column(scale=2):
1059
+ ml_tree_output = gr.Textbox(
1060
+ label="ML Tree Results",
1061
+ lines=20,
1062
+ elem_classes=["output-text"]
1063
+ )
1064
+
1065
+ ml_tree_btn.click(
1066
+ fn=build_tree_interface,
1067
+ inputs=[ml_seq_input],
1068
+ outputs=[ml_tree_output]
1069
+ )
1070
+
1071
+ # Subtab: Interactive Tree Analysis
1072
+ with gr.Tab("Interactive Analysis"):
1073
+ gr.Markdown("**Analyze sequence similarity with interactive tree**")
1074
+
1075
+ with gr.Row():
1076
+ with gr.Column(scale=1):
1077
+ tree_seq_input = gr.Textbox(
1078
+ label="Query Sequence",
1079
+ placeholder="Enter sequence for tree analysis...",
1080
+ lines=4
1081
+ )
1082
+ similarity_slider = gr.Slider(
1083
+ minimum=1,
1084
+ maximum=99,
1085
+ value=80,
1086
+ step=1,
1087
+ label="Similarity Threshold (%)"
1088
+ )
1089
+ tree_analyze_btn = gr.Button("🔍 Analyze Tree", variant="primary")
1090
+
1091
+ with gr.Column(scale=2):
1092
+ tree_analysis_output = gr.Textbox(
1093
+ label="Tree Analysis Results",
1094
+ lines=15,
1095
+ elem_classes=["output-text"]
1096
+ )
1097
+ tree_file_output = gr.File(
1098
+ label="Interactive Tree File (HTML)"
1099
+ )
1100
+
1101
+ tree_analyze_btn.click(
1102
+ fn=analyze_tree_interface,
1103
+ inputs=[tree_seq_input, similarity_slider],
1104
+ outputs=[tree_analysis_output, tree_file_output]
1105
+ )
1106
+
1107
+ # Tab 4: Model Information
1108
+ with gr.Tab("ℹ️ Model Information"):
1109
+ gr.Markdown("""
1110
+ ### Model Status and Information
1111
 
1112
+ **Available Models:**
1113
+ """)
 
 
1114
 
1115
+ # Model status
1116
+ model_status = []
1117
+ if boundary_model:
1118
+ model_status.append("✅ Boundary Detection Model: Loaded")
1119
+ else:
1120
+ model_status.append("❌ Boundary Detection Model: Not Available")
1121
+
1122
+ if keras_model:
1123
+ model_status.append("✅ Keras Validation Model: Loaded")
1124
+ else:
1125
+ model_status.append("❌ Keras Validation Model: Not Available")
1126
+
1127
+ if verification_models:
1128
+ model_status.append(f"✅ Verification Models: {len(verification_models)} loaded")
1129
+ for model_name in verification_models.keys():
1130
+ model_status.append(f" - {model_name}")
1131
+ else:
1132
+ model_status.append("❌ Verification Models: None loaded")
1133
+
1134
+ if analyzer:
1135
+ model_status.append("✅ Tree Analyzer: Initialized")
1136
+ else:
1137
+ model_status.append("❌ Tree Analyzer: Not Available")
1138
+
1139
+ # Check external tools
1140
+ mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
1141
+ if mafft_available:
1142
+ model_status.append(f"✅ MAFFT: Available ({mafft_cmd})")
1143
+ else:
1144
+ model_status.append("❌ MAFFT: Not Available")
1145
+
1146
+ if iqtree_available:
1147
+ model_status.append(f"✅ IQ-TREE: Available ({iqtree_cmd})")
1148
+ else:
1149
+ model_status.append("❌ IQ-TREE: Not Available")
1150
+
1151
+ gr.Markdown("\n".join(model_status))
1152
+
1153
+ gr.Markdown("""
1154
+ ### Usage Guidelines:
1155
+
1156
+ 1. **Gene Prediction**: Input DNA sequences containing only A, T, C, G characters
1157
+ 2. **File Processing**: Upload FASTA files with multiple sequences
1158
+ 3. **ML Trees**: Requires MAFFT and IQ-TREE installation
1159
+ 4. **Interactive Trees**: Uses simplified clustering for quick analysis
1160
+
1161
+ ### System Requirements:
1162
+ - Python 3.8+
1163
+ - TensorFlow/Keras for neural network models
1164
+ - PyTorch for boundary detection
1165
+ - MAFFT and IQ-TREE for phylogenetic analysis (optional)
1166
+ """)
1167
+
1168
+ return interface
1169
 
1170
  # --- Main Application ---
1171
  if __name__ == "__main__":
1172
+ # Initialize logging
1173
+ logging.basicConfig(
1174
+ level=logging.INFO,
1175
+ format='%(asctime)s - %(levelname)s - %(message)s',
1176
+ handlers=[
1177
+ logging.FileHandler('gene_analysis.log'),
1178
+ logging.StreamHandler(sys.stdout)
1179
+ ]
1180
+ )
1181
+
1182
+ # Create output directories
1183
+ os.makedirs("output", exist_ok=True)
1184
+ os.makedirs("ml_tree_output", exist_ok=True)
1185
+
1186
+ # Log startup information
1187
+ logging.info("Starting Gene Analysis Tool")
1188
+ logging.info(f"Boundary model loaded: {boundary_model is not None}")
1189
+ logging.info(f"Keras model loaded: {keras_model is not None}")
1190
+ logging.info(f"Verification models loaded: {len(verification_models) if verification_models else 0}")
1191
+ logging.info(f"Tree analyzer initialized: {analyzer is not None}")
1192
+
1193
+ # Check external tools
1194
+ mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
1195
+ logging.info(f"MAFFT available: {mafft_available}")
1196
+ logging.info(f"IQ-TREE available: {iqtree_available}")
1197
+
1198
+ # Create and launch interface
1199
  try:
1200
+ interface = create_gradio_interface()
1201
 
1202
+ # Launch with appropriate settings
1203
+ interface.launch(
1204
+ share=False, # Set to True if you want a public link
1205
+ server_name="0.0.0.0", # Allow external connections
 
 
1206
  server_port=7860,
1207
+ show_error=True,
1208
+ debug=True
 
1209
  )
1210
 
1211
  except Exception as e:
1212
+ logging.error(f"Failed to launch interface: {e}")
1213
+ import traceback
1214
+ logging.error(f"Full traceback: {traceback.format_exc()}")
1215
+ sys.exit(1)