Spaces:
No application file
No application file
Update app.py
Browse files
app.py
CHANGED
|
@@ -712,383 +712,504 @@ def predict_with_keras(sequence):
|
|
| 712 |
input_arr = np.array([indices])
|
| 713 |
prediction = keras_model.predict(input_arr, verbose=0)[0]
|
| 714 |
|
| 715 |
-
# Format prediction as probabilities/scores
|
| 716 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
return result
|
| 718 |
except Exception as e:
|
| 719 |
-
logging.error(f"Keras prediction
|
| 720 |
return f"Keras prediction failed: {str(e)}"
|
| 721 |
|
| 722 |
-
# --- Boundary
|
| 723 |
-
def
|
| 724 |
try:
|
| 725 |
if not boundary_model:
|
| 726 |
return f"Boundary model not available. Input sequence: {sequence[:100]}..."
|
| 727 |
|
|
|
|
| 728 |
predictions, probabilities, confidence = boundary_model.predict(sequence)
|
| 729 |
|
| 730 |
# Extract gene regions
|
| 731 |
regions = boundary_model.extract_gene_regions(predictions, sequence)
|
| 732 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 733 |
if regions:
|
| 734 |
-
result = f"Confidence: {confidence:.3f}\n"
|
| 735 |
-
result += f"Regions found: {len(regions)}\n"
|
| 736 |
for i, region in enumerate(regions[:3]): # Show first 3 regions
|
| 737 |
-
result += f"
|
| 738 |
-
|
| 739 |
-
result += f"
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
else:
|
| 745 |
-
return f"No gene regions found. Confidence: {confidence:.3f}"
|
| 746 |
-
|
| 747 |
except Exception as e:
|
| 748 |
-
logging.error(f"Boundary
|
| 749 |
-
return f"Boundary
|
| 750 |
|
| 751 |
-
# --- Combined Prediction ---
|
| 752 |
-
def
|
| 753 |
-
"""
|
| 754 |
try:
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
|
|
|
|
|
|
| 758 |
|
| 759 |
if len(sequence) < 10:
|
| 760 |
-
return "
|
| 761 |
|
| 762 |
results = []
|
|
|
|
| 763 |
results.append(f"Input sequence length: {len(sequence)} bp\n")
|
|
|
|
| 764 |
|
| 765 |
-
#
|
| 766 |
if boundary_model:
|
| 767 |
-
results.append("
|
| 768 |
-
boundary_result =
|
| 769 |
results.append(boundary_result)
|
| 770 |
-
|
|
|
|
| 771 |
|
| 772 |
-
#
|
| 773 |
-
if keras_model
|
| 774 |
-
results.append("
|
| 775 |
keras_result = predict_with_keras(sequence)
|
| 776 |
results.append(keras_result)
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
# Run verification pipeline
|
| 780 |
-
verification_results = run_verification_pipeline(sequence)
|
| 781 |
-
if verification_results:
|
| 782 |
-
results.append(format_verification_results(verification_results))
|
| 783 |
|
| 784 |
-
#
|
| 785 |
-
if
|
| 786 |
-
results.append("
|
| 787 |
-
|
|
|
|
|
|
|
| 788 |
|
| 789 |
return "\n".join(results)
|
| 790 |
|
| 791 |
except Exception as e:
|
| 792 |
-
logging.error(f"
|
| 793 |
-
return f"
|
| 794 |
|
| 795 |
# --- File Processing Functions ---
|
| 796 |
-
def process_fasta_file(
|
| 797 |
"""Process uploaded FASTA file"""
|
| 798 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
sequences = {}
|
| 800 |
current_seq = ""
|
| 801 |
current_name = ""
|
| 802 |
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
|
| 818 |
-
return sequences
|
| 819 |
-
except Exception as e:
|
| 820 |
-
logging.error(f"FASTA processing failed: {e}")
|
| 821 |
-
return None
|
| 822 |
-
|
| 823 |
-
def batch_predict_fasta(file_path):
|
| 824 |
-
"""Batch prediction for FASTA file"""
|
| 825 |
-
try:
|
| 826 |
-
sequences = process_fasta_file(file_path)
|
| 827 |
if not sequences:
|
| 828 |
-
return "
|
| 829 |
|
|
|
|
| 830 |
results = []
|
| 831 |
-
results.append(f"
|
|
|
|
|
|
|
| 832 |
|
| 833 |
-
for i, (name,
|
| 834 |
-
if i >=
|
| 835 |
-
results.append(f"... and {len(sequences) -
|
| 836 |
break
|
|
|
|
|
|
|
|
|
|
| 837 |
|
| 838 |
-
|
| 839 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
|
| 841 |
-
|
| 842 |
-
prediction = predict_f_gene(sequence)
|
| 843 |
-
results.append(prediction)
|
| 844 |
-
results.append("-" * 50)
|
| 845 |
|
| 846 |
return "\n".join(results)
|
| 847 |
|
| 848 |
except Exception as e:
|
| 849 |
-
logging.error(f"
|
| 850 |
-
return f"
|
| 851 |
|
| 852 |
-
# ---
|
| 853 |
-
def
|
| 854 |
-
"""
|
| 855 |
try:
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 859 |
|
| 860 |
-
#
|
| 861 |
-
|
| 862 |
-
return "Please enter a DNA sequence (minimum 10 nucleotides) or upload a FASTA file."
|
| 863 |
|
| 864 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 865 |
|
| 866 |
except Exception as e:
|
| 867 |
-
logging.error(f"
|
| 868 |
-
return f"
|
| 869 |
|
| 870 |
-
def
|
| 871 |
-
"""Interface function for
|
| 872 |
try:
|
| 873 |
-
if not sequence or len(sequence.strip())
|
| 874 |
-
return "Please
|
| 875 |
|
| 876 |
-
#
|
| 877 |
-
|
| 878 |
|
| 879 |
-
if
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 885 |
|
| 886 |
-
|
| 887 |
-
return tree_info, html_content, f"{tree_info}\n\n{simple_message}"
|
| 888 |
-
else:
|
| 889 |
-
return tree_info, "", tree_info
|
| 890 |
else:
|
| 891 |
-
|
| 892 |
-
html_content, html_file, simple_message = analyze_sequence_for_tree(sequence, matching_percentage)
|
| 893 |
|
| 894 |
-
if html_content:
|
| 895 |
-
fallback_msg = f"⚠️ ML Tree Construction Issues:\n{ml_message}\n\n"
|
| 896 |
-
fallback_msg += "📊 Simplified Tree Analysis:\n"
|
| 897 |
-
return fallback_msg, html_content, f"{fallback_msg}{simple_message}"
|
| 898 |
-
else:
|
| 899 |
-
return f"❌ Tree construction failed:\n{ml_message}\n\nSimplified analysis: {simple_message}", "", ""
|
| 900 |
-
|
| 901 |
-
except Exception as e:
|
| 902 |
-
error_msg = f"Tree building interface error: {str(e)}"
|
| 903 |
-
logging.error(error_msg)
|
| 904 |
-
return error_msg, "", ""
|
| 905 |
-
|
| 906 |
-
def get_system_status():
|
| 907 |
-
"""Get system status for debugging"""
|
| 908 |
-
try:
|
| 909 |
-
status = []
|
| 910 |
-
status.append("🔧 SYSTEM STATUS:")
|
| 911 |
-
status.append("")
|
| 912 |
-
|
| 913 |
-
# Model status
|
| 914 |
-
status.append("📊 MODELS:")
|
| 915 |
-
status.append(f" - Boundary Model: {'✅ Loaded' if boundary_model else '❌ Not loaded'}")
|
| 916 |
-
status.append(f" - Keras Model: {'✅ Loaded' if keras_model else '❌ Not loaded'}")
|
| 917 |
-
status.append(f" - Tree Analyzer: {'✅ Loaded' if analyzer else '❌ Not loaded'}")
|
| 918 |
-
status.append(f" - Verification Models: {len(verification_models)} loaded")
|
| 919 |
-
status.append("")
|
| 920 |
-
|
| 921 |
-
# Tool availability
|
| 922 |
-
mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
|
| 923 |
-
status.append("🛠️ PHYLOGENETIC TOOLS:")
|
| 924 |
-
status.append(f" - MAFFT: {'✅ Available' if mafft_available else '❌ Not available'}")
|
| 925 |
-
if mafft_available:
|
| 926 |
-
status.append(f" Path: {mafft_cmd}")
|
| 927 |
-
status.append(f" - IQ-TREE: {'✅ Available' if iqtree_available else '❌ Not available'}")
|
| 928 |
-
if iqtree_available:
|
| 929 |
-
status.append(f" Path: {iqtree_cmd}")
|
| 930 |
-
status.append("")
|
| 931 |
-
|
| 932 |
-
# File system
|
| 933 |
-
status.append("📁 FILES:")
|
| 934 |
-
status.append(f" - CSV Data: {'✅ Found' if os.path.exists(csv_path) else '❌ Not found'}")
|
| 935 |
-
status.append(f" - Models Directory: {'✅ Found' if os.path.exists('models') else '❌ Not found'}")
|
| 936 |
-
status.append(f" - Output Directory: {'✅ Created' if os.path.exists('output') else '📝 Will be created'}")
|
| 937 |
-
|
| 938 |
-
return "\n".join(status)
|
| 939 |
-
|
| 940 |
except Exception as e:
|
| 941 |
-
|
|
|
|
| 942 |
|
| 943 |
# --- Gradio Interface ---
|
| 944 |
def create_gradio_interface():
|
| 945 |
-
"""Create the
|
| 946 |
|
| 947 |
-
# Custom CSS
|
| 948 |
css = """
|
| 949 |
.gradio-container {
|
| 950 |
-
|
| 951 |
-
margin: auto;
|
| 952 |
-
}
|
| 953 |
-
.tab-nav button {
|
| 954 |
-
font-size: 16px;
|
| 955 |
-
font-weight: bold;
|
| 956 |
}
|
| 957 |
.output-text {
|
| 958 |
font-family: 'Courier New', monospace;
|
| 959 |
-
font-size:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 960 |
}
|
| 961 |
"""
|
| 962 |
|
| 963 |
-
with gr.Blocks(css=css, title="
|
| 964 |
-
gr.Markdown("
|
| 965 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
|
| 967 |
with gr.Tabs():
|
| 968 |
# Tab 1: Gene Prediction
|
| 969 |
-
with gr.
|
| 970 |
-
gr.Markdown("##
|
| 971 |
-
gr.Markdown("Enter a DNA sequence or upload a FASTA file for F-gene prediction using multiple ML models.")
|
| 972 |
|
| 973 |
with gr.Row():
|
| 974 |
with gr.Column(scale=2):
|
| 975 |
-
|
| 976 |
label="DNA Sequence",
|
| 977 |
-
placeholder="Enter DNA sequence (
|
| 978 |
lines=5,
|
| 979 |
max_lines=10
|
| 980 |
)
|
| 981 |
-
|
| 982 |
-
label="Or Upload FASTA File",
|
| 983 |
-
file_types=[".fasta", ".fa", ".fas", ".fna"]
|
| 984 |
-
)
|
| 985 |
-
predict_btn = gr.Button("🔍 Predict F-Gene", variant="primary")
|
| 986 |
|
| 987 |
with gr.Column(scale=3):
|
| 988 |
prediction_output = gr.Textbox(
|
| 989 |
-
label="
|
| 990 |
lines=20,
|
| 991 |
max_lines=30,
|
| 992 |
elem_classes=["output-text"]
|
| 993 |
)
|
| 994 |
|
| 995 |
predict_btn.click(
|
| 996 |
-
fn=
|
| 997 |
-
inputs=[
|
| 998 |
-
outputs=prediction_output
|
| 999 |
)
|
| 1000 |
|
| 1001 |
-
# Tab 2:
|
| 1002 |
-
with gr.
|
| 1003 |
-
gr.Markdown("##
|
| 1004 |
-
gr.Markdown("Build maximum likelihood phylogenetic trees and perform sequence similarity analysis.")
|
| 1005 |
|
| 1006 |
with gr.Row():
|
| 1007 |
with gr.Column(scale=1):
|
| 1008 |
-
|
| 1009 |
-
label="
|
| 1010 |
-
|
| 1011 |
-
lines=5
|
| 1012 |
-
)
|
| 1013 |
-
similarity_slider = gr.Slider(
|
| 1014 |
-
minimum=70,
|
| 1015 |
-
maximum=99,
|
| 1016 |
-
value=85,
|
| 1017 |
-
step=1,
|
| 1018 |
-
label="Similarity Threshold (%)"
|
| 1019 |
-
)
|
| 1020 |
-
tree_btn = gr.Button("🌳 Build Tree", variant="primary")
|
| 1021 |
-
|
| 1022 |
-
tree_status = gr.Textbox(
|
| 1023 |
-
label="Tree Construction Status",
|
| 1024 |
-
lines=8,
|
| 1025 |
-
elem_classes=["output-text"]
|
| 1026 |
)
|
|
|
|
| 1027 |
|
| 1028 |
with gr.Column(scale=2):
|
| 1029 |
-
|
| 1030 |
-
label="
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
-
tree_info = gr.Textbox(
|
| 1035 |
-
label="Tree Information",
|
| 1036 |
-
lines=5,
|
| 1037 |
elem_classes=["output-text"]
|
| 1038 |
)
|
| 1039 |
|
| 1040 |
-
|
| 1041 |
-
fn=
|
| 1042 |
-
inputs=[
|
| 1043 |
-
outputs=[
|
| 1044 |
)
|
| 1045 |
|
| 1046 |
-
# Tab 3:
|
| 1047 |
-
with gr.
|
| 1048 |
-
gr.Markdown("##
|
| 1049 |
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
|
| 1054 |
-
|
| 1055 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1056 |
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
outputs=status_output
|
| 1060 |
-
)
|
| 1061 |
|
| 1062 |
-
#
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1073 |
|
| 1074 |
# --- Main Application ---
|
| 1075 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1076 |
try:
|
| 1077 |
-
|
| 1078 |
|
| 1079 |
-
#
|
| 1080 |
-
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
demo.launch(
|
| 1084 |
-
server_name="0.0.0.0",
|
| 1085 |
server_port=7860,
|
| 1086 |
-
|
| 1087 |
-
debug=True
|
| 1088 |
-
show_error=True
|
| 1089 |
)
|
| 1090 |
|
| 1091 |
except Exception as e:
|
| 1092 |
-
logging.error(f"Failed to
|
| 1093 |
-
|
| 1094 |
-
|
|
|
|
|
|
| 712 |
input_arr = np.array([indices])
|
| 713 |
prediction = keras_model.predict(input_arr, verbose=0)[0]
|
| 714 |
|
| 715 |
+
# Format prediction as probabilities/scores
|
| 716 |
+
mean_score = np.mean(prediction)
|
| 717 |
+
max_score = np.max(prediction)
|
| 718 |
+
min_score = np.min(prediction)
|
| 719 |
+
|
| 720 |
+
result = f"Keras Model Prediction Results:\n"
|
| 721 |
+
result += f"- Mean Score: {mean_score:.4f}\n"
|
| 722 |
+
result += f"- Max Score: {max_score:.4f}\n"
|
| 723 |
+
result += f"- Min Score: {min_score:.4f}\n"
|
| 724 |
+
result += f"- Total K-mers: {len(kmers)}\n"
|
| 725 |
+
result += f"- Sequence Length: {len(sequence)} bp"
|
| 726 |
+
|
| 727 |
return result
|
| 728 |
except Exception as e:
|
| 729 |
+
logging.error(f"Keras prediction error: {e}")
|
| 730 |
return f"Keras prediction failed: {str(e)}"
|
| 731 |
|
| 732 |
+
# --- Boundary Prediction ---
|
| 733 |
+
def _format_region_confidence(value):
    """Format a per-region confidence to 4 decimals, or 'N/A' if it is not numeric."""
    try:
        return f"{value:.4f}"
    except (TypeError, ValueError):
        # Missing (None) or placeholder ('N/A') confidences cannot take a
        # float format spec; report them as unavailable instead of failing.
        return "N/A"


def predict_with_boundary(sequence):
    """Run the boundary model on ``sequence`` and return a text report.

    Reads the module-level ``boundary_model``. When it is falsy, a notice
    echoing the first 100 characters of the input is returned instead.

    Args:
        sequence: DNA sequence passed unchanged to ``boundary_model.predict``
            and ``boundary_model.extract_gene_regions``.

    Returns:
        str: A report with the overall confidence, the number of detected
        regions and details for at most the first three regions. On any
        exception the error is logged and a "Boundary prediction failed"
        message is returned; the function never raises.
    """
    try:
        if not boundary_model:
            return f"Boundary model not available. Input sequence: {sequence[:100]}..."

        # The per-position probabilities are returned by the model but are
        # not used in the report.
        predictions, _probabilities, confidence = boundary_model.predict(sequence)
        regions = boundary_model.extract_gene_regions(predictions, sequence)

        parts = [
            "Boundary Model Prediction Results:\n",
            f"- Overall Confidence: {confidence:.4f}\n",
            f"- Regions Detected: {len(regions) if regions else 0}\n",
        ]

        if regions:
            # Only the first 3 regions are shown to keep the report short.
            for number, region in enumerate(regions[:3], start=1):
                parts.append(f"\nRegion {number}:\n")
                parts.append(f" - Start: {region['start']}\n")
                parts.append(f" - End: {region['end']}\n")
                parts.append(f" - Length: {len(region['sequence'])} bp\n")
                # A region without a numeric 'confidence' used to raise
                # ValueError here ('N/A' formatted with ':.4f'), which turned
                # the whole report into a failure message.
                shown = _format_region_confidence(region.get('confidence'))
                parts.append(f" - Confidence: {shown}\n")

        return "".join(parts)
    except Exception as e:
        logging.exception("Boundary prediction error: %s", e)
        return f"Boundary prediction failed: {str(e)}"
|
| 760 |
|
| 761 |
+
# --- Combined Prediction Function ---
|
| 762 |
+
def predict_gene_sequence(sequence):
    """Run every available model on one DNA sequence and return a text report.

    The input is upper-cased and reduced to the characters A, T, C and G.
    Lines starting with ``>`` (FASTA headers) are discarded first, so that
    letters in a pasted record name are not mistaken for nucleotides.

    Args:
        sequence: Raw user input; may contain whitespace, line breaks,
            FASTA header lines and non-ATCG characters.

    Returns:
        str: A prompt when the input is empty, a "too short" message when
        fewer than 10 nucleotides remain after cleaning, otherwise the
        combined report. Sections are produced by ``predict_with_boundary``,
        ``predict_with_keras`` and the verification pipeline, depending on
        which of ``boundary_model``, ``keras_model`` and
        ``verification_models`` are truthy. Exceptions are logged and turned
        into a "Gene prediction failed" message.
    """
    try:
        if not sequence or not sequence.strip():
            return "Please provide a DNA sequence."

        # Drop FASTA header lines before filtering: a header such as
        # ">GATTACA sample" would otherwise contribute its A/T/C/G letters.
        payload = "".join(
            line for line in sequence.splitlines()
            if not line.lstrip().startswith(">")
        )
        sequence = re.sub(r'[^ATCG]', '', payload.upper())

        if len(sequence) < 10:
            return "Sequence too short. Please provide at least 10 nucleotides."

        results = [
            "🧬 GENE SEQUENCE ANALYSIS\n",
            f"Input sequence length: {len(sequence)} bp\n",
            "=" * 50,
        ]

        # Boundary model prediction
        if boundary_model:
            results.append("\n🎯 BOUNDARY DETECTION:")
            results.append(predict_with_boundary(sequence))
        else:
            results.append("\n❌ Boundary model not available")

        # Keras model prediction
        if keras_model:
            results.append("\n🔍 KERAS MODEL ANALYSIS:")
            results.append(predict_with_keras(sequence))
        else:
            results.append("\n❌ Keras model not available")

        # Verification models (section is omitted entirely when none are loaded)
        if verification_models:
            results.append("\n🔬 VERIFICATION ANALYSIS:")
            verification_result = run_verification_pipeline(sequence)
            results.append(format_verification_results(verification_result))

        return "\n".join(results)

    except Exception as e:
        logging.exception("Gene prediction error: %s", e)
        return f"Gene prediction failed: {str(e)}"
|
| 807 |
|
| 808 |
# --- File Processing Functions ---
|
| 809 |
+
# Maximum number of FASTA records that are analysed and shown in one report.
_MAX_FASTA_RECORDS = 5


def _parse_fasta(text):
    """Parse FASTA text into an ordered ``{record name: upper-cased sequence}`` dict."""
    records = {}
    name = ""
    chunks = []

    def _store():
        # Records without a name or without any sequence data are ignored.
        seq = "".join(chunks)
        if not (name and seq):
            return
        # Repeated names get a numeric suffix so that a later record does
        # not silently overwrite an earlier one.
        key = name
        suffix = 2
        while key in records:
            key = f"{name}_{suffix}"
            suffix += 1
        records[key] = seq

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line.startswith('>'):
            _store()
            name = line[1:]  # Remove '>'
            chunks = []
        else:
            chunks.append(line.upper())
    _store()  # Add last sequence
    return records


def process_fasta_file(file):
    """Analyse the sequences of an uploaded FASTA file and return a text report.

    Args:
        file: ``None``, a filesystem path (``str`` / ``os.PathLike``), or an
            object whose ``name`` attribute holds the path of the upload.

    Returns:
        str: A prompt when ``file`` is ``None``, a notice when no record
        could be parsed, otherwise a report covering at most the first
        ``_MAX_FASTA_RECORDS`` records. Each record is reduced to A/T/C/G
        and passed to ``predict_gene_sequence`` when at least 10 nucleotides
        remain. Exceptions are logged and turned into a
        "FASTA processing failed" message.
    """
    try:
        if file is None:
            return "Please upload a FASTA file."

        # Accept both a plain path and an upload object exposing `.name`.
        path = file if isinstance(file, (str, os.PathLike)) else file.name

        # Decode explicitly so the result does not depend on the platform's
        # default encoding; undecodable bytes are replaced, not fatal.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        sequences = _parse_fasta(content)
        if not sequences:
            return "No valid sequences found in FASTA file."

        results = [
            "📁 FASTA FILE ANALYSIS",
            f"Found {len(sequences)} sequences\n",
            "=" * 60,
        ]

        for name, seq in list(sequences.items())[:_MAX_FASTA_RECORDS]:
            results.append(f"\n🧬 Sequence: {name}")
            results.append(f"Length: {len(seq)} bp")

            # Keep only unambiguous nucleotides before running the models.
            clean_seq = re.sub(r'[^ATCG]', '', seq)
            if len(clean_seq) >= 10:
                results.append(predict_gene_sequence(clean_seq))
            else:
                results.append("❌ Sequence too short or invalid")

            results.append("-" * 40)

        hidden = len(sequences) - _MAX_FASTA_RECORDS
        if hidden > 0:
            results.append(f"\n... and {hidden} more sequences")

        return "\n".join(results)

    except Exception as e:
        logging.exception("FASTA processing error: %s", e)
        return f"FASTA processing failed: {str(e)}"
|
| 872 |
|
| 873 |
+
# --- Tree Building Interface Functions ---
|
| 874 |
+
def build_tree_interface(sequence):
    """Build a maximum-likelihood tree for ``sequence`` and return a text report.

    The input is upper-cased and reduced to A/T/C/G. Lines starting with
    ``>`` (FASTA headers) are discarded first so that letters in a record
    name are not counted as nucleotides.

    Args:
        sequence: Raw user input for the query sequence.

    Returns:
        str: A prompt for empty input, a "too short" message when fewer than
        50 nucleotides remain, otherwise a report containing the message
        returned by ``build_maximum_likelihood_tree`` and, on success, a
        preview (first 500 characters) of the tree file. Exceptions are
        logged and turned into a "Tree construction failed" message.
    """
    try:
        if not sequence or not sequence.strip():
            return "Please provide a DNA sequence for tree construction."

        # Strip FASTA header lines before filtering to nucleotides.
        payload = "".join(
            line for line in sequence.splitlines()
            if not line.lstrip().startswith(">")
        )
        clean_seq = re.sub(r'[^ATCG]', '', payload.upper())

        if len(clean_seq) < 50:
            return "Sequence too short for phylogenetic analysis (minimum 50 bp required)."

        # The alignment path is returned too but is not shown in the report.
        success, message, _aligned_file, tree_file = build_maximum_likelihood_tree(clean_seq)

        result = "🌳 PHYLOGENETIC TREE CONSTRUCTION\n"
        result += f"Input sequence length: {len(clean_seq)} bp\n"
        result += "=" * 50 + "\n\n"
        result += message

        if not (success and tree_file):
            return result

        # A tree that cannot be read back is reported as a warning; the
        # construction message above is still returned.
        try:
            with open(tree_file, 'r', encoding='utf-8') as f:
                tree_content = f.read().strip()
        except (OSError, UnicodeError) as read_error:
            return result + f"\n⚠️ Could not read tree file: {read_error}"

        result += "\n\n📄 Tree file content:\n"
        result += f"File: {os.path.basename(tree_file)}\n"
        result += f"Size: {len(tree_content)} characters\n"

        # Show only the first part of the tree if it is very long.
        if len(tree_content) > 500:
            result += f"Preview: {tree_content[:500]}...\n"
        else:
            result += f"Content: {tree_content}\n"

        return result

    except Exception as e:
        logging.exception("Tree building interface error: %s", e)
        return f"Tree construction failed: {str(e)}"
|
| 918 |
|
| 919 |
+
def analyze_tree_interface(sequence, similarity_threshold):
    """Run the similarity-based tree analysis and return ``(report, html_file)``.

    The input is upper-cased and reduced to A/T/C/G. Lines starting with
    ``>`` (FASTA headers) are discarded first so that letters in a record
    name are not counted as nucleotides.

    Args:
        sequence: Raw user input for the query sequence.
        similarity_threshold: Percentage between 1 and 99 (inclusive),
            forwarded to ``analyze_sequence_for_tree``.

    Returns:
        tuple: ``(text, html_file)``. ``html_file`` is ``None`` whenever the
        input is rejected, the analysis yields no HTML content, or an
        exception occurs (which is logged and reported in ``text``).
    """
    try:
        if not sequence or not sequence.strip():
            return "Please provide a DNA sequence.", None

        # Strip FASTA header lines before filtering to nucleotides.
        payload = "".join(
            line for line in sequence.splitlines()
            if not line.lstrip().startswith(">")
        )
        clean_seq = re.sub(r'[^ATCG]', '', payload.upper())

        if len(clean_seq) < 20:
            return "Sequence too short for analysis (minimum 20 bp required).", None

        # A missing or non-numeric threshold is reported with the same range
        # message instead of surfacing as a generic comparison TypeError.
        try:
            threshold_ok = 1 <= similarity_threshold <= 99
        except TypeError:
            threshold_ok = False
        if not threshold_ok:
            return "Similarity threshold must be between 1 and 99%.", None

        html_content, html_file, success_msg = analyze_sequence_for_tree(
            clean_seq, similarity_threshold
        )

        if not html_content:
            return success_msg or "Tree analysis failed.", None

        result = "🌳 PHYLOGENETIC TREE ANALYSIS\n"
        result += f"Input sequence length: {len(clean_seq)} bp\n"
        result += f"Similarity threshold: {similarity_threshold}%\n"
        result += "=" * 50 + "\n\n"
        result += success_msg

        return result, html_file

    except Exception as e:
        logging.exception("Tree analysis interface error: %s", e)
        return f"Tree analysis failed: {str(e)}", None
|
| 954 |
|
| 955 |
# --- Gradio Interface ---
|
| 956 |
def _model_status_markdown():
    """Return the model and external-tool availability as a Markdown bullet list."""
    bullets = []

    def _add(available, label, ok_text, missing_text="Not Available"):
        mark, text = ("✅", ok_text) if available else ("❌", missing_text)
        bullets.append(f"- {mark} {label}: {text}")

    _add(boundary_model, "Boundary Detection Model", "Loaded")
    _add(keras_model, "Keras Validation Model", "Loaded")

    if verification_models:
        bullets.append(f"- ✅ Verification Models: {len(verification_models)} loaded")
        # Nested bullets: one per loaded verification model.
        bullets.extend(f"  - {model_name}" for model_name in verification_models)
    else:
        bullets.append("- ❌ Verification Models: None loaded")

    _add(analyzer, "Tree Analyzer", "Initialized")

    # Check external tools
    mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
    _add(mafft_available, "MAFFT", f"Available ({mafft_cmd})")
    _add(iqtree_available, "IQ-TREE", f"Available ({iqtree_cmd})")

    return "\n".join(bullets)


def create_gradio_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app with four tabs: gene prediction, FASTA file
    processing, phylogenetic trees (ML tree and interactive analysis
    sub-tabs) and a static model-information page. The model status shown
    on the last tab is computed once, when this function runs.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """

    # Custom CSS for better styling
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        font-size: 12px;
        line-height: 1.4;
    }
    .tab-nav {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    }
    """

    with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
        gr.Markdown(
            "# 🧬 Advanced Gene Analysis Tool\n"
            "\n"
            "This tool provides comprehensive gene sequence analysis including:\n"
            "- **Gene Prediction**: Boundary detection and validation\n"
            "- **Phylogenetic Analysis**: Tree construction and similarity analysis\n"
            "- **File Processing**: Batch analysis of FASTA files\n"
            "- **Model Verification**: Multi-model validation pipeline\n"
        )

        with gr.Tabs():
            # Tab 1: Gene Prediction
            with gr.Tab("🔬 Gene Prediction"):
                gr.Markdown("### Predict gene sequences using trained models")

                with gr.Row():
                    with gr.Column(scale=2):
                        seq_input = gr.Textbox(
                            label="DNA Sequence",
                            placeholder="Enter DNA sequence (A, T, C, G only)...",
                            lines=5,
                            max_lines=10,
                        )
                        predict_btn = gr.Button("🚀 Analyze Sequence", variant="primary")

                    with gr.Column(scale=3):
                        prediction_output = gr.Textbox(
                            label="Analysis Results",
                            lines=20,
                            max_lines=30,
                            elem_classes=["output-text"],
                        )

                predict_btn.click(
                    fn=predict_gene_sequence,
                    inputs=[seq_input],
                    outputs=[prediction_output],
                )

            # Tab 2: File Processing
            with gr.Tab("📁 File Processing"):
                gr.Markdown("### Upload and analyze FASTA files")

                with gr.Row():
                    with gr.Column(scale=1):
                        file_input = gr.File(
                            label="Upload FASTA File",
                            file_types=[".fasta", ".fa", ".fas", ".txt"],
                        )
                        process_btn = gr.Button("📊 Process File", variant="primary")

                    with gr.Column(scale=2):
                        file_output = gr.Textbox(
                            label="Processing Results",
                            lines=25,
                            max_lines=35,
                            elem_classes=["output-text"],
                        )

                process_btn.click(
                    fn=process_fasta_file,
                    inputs=[file_input],
                    outputs=[file_output],
                )

            # Tab 3: Phylogenetic Trees
            with gr.Tab("🌳 Phylogenetic Trees"):
                gr.Markdown("### Build and analyze phylogenetic trees")

                with gr.Tabs():
                    # Subtab: ML Tree Construction
                    with gr.Tab("Maximum Likelihood Tree"):
                        gr.Markdown("**Build ML tree using MAFFT + IQ-TREE**")

                        with gr.Row():
                            with gr.Column(scale=1):
                                ml_seq_input = gr.Textbox(
                                    label="DNA Sequence",
                                    placeholder="Enter sequence for ML tree construction...",
                                    lines=4,
                                )
                                ml_tree_btn = gr.Button("🌳 Build ML Tree", variant="primary")

                            with gr.Column(scale=2):
                                ml_tree_output = gr.Textbox(
                                    label="ML Tree Results",
                                    lines=20,
                                    elem_classes=["output-text"],
                                )

                        ml_tree_btn.click(
                            fn=build_tree_interface,
                            inputs=[ml_seq_input],
                            outputs=[ml_tree_output],
                        )

                    # Subtab: Interactive Tree Analysis
                    with gr.Tab("Interactive Analysis"):
                        gr.Markdown("**Analyze sequence similarity with interactive tree**")

                        with gr.Row():
                            with gr.Column(scale=1):
                                tree_seq_input = gr.Textbox(
                                    label="Query Sequence",
                                    placeholder="Enter sequence for tree analysis...",
                                    lines=4,
                                )
                                similarity_slider = gr.Slider(
                                    minimum=1,
                                    maximum=99,
                                    value=80,
                                    step=1,
                                    label="Similarity Threshold (%)",
                                )
                                tree_analyze_btn = gr.Button("🔍 Analyze Tree", variant="primary")

                            with gr.Column(scale=2):
                                tree_analysis_output = gr.Textbox(
                                    label="Tree Analysis Results",
                                    lines=15,
                                    elem_classes=["output-text"],
                                )
                                tree_file_output = gr.File(
                                    label="Interactive Tree File (HTML)"
                                )

                        tree_analyze_btn.click(
                            fn=analyze_tree_interface,
                            inputs=[tree_seq_input, similarity_slider],
                            outputs=[tree_analysis_output, tree_file_output],
                        )

            # Tab 4: Model Information
            with gr.Tab("ℹ️ Model Information"):
                gr.Markdown(
                    "### Model Status and Information\n"
                    "\n"
                    "**Available Models:**\n"
                )

                # Rendered as a bullet list: gr.Markdown ignores single
                # newlines by default, so plain newline-joined status lines
                # would run together into one paragraph.
                gr.Markdown(_model_status_markdown())

                gr.Markdown(
                    "### Usage Guidelines:\n"
                    "\n"
                    "1. **Gene Prediction**: Input DNA sequences containing only A, T, C, G characters\n"
                    "2. **File Processing**: Upload FASTA files with multiple sequences\n"
                    "3. **ML Trees**: Requires MAFFT and IQ-TREE installation\n"
                    "4. **Interactive Trees**: Uses simplified clustering for quick analysis\n"
                    "\n"
                    "### System Requirements:\n"
                    "- Python 3.8+\n"
                    "- TensorFlow/Keras for neural network models\n"
                    "- PyTorch for boundary detection\n"
                    "- MAFFT and IQ-TREE for phylogenetic analysis (optional)\n"
                )

    return interface
|
| 1169 |
|
| 1170 |
# --- Main Application ---
|
| 1171 |
if __name__ == "__main__":
    # Initialize logging: one copy to a log file, one to stdout.
    # The file handler is opened as UTF-8 because log and status messages in
    # this application contain emoji and other non-ASCII characters.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('gene_analysis.log', encoding='utf-8'),
            logging.StreamHandler(sys.stdout),
        ],
    )

    # Create output directories
    for directory in ("output", "ml_tree_output"):
        os.makedirs(directory, exist_ok=True)

    # Log startup information
    logging.info("Starting Gene Analysis Tool")
    logging.info("Boundary model loaded: %s", boundary_model is not None)
    logging.info("Keras model loaded: %s", keras_model is not None)
    logging.info(
        "Verification models loaded: %s",
        len(verification_models) if verification_models else 0,
    )
    logging.info("Tree analyzer initialized: %s", analyzer is not None)

    # Check external tools
    mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
    logging.info("MAFFT available: %s", mafft_available)
    logging.info("IQ-TREE available: %s", iqtree_available)

    # Create and launch interface
    try:
        interface = create_gradio_interface()

        # NOTE(review): debug=True together with server_name="0.0.0.0"
        # exposes verbose errors to every client that can reach the port;
        # confirm this is intended outside of development.
        interface.launch(
            share=False,  # Set to True if you want a public link
            server_name="0.0.0.0",  # Allow external connections
            server_port=7860,
            show_error=True,
            debug=True,
        )

    except Exception as e:
        # logging.exception records the message and the full traceback.
        logging.exception("Failed to launch interface: %s", e)
        sys.exit(1)
|