Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,20 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import numpy as np
|
|
|
|
|
|
|
| 3 |
from Bio.Seq import Seq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
def calculate_gc_content(seq):
|
| 6 |
gc_count = seq.count('G') + seq.count('C')
|
|
@@ -27,7 +41,7 @@ def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
|
|
| 27 |
if in_region:
|
| 28 |
regulatory_regions.append((start, len(seq)))
|
| 29 |
|
| 30 |
-
return regulatory_regions
|
| 31 |
|
| 32 |
def analyze_dark_matter(sequence):
|
| 33 |
seq = Seq(sequence)
|
|
@@ -41,34 +55,75 @@ def analyze_dark_matter(sequence):
|
|
| 41 |
caat_box = seq.count("CCAAT")
|
| 42 |
|
| 43 |
# Find potential regulatory regions based on GC content
|
| 44 |
-
regulatory_regions = find_potential_regulatory_regions(seq)
|
| 45 |
|
| 46 |
-
return length, gc_content, tata_box, caat_box, regulatory_regions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# Streamlit app
|
| 49 |
st.title("Genomic Dark Matter Analyzer")
|
| 50 |
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
if st.button("Analyze"):
|
| 54 |
if sequence:
|
| 55 |
-
length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
st.write(f"Sequence Length: {length}")
|
| 58 |
-
st.write(
|
| 59 |
-
|
| 60 |
-
st.write(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
st.subheader("Potential Regulatory Regions (based on GC content):")
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Visualize the sequence with highlighted regions
|
|
|
|
| 67 |
highlighted_seq = list(sequence)
|
| 68 |
for start, end in regulatory_regions:
|
| 69 |
for i in range(start, min(end, len(highlighted_seq))):
|
| 70 |
highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"
|
| 71 |
|
| 72 |
st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
|
|
|
|
| 73 |
else:
|
| 74 |
-
st.write("Please enter a DNA sequence.")
|
|
|
|
| 1 |
import html
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
|
| 7 |
+
|
| 8 |
+
# Set your email for NCBI Entrez
|
| 9 |
+
Entrez.email = "nate@wands.ai"
|
| 10 |
+
|
| 11 |
+
def fetch_sequence_from_ncbi(accession):
    """Fetch a nucleotide sequence from NCBI Entrez as a plain string.

    Requests the FASTA record for ``accession`` from the ``nucleotide``
    database and returns the sequence of the single record it contains.

    Args:
        accession: Identifier passed as ``id`` to ``Entrez.efetch``.

    Returns:
        The sequence as a ``str``, or ``None`` when ``accession`` is empty
        or the request/parse fails.
    """
    if not accession:
        # Nothing to look up; skip the network round-trip entirely.
        return None

    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the handle even when parsing the response fails.
            handle.close()
    except Exception:
        # Best-effort lookup: callers treat None as "could not fetch".
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

    return str(record.seq)
|
| 18 |
|
| 19 |
def calculate_gc_content(seq):
|
| 20 |
gc_count = seq.count('G') + seq.count('C')
|
|
|
|
| 41 |
if in_region:
|
| 42 |
regulatory_regions.append((start, len(seq)))
|
| 43 |
|
| 44 |
+
return regulatory_regions, gc_content
|
| 45 |
|
| 46 |
def analyze_dark_matter(sequence):
|
| 47 |
seq = Seq(sequence)
|
|
|
|
| 55 |
caat_box = seq.count("CCAAT")
|
| 56 |
|
| 57 |
# Find potential regulatory regions based on GC content
|
| 58 |
+
regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)
|
| 59 |
|
| 60 |
+
return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
|
| 61 |
+
|
| 62 |
+
def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Build a line plot of GC-content values with a threshold marker.

    Args:
        gc_distribution: Sequence of GC-content percentages; values are
            plotted in order against their index.
        gc_threshold: Percentage at which the dashed red reference line is
            drawn. Defaults to 60, the same default used by
            ``find_potential_regulatory_regions``.

    Returns:
        The matplotlib figure holding the plot. The caller is responsible
        for rendering and closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    # ``:g`` keeps the label as "60%" for both 60 and 60.0.
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold:g}%)',
    )
    ax.legend()
    return fig
|
| 71 |
|
| 72 |
# Streamlit app
st.title("Genomic Dark Matter Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Always bind `sequence` so the later `if sequence:` check cannot raise
# NameError when the NCBI option is selected but no accession was entered.
sequence = None

if sequence_input == "Enter sequence":
    pasted = st.text_area("Paste your DNA sequence here", height=150)
    # Pasted text often contains line breaks, spaces, or lowercase bases.
    # The motif and GC counts in this file use case-sensitive `count` calls,
    # so normalise to one uppercase run of characters before analysis.
    sequence = "".join(pasted.split()).upper()
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")
|
| 87 |
|
| 88 |
# Run the analysis and render every result section when the user clicks
# "Analyze"; with no usable sequence, show a prompt instead.
if st.button("Analyze"):
    if sequence:
        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # Streamlit re-executes this script on every interaction; close the
        # figure so pyplot does not accumulate one open figure per rerun.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")
        # The sequence is user-supplied and rendered with
        # unsafe_allow_html=True, so escape each character before wrapping
        # it in markup; otherwise pasted "<...>" text is emitted as raw HTML.
        highlighted_seq = [html.escape(base) for base in sequence]
        for start, end in regulatory_regions:
            # Clamp to the sequence length so a region ending at or past the
            # last base cannot index out of range.
            for i in range(start, min(end, len(highlighted_seq))):
                highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"

        st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")
|