# GenSeq / app.py
# Hosting-page metadata captured with the file (not part of the program):
#   Accelernate's picture; commit "Update app.py" (5c01f06, verified)
import html
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
# Set your email for NCBI Entrez.
# This module-level setting is read by the Entrez calls made in
# fetch_sequence_from_ncbi; replace it with your own contact address.
Entrez.email = "nate@wands.ai"
def fetch_sequence_from_ncbi(accession):
    """Fetch a nucleotide sequence from NCBI as a plain string.

    Args:
        accession: Identifier passed to the NCBI "nucleotide" database.

    Returns:
        The sequence text of the single FASTA record returned for
        ``accession``, or ``None`` if the request or parsing failed.
    """
    handle = None
    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
        record = SeqIO.read(handle, "fasta")
        return str(record.seq)
    except Exception:
        # Best-effort lookup: the caller treats None as "could not fetch".
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return None
    finally:
        # Always release the network handle, even when parsing fails.
        if handle is not None:
            handle.close()
def calculate_gc_content(seq):
    """Return the percentage (0-100) of G and C bases in a sequence.

    Counting is case-insensitive, so lowercase input is handled the same
    way as uppercase input.

    Args:
        seq: A nucleotide sequence; anything whose ``str()`` is the sequence
            text (a plain string or a Biopython ``Seq``).

    Returns:
        The GC percentage as a float, or ``0`` for an empty sequence.
    """
    bases = str(seq).upper()
    if not bases:
        # Avoid dividing by zero on empty input.
        return 0
    gc_count = bases.count("G") + bases.count("C")
    return (gc_count / len(bases)) * 100
def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
    """Locate stretches whose sliding-window GC content exceeds a threshold.

    A window of ``window_size`` bases is slid along the sequence one base at
    a time. Consecutive windows whose GC percentage is strictly greater than
    ``gc_threshold`` are merged into one region.

    Args:
        seq: Nucleotide sequence (plain string or any object whose ``str()``
            is the sequence text). Matching is case-insensitive.
        window_size: Number of bases per window; must be positive.
        gc_threshold: GC percentage a window must exceed to qualify.

    Returns:
        A tuple ``(regulatory_regions, gc_content)`` where
        ``regulatory_regions`` is a list of ``(start, end)`` index pairs
        (``end`` exclusive: one past the last base of the last qualifying
        window) and ``gc_content`` is the list of per-window GC percentages,
        indexed by window start position. Both lists are empty when the
        sequence is shorter than ``window_size``.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    bases = str(seq).upper()
    window_count = len(bases) - window_size + 1

    gc_content = []
    if window_count > 0:
        # Keep a running G/C count instead of re-slicing and re-counting each
        # window: one base enters and one base leaves per step.
        is_gc = [base in "GC" for base in bases]
        gc_count = sum(is_gc[:window_size])
        gc_content.append((gc_count / window_size) * 100)
        for i in range(1, window_count):
            gc_count += is_gc[i + window_size - 1] - is_gc[i - 1]
            gc_content.append((gc_count / window_size) * 100)

    regulatory_regions = []
    start = None  # start index of the region currently open, if any
    for i, gc in enumerate(gc_content):
        if gc > gc_threshold:
            if start is None:
                start = i
        elif start is not None:
            # Window i is the first that failed; the last qualifying window
            # began at i - 1, so the region stops at (i - 1) + window_size.
            regulatory_regions.append((start, i - 1 + window_size))
            start = None
    if start is not None:
        # The final window qualified, so the region runs to the sequence end.
        regulatory_regions.append((start, len(bases)))
    return regulatory_regions, gc_content
def analyze_dark_matter(sequence):
    """Compute summary statistics and candidate regulatory regions.

    Args:
        sequence: DNA sequence as a string. It is upper-cased before
            analysis so lowercase input is matched by the motif searches;
            upper-casing does not change its length.

    Returns:
        A 6-tuple ``(length, gc_content, tata_box, caat_box,
        regulatory_regions, gc_distribution)``:
        sequence length, overall GC percentage, number of "TATAAA" matches,
        number of "CCAAT" matches, and the two values returned by
        ``find_potential_regulatory_regions`` with its default settings.
    """
    # Normalise case once so every check below sees the same text.
    seq = Seq(sequence.upper())

    # Basic statistics
    length = len(seq)
    gc_content = calculate_gc_content(seq)

    # Look for common regulatory motifs
    tata_box = seq.count("TATAAA")
    caat_box = seq.count("CCAAT")

    # Find potential regulatory regions based on GC content
    regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)

    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Build a line plot of per-window GC content with a threshold marker.

    Args:
        gc_distribution: Sequence of GC percentages, one per window.
        gc_threshold: Percentage at which the dashed red reference line is
            drawn. Defaults to 60, the default threshold used by
            ``find_potential_regulatory_regions``.

    Returns:
        The matplotlib figure. It is created through pyplot, so the caller
        is responsible for closing it once it has been rendered.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold}%)',
    )
    ax.legend()
    return fig
# Streamlit app: collect a sequence (pasted or fetched), then analyse it on demand.
st.title("Gene Sequence Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Bind `sequence` up front so the Analyze handler never raises NameError when
# the NCBI option is selected but no accession has been entered yet.
sequence = ""
if sequence_input == "Enter sequence":
    sequence = st.text_area("Paste your DNA sequence here", height=150)
else:
    accession = st.text_input("Enter NCBI accession number").strip()
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

# Pasted sequences usually contain line breaks and spaces; drop them so they
# are not counted as bases, and upper-case so lowercase input is analysed.
# `or ""` covers the None returned by a failed fetch.
sequence = "".join((sequence or "").split()).upper()

if st.button("Analyze"):
    if sequence:
        (
            length,
            gc_content,
            tata_box,
            caat_box,
            regulatory_regions,
            gc_distribution,
        ) = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # pyplot keeps every figure it creates alive until it is closed;
        # without this, figures accumulate across Streamlit reruns.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")

        # Mark every base covered by at least one region. Using a mask means
        # overlapping regions are highlighted once instead of nesting spans.
        seq_len = len(sequence)
        in_region = [False] * seq_len
        for start, end in regulatory_regions:
            stop = min(end, seq_len)
            if stop > start:
                in_region[start:stop] = [True] * (stop - start)

        # Emit one chunk per run of equal highlight state rather than one
        # <span> per base, and escape the text because it is user-supplied
        # and rendered with unsafe_allow_html=True.
        chunks = []
        run_start = 0
        for i in range(1, seq_len + 1):
            if i == seq_len or in_region[i] != in_region[run_start]:
                text = html.escape(sequence[run_start:i])
                if in_region[run_start]:
                    text = f"<span style='background-color: yellow'>{text}</span>"
                chunks.append(text)
                run_start = i

        st.markdown("".join(chunks), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")