|
|
import html
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
|
|
|
|
|
|
|
|
# Contact address attached to Entrez requests made through Biopython.
# NOTE(review): NCBI asks E-utilities clients to identify themselves; confirm
# this address is the intended contact before deploying.
Entrez.email = "nate@wands.ai"
|
|
|
|
|
def fetch_sequence_from_ncbi(accession):
    """Fetch a nucleotide sequence from NCBI as a plain string.

    Requests the record from the ``nucleotide`` database in FASTA text form
    and parses it as a single FASTA record.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The sequence as ``str``, or ``None`` if the request or the parsing
        failed for any reason (best-effort lookup; callers test for falsy).
    """
    # ``except Exception`` rather than a bare ``except`` so that
    # KeyboardInterrupt / SystemExit and other BaseException-derived control
    # signals are not silently swallowed.
    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
    except Exception:
        return None

    try:
        record = SeqIO.read(handle, "fasta")
    except Exception:
        return None
    finally:
        # Always release the network handle, whether parsing succeeded or not.
        handle.close()

    return str(record.seq)
|
|
|
|
|
def calculate_gc_content(seq):
    """Return the percentage of G and C bases in ``seq``.

    Counting is case-insensitive so lower-case (e.g. soft-masked) sequences
    are handled the same as upper-case ones.

    Args:
        seq: Any sequence-like object supporting ``len()`` and ``.count()``
            (a ``str`` or a Biopython ``Seq`` in this file).

    Returns:
        GC percentage in the range 0-100, or ``0`` for an empty sequence.
    """
    total_count = len(seq)
    if total_count == 0:
        # Avoid division by zero on empty input.
        return 0

    gc_count = sum(seq.count(base) for base in ("G", "C", "g", "c"))
    return (gc_count / total_count) * 100
|
|
|
|
|
def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
    """Locate stretches whose sliding-window GC content exceeds a threshold.

    A window of ``window_size`` bases is slid one base at a time along
    ``seq``; consecutive windows whose GC percentage is strictly greater
    than ``gc_threshold`` are merged into one region.

    Args:
        seq: Sequence to scan (sliceable, with ``len()``).
        window_size: Number of bases per window.
        gc_threshold: GC percentage a window must exceed to qualify.

    Returns:
        A tuple ``(regulatory_regions, gc_content)`` where
        ``regulatory_regions`` is a list of ``(start, end)`` index pairs
        (``start`` inclusive, ``end`` exclusive) and ``gc_content`` is the
        list of per-window GC percentages, indexed by window start position.
        Both lists are empty when ``seq`` is shorter than ``window_size``.
    """
    window_count = len(seq) - window_size + 1  # <= 0 yields no windows
    gc_content = [
        calculate_gc_content(seq[i:i + window_size])
        for i in range(window_count)
    ]

    regulatory_regions = []
    region_start = None  # start index of the currently open region, if any
    for i, gc in enumerate(gc_content):
        if gc > gc_threshold:
            if region_start is None:
                region_start = i
        elif region_start is not None:
            # Window i failed, so the last qualifying window is i - 1, which
            # covers bases up to (i - 1 + window_size), exclusive. Using
            # i + window_size here would include one base that belongs only
            # to the failing window.
            regulatory_regions.append((region_start, i - 1 + window_size))
            region_start = None

    if region_start is not None:
        # The region runs through the final window, which ends at len(seq).
        regulatory_regions.append((region_start, len(seq)))

    return regulatory_regions, gc_content
|
|
|
|
|
def analyze_dark_matter(sequence):
    """Compute summary statistics and candidate regulatory regions.

    The input is upper-cased before analysis so that motif matching and GC
    counting do not miss lower-case bases. Upper-casing preserves length, so
    returned indices still refer to positions in ``sequence``. Only the given
    strand is searched for motifs.

    Args:
        sequence: DNA sequence as a string.
            NOTE(review): whitespace is not removed here; callers are
            presumably expected to pass a cleaned sequence.

    Returns:
        A 6-tuple ``(length, gc_content, tata_box, caat_box,
        regulatory_regions, gc_distribution)``:
        sequence length, overall GC percentage, count of ``TATAAA``
        occurrences, count of ``CCAAT`` occurrences, and the two values
        returned by ``find_potential_regulatory_regions``.
    """
    seq = Seq(sequence.upper())

    # Basic composition.
    length = len(seq)
    gc_content = calculate_gc_content(seq)

    # Promoter-element motif counts (as reported by ``Seq.count``).
    tata_box = seq.count("TATAAA")
    caat_box = seq.count("CCAAT")

    # Sliding-window GC scan.
    regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)

    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
|
|
|
|
|
def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Plot per-window GC content with a horizontal threshold line.

    Args:
        gc_distribution: Sequence of GC percentages, one per window, plotted
            against their index.
        gc_threshold: GC percentage at which to draw the dashed red reference
            line. Defaults to 60, matching the default threshold of
            ``find_potential_regulatory_regions``.

    Returns:
        The matplotlib ``Figure`` containing the plot. The caller owns the
        figure and is responsible for rendering and closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold}%)',
    )
    ax.legend()
    return fig
|
|
|
|
|
|
|
|
def _clean_sequence(raw):
    """Return ``raw`` with all whitespace removed and upper-cased ('' if falsy)."""
    if not raw:
        return ""
    return "".join(raw.split()).upper()


# ---------------------------------------------------------------------------
# Streamlit page: collect a sequence (pasted or fetched), then analyze it.
# ---------------------------------------------------------------------------
st.title("Gene Sequence Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Bind ``sequence`` up front so the "Analyze" branch never hits a NameError
# when the NCBI option is selected but no accession has been entered yet.
sequence = ""

if sequence_input == "Enter sequence":
    # Pasted text usually contains line breaks; strip them so they are not
    # counted as bases and do not split motifs.
    sequence = _clean_sequence(
        st.text_area("Paste your DNA sequence here", height=150)
    )
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = _clean_sequence(fetch_sequence_from_ncbi(accession))
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

if st.button("Analyze"):
    if sequence:
        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # Streamlit re-runs this script on every interaction; close the figure
        # so pyplot does not keep accumulating open figures.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        st.subheader("Sequence Visualization")
        # Mark each position at most once so overlapping regions do not
        # produce nested <span> elements.
        is_highlighted = [False] * len(sequence)
        for start, end in regulatory_regions:
            for i in range(start, min(end, len(sequence))):
                is_highlighted[i] = True

        # The sequence is user-supplied and rendered with unsafe_allow_html,
        # so every character is HTML-escaped before being emitted.
        rendered = []
        for base, marked in zip(sequence, is_highlighted):
            safe_base = html.escape(base)
            if marked:
                rendered.append(f"<span style='background-color: yellow'>{safe_base}</span>")
            else:
                rendered.append(safe_base)

        st.markdown("".join(rendered), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")