File size: 5,348 Bytes
28578a5 bed5689 8d7db0b bed5689 28578a5 8d7db0b ddb223f bed5689 28578a5 8d7db0b 28578a5 ddb223f bed5689 28578a5 bed5689 28578a5 5c01f06 28578a5 bed5689 28578a5 bed5689 28578a5 bed5689 28578a5 ddb223f bed5689 28578a5 bed5689 28578a5 bed5689 28578a5 bed5689 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import html
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
# Contact email attached to the Entrez requests made by this app.
# Module-level configuration: it is set once at import time and used by
# fetch_sequence_from_ncbi() through the shared Bio.Entrez module.
# NOTE(review): this is a personal address hard-coded in source -- replace
# it with your own (or load it from configuration) before deploying.
Entrez.email = "nate@wands.ai"
def fetch_sequence_from_ncbi(accession):
    """Download a nucleotide record from NCBI and return its sequence.

    Args:
        accession: Identifier of the record to fetch from the NCBI
            ``nucleotide`` database (passed to ``Entrez.efetch`` as ``id``).

    Returns:
        The record's sequence as a plain ``str``, or ``None`` if the request
        or the FASTA parsing failed for any reason.
    """
    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the network handle even when parsing raises.
            handle.close()
    except Exception:
        # Best-effort lookup: network failures, unknown accessions and
        # malformed responses are all reported to the caller as ``None``.
        # Catching ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate normally.
        return None
    return str(record.seq)
def calculate_gc_content(seq):
    """Return the GC content of *seq* as a percentage (0-100).

    Counting is case-insensitive, so lowercase bases (for example in
    soft-masked sequences) contribute to the result as well.

    Args:
        seq: A sequence supporting ``len()`` and ``.count(substring)``,
            such as a ``str`` or a Biopython ``Seq``.

    Returns:
        The share of G/C bases relative to the total length, in percent.
        An empty sequence yields ``0`` instead of dividing by zero.
    """
    total_count = len(seq)
    if total_count == 0:
        return 0
    # Count both cases explicitly instead of calling ``seq.upper()`` so the
    # function only relies on ``count`` and ``len``, as the original did.
    gc_count = sum(seq.count(base) for base in "GCgc")
    return (gc_count / total_count) * 100
def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
    """Locate GC-rich stretches of *seq* with a sliding window.

    Every window ``seq[i:i + window_size]`` is scored with
    ``calculate_gc_content``. Consecutive windows whose GC percentage is
    strictly above ``gc_threshold`` are merged into one region.

    Args:
        seq: Sequence to scan (``str`` or Biopython ``Seq``).
        window_size: Number of bases per window.
        gc_threshold: GC percentage a window must exceed to count as GC-rich.

    Returns:
        A tuple ``(regulatory_regions, gc_content)`` where
        ``regulatory_regions`` is a list of ``(start, end)`` base intervals
        (``start`` inclusive, ``end`` exclusive) and ``gc_content`` is the
        list of per-window GC percentages, indexed by window start position.
        Sequences shorter than ``window_size`` produce two empty lists.
    """
    gc_content = [
        calculate_gc_content(seq[i:i + window_size])
        for i in range(len(seq) - window_size + 1)
    ]

    regulatory_regions = []
    region_start = None  # start index of the GC-rich run currently open
    for i, gc in enumerate(gc_content):
        if gc > gc_threshold:
            if region_start is None:
                region_start = i
        elif region_start is not None:
            # Window i is the first one at or below the threshold, so the
            # last GC-rich window is i - 1 and its final base sits at
            # (i - 1) + window_size - 1. The exclusive end is therefore
            # i - 1 + window_size (previously one base too far).
            regulatory_regions.append((region_start, i - 1 + window_size))
            region_start = None

    if region_start is not None:
        # The run extends through the final window, which ends at the last
        # base of the sequence.
        regulatory_regions.append((region_start, len(seq)))

    return regulatory_regions, gc_content
def analyze_dark_matter(sequence):
    """Compute summary statistics and candidate regulatory regions.

    The input is upper-cased before analysis so that lowercase sequences
    are scored the same way as uppercase ones; positions and length are
    unaffected by this normalisation.

    Args:
        sequence: DNA sequence as text.

    Returns:
        A 6-tuple ``(length, gc_content, tata_box, caat_box,
        regulatory_regions, gc_distribution)``:

        * ``length`` -- number of characters in the sequence.
        * ``gc_content`` -- overall GC percentage.
        * ``tata_box`` -- occurrences of ``TATAAA`` as reported by
          ``Seq.count``.
        * ``caat_box`` -- occurrences of ``CCAAT`` as reported by
          ``Seq.count``.
        * ``regulatory_regions`` / ``gc_distribution`` -- the two values
          returned by ``find_potential_regulatory_regions`` with its
          default window size and threshold.
    """
    # Motif and GC matching below is case-sensitive, so normalise first.
    seq = Seq(str(sequence).upper())

    # Basic statistics
    length = len(seq)
    gc_content = calculate_gc_content(seq)

    # Look for common regulatory motifs
    tata_box = seq.count("TATAAA")
    caat_box = seq.count("CCAAT")

    # Find potential regulatory regions based on GC content
    regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)

    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Plot per-window GC content with a horizontal threshold line.

    Args:
        gc_distribution: Sequence of GC percentages, one per window, in
            window-start order.
        gc_threshold: GC percentage at which the dashed red reference line
            is drawn. Defaults to 60, matching the default threshold of
            ``find_potential_regulatory_regions``.

    Returns:
        The Matplotlib ``Figure`` containing the plot. The caller owns the
        figure and is responsible for rendering and closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    # ``:g`` keeps the default label identical to the previous hard-coded
    # text ("GC Threshold (60%)") while still formatting other values.
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold:g}%)',
    )
    ax.legend()
    return fig
# Streamlit app: collect a sequence (pasted or fetched from NCBI), analyse it
# on demand and render the statistics, GC plot and highlighted sequence.
st.title("Gene Sequence Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Always bind ``sequence`` so the "Analyze" handler below cannot raise a
# NameError when the NCBI option is selected but no accession was entered.
sequence = None
if sequence_input == "Enter sequence":
    sequence = st.text_area("Paste your DNA sequence here", height=150)
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

if sequence:
    # Pasted sequences are often wrapped over several lines; remove all
    # whitespace so it is not counted as bases or allowed to split motifs.
    sequence = "".join(sequence.split())

if st.button("Analyze"):
    if sequence:
        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")
        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # The script reruns on every interaction; close the figure so pyplot
        # does not keep accumulating open figures.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")
        # Mark every base covered by at least one region (regions may overlap).
        highlight = [False] * len(sequence)
        for start, end in regulatory_regions:
            for i in range(start, min(end, len(sequence))):
                highlight[i] = True

        # Emit one <span> per contiguous highlighted run. The text is
        # user-supplied and rendered with unsafe_allow_html, so escape it.
        parts = []
        run_start = 0
        for i in range(1, len(sequence) + 1):
            if i == len(sequence) or highlight[i] != highlight[run_start]:
                chunk = html.escape(sequence[run_start:i])
                if highlight[run_start]:
                    chunk = f"<span style='background-color: yellow'>{chunk}</span>"
                parts.append(chunk)
                run_start = i
        st.markdown("".join(parts), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")