"""Streamlit app that reports simple statistics for a DNA sequence.

The sequence is either pasted by the user or fetched from NCBI by accession
number. The app reports length, overall GC content, TATA/CAAT motif counts,
and GC-rich stretches found with a sliding window, then plots the windowed
GC content and renders the sequence with the GC-rich stretches highlighted.
"""

import html
import logging
from io import StringIO
from itertools import groupby
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq

logger = logging.getLogger(__name__)

# Set your email for NCBI Entrez
Entrez.email = "nate@wands.ai"

# Defaults shared by the analysis, the plot and the UI descriptions.
GC_WINDOW_SIZE = 50
GC_THRESHOLD = 60


def fetch_sequence_from_ncbi(accession):
    """Fetch a nucleotide record from NCBI and return its sequence as a string.

    Args:
        accession: NCBI nucleotide accession number.

    Returns:
        The sequence as ``str``, or ``None`` if the request or the FASTA
        parsing failed (the failure is logged).
    """
    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Always release the network handle, even when parsing fails.
            handle.close()
    except Exception:
        # Best-effort lookup: the caller shows an error for ``None``. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        logger.exception("Failed to fetch accession %r from NCBI", accession)
        return None
    return str(record.seq)


def normalize_sequence(raw):
    """Return *raw* as an upper-case sequence without headers or whitespace.

    Lines starting with ``>`` (FASTA headers) are dropped, all whitespace is
    removed and the remaining characters are upper-cased, so that pasted
    multi-line or lower-case input is analysed like a plain sequence.
    """
    body_lines = (
        line for line in raw.splitlines() if not line.lstrip().startswith(">")
    )
    return "".join("".join(body_lines).split()).upper()


def calculate_gc_content(seq):
    """Return the percentage of G and C bases in *seq* (case-insensitive).

    Returns 0 for an empty sequence.
    """
    total_count = len(seq)
    if total_count == 0:
        return 0
    gc_count = sum(seq.count(base) for base in "GCgc")
    return (gc_count / total_count) * 100


def find_potential_regulatory_regions(
    seq, window_size=GC_WINDOW_SIZE, gc_threshold=GC_THRESHOLD
):
    """Locate stretches whose windowed GC content exceeds *gc_threshold*.

    Args:
        seq: DNA sequence (``str`` or ``Seq``).
        window_size: Number of bases per sliding window; must be positive.
        gc_threshold: GC percentage a window must exceed to count as GC-rich.

    Returns:
        A ``(regions, gc_content)`` tuple. ``gc_content[i]`` is the GC
        percentage of ``seq[i:i + window_size]``; it is empty when the
        sequence is shorter than one window. ``regions`` is a list of
        ``(start, end)`` pairs, 0-based with an exclusive end, each covering
        a maximal run of consecutive windows above the threshold.

    Raises:
        ValueError: If *window_size* is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    bases = str(seq).upper()
    window_total = len(bases) - window_size + 1

    gc_content: List[float] = []
    if window_total > 0:
        is_gc = [base in "GC" for base in bases]
        # Running count: each step adds the entering base and drops the
        # leaving one instead of re-counting the whole window.
        gc_count = sum(is_gc[:window_size])
        gc_content.append((gc_count / window_size) * 100)
        for i in range(1, window_total):
            gc_count += is_gc[i + window_size - 1] - is_gc[i - 1]
            gc_content.append((gc_count / window_size) * 100)

    regulatory_regions: List[Tuple[int, int]] = []
    region_start: Optional[int] = None
    for i, gc in enumerate(gc_content):
        if gc > gc_threshold:
            if region_start is None:
                region_start = i
        elif region_start is not None:
            # Window i - 1 was the last one above the threshold and covers
            # bases up to (i - 1 + window_size), exclusive.
            regulatory_regions.append((region_start, i - 1 + window_size))
            region_start = None
    if region_start is not None:
        regulatory_regions.append((region_start, len(bases)))

    return regulatory_regions, gc_content


def analyze_dark_matter(
    sequence, window_size=GC_WINDOW_SIZE, gc_threshold=GC_THRESHOLD
):
    """Compute the statistics shown by the app for *sequence*.

    Returns:
        A tuple ``(length, gc_content, tata_box, caat_box, regulatory_regions,
        gc_distribution)``: sequence length, overall GC percentage, counts of
        the ``TATAAA`` and ``CCAAT`` motifs, and the two results of
        :func:`find_potential_regulatory_regions`.
    """
    # Upper-case so motif matching is not defeated by lower-case input.
    seq = Seq(sequence.upper())

    # Basic statistics
    length = len(seq)
    gc_content = calculate_gc_content(seq)

    # Look for common regulatory motifs
    tata_box = seq.count("TATAAA")
    caat_box = seq.count("CCAAT")

    # Find potential regulatory regions based on GC content
    regulatory_regions, gc_distribution = find_potential_regulatory_regions(
        seq, window_size=window_size, gc_threshold=gc_threshold
    )

    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution


def plot_gc_distribution(gc_distribution, gc_threshold=GC_THRESHOLD):
    """Return a matplotlib figure of the windowed GC content.

    A dashed red horizontal line marks *gc_threshold*. The caller owns the
    figure and should close it once it has been rendered.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold}%)',
    )
    ax.legend()
    return fig


def build_highlighted_html(sequence, regions):
    """Return *sequence* as HTML with *regions* wrapped in yellow spans.

    *regions* holds ``(start, end)`` pairs with an exclusive end; overlapping
    or out-of-range regions are tolerated. The sequence text is HTML-escaped
    because the result is rendered with ``unsafe_allow_html=True``.
    """
    marked = [False] * len(sequence)
    for start, end in regions:
        lo = max(start, 0)
        hi = min(end, len(sequence))
        if hi > lo:
            marked[lo:hi] = [True] * (hi - lo)

    parts = []
    for is_marked, run in groupby(zip(marked, sequence), key=lambda pair: pair[0]):
        text = html.escape("".join(base for _, base in run))
        if is_marked:
            parts.append(f'<span style="background-color: yellow">{text}</span>')
        else:
            parts.append(text)
    return "".join(parts)


# Streamlit app
st.title("Gene Sequence Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Bind ``sequence`` up front so pressing "Analyze" before an accession has
# been entered shows the prompt below instead of raising NameError.
sequence = None
if sequence_input == "Enter sequence":
    sequence = st.text_area("Paste your DNA sequence here", height=150)
else:
    accession = st.text_input("Enter NCBI accession number").strip()
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

if st.button("Analyze"):
    # Analyse and display the same cleaned-up text so that region coordinates
    # line up with the highlighted visualization.
    sequence = normalize_sequence(sequence) if sequence else ""
    if sequence:
        (
            length,
            gc_content,
            tata_box,
            caat_box,
            regulatory_regions,
            gc_distribution,
        ) = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")
        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write(f"*Description: These regions have a GC content above {GC_THRESHOLD}% over a {GC_WINDOW_SIZE} base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # pyplot keeps a global reference to every figure; close it so
        # figures do not pile up across Streamlit reruns.
        plt.close(fig)
        st.write(f"*Description: This plot shows how GC content varies along the sequence. Peaks above the red line ({GC_THRESHOLD}% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")
        st.markdown(
            build_highlighted_html(sequence, regulatory_regions),
            unsafe_allow_html=True,
        )
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")