import html
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq

# Contact email attached to the NCBI Entrez requests made through Biopython.
# Replace this address with your own before running or deploying the app.
Entrez.email = "nate@wands.ai"  

def fetch_sequence_from_ncbi(accession):
    """Download a nucleotide record from NCBI and return its sequence.

    Args:
        accession: Identifier of the record to fetch from the NCBI
            ``nucleotide`` database (passed to ``Entrez.efetch`` as ``id``).

    Returns:
        The record's sequence as a plain ``str``, or ``None`` if the request
        or the FASTA parsing fails for any reason.
    """
    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the network handle even when parsing raises.
            handle.close()
    except Exception:
        # Best-effort lookup: callers only distinguish success from failure.
        # Catching ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    return str(record.seq)

def calculate_gc_content(seq):
    """Return the percentage of G and C bases in *seq*.

    Counting is case-insensitive, so lowercase (e.g. soft-masked) input is
    scored the same as its upper-case equivalent.

    Args:
        seq: The sequence to score; a ``str`` or any sequence object that
            provides ``upper()``, ``count()`` and ``len()``.

    Returns:
        The GC percentage in the range 0-100, computed over the full length
        of *seq*. An empty sequence yields ``0.0``.
    """
    total_count = len(seq)
    if total_count == 0:
        # Avoid dividing by zero for empty input.
        return 0.0
    normalized = seq.upper()
    gc_count = normalized.count('G') + normalized.count('C')
    return gc_count / total_count * 100

def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
    """Locate GC-rich stretches of *seq* with a sliding window.

    Every window of ``window_size`` consecutive bases is scored with
    ``calculate_gc_content``. Runs of consecutive windows scoring strictly
    above ``gc_threshold`` are merged into a single region.

    Args:
        seq: The sequence to scan (sliceable, with ``len()``).
        window_size: Number of bases per window.
        gc_threshold: GC percentage a window must exceed to qualify.

    Returns:
        A ``(regulatory_regions, gc_content)`` tuple.
        ``regulatory_regions`` is a list of ``(start, end)`` pairs where
        ``start`` is the first base of the first qualifying window and
        ``end`` is the exclusive end of the last qualifying window.
        ``gc_content`` is the list of per-window GC percentages; entry ``i``
        belongs to the window starting at base ``i``. Both are empty when
        *seq* is shorter than ``window_size``.
    """
    gc_content = [
        calculate_gc_content(seq[i:i + window_size])
        for i in range(len(seq) - window_size + 1)
    ]

    regulatory_regions = []
    start = None  # start index of the region currently open, if any
    for i, gc in enumerate(gc_content):
        if gc > gc_threshold:
            if start is None:
                start = i
        elif start is not None:
            # Window i failed, so the last qualifying window began at i - 1
            # and covers bases up to (i - 1) + window_size, exclusive.
            regulatory_regions.append((start, i - 1 + window_size))
            start = None

    if start is not None:
        # The final window qualified; it ends at the end of the sequence.
        regulatory_regions.append((start, len(seq)))

    return regulatory_regions, gc_content

def analyze_dark_matter(sequence):
    """Compute summary statistics and candidate regulatory regions.

    Args:
        sequence: The DNA sequence to analyse, as a string.

    Returns:
        A 6-tuple ``(length, gc_content, tata_box, caat_box,
        regulatory_regions, gc_distribution)``:

        * ``length`` - number of characters in the sequence.
        * ``gc_content`` - overall GC percentage.
        * ``tata_box`` - occurrences of ``TATAAA`` as counted by
          ``Seq.count``.
        * ``caat_box`` - occurrences of ``CCAAT`` as counted by
          ``Seq.count``.
        * ``regulatory_regions`` / ``gc_distribution`` - the two values
          returned by ``find_potential_regulatory_regions`` with its default
          window size and threshold.
    """
    # Upper-case once so motif and GC counting also work on lowercase input.
    # Changing case does not shift positions, so region coordinates still
    # index into the caller's original string.
    seq = Seq(sequence).upper()

    # Basic statistics
    length = len(seq)
    gc_content = calculate_gc_content(seq)

    # Common promoter motifs
    tata_box = seq.count("TATAAA")
    caat_box = seq.count("CCAAT")

    # GC-rich windows that may mark regulatory regions
    regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)

    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution

def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Plot per-window GC percentages with a horizontal threshold line.

    Args:
        gc_distribution: Sequence of GC percentages, one per window, plotted
            against their index.
        gc_threshold: GC percentage at which the dashed red reference line
            is drawn. Defaults to 60.

    Returns:
        The matplotlib ``Figure`` containing the plot. The figure is created
        through ``plt.subplots``, so the caller should close it once it has
        been rendered.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold}%)',
    )
    ax.legend()
    return fig

# Streamlit app: collect a sequence (pasted or fetched from NCBI), then show
# statistics, the GC plot and a highlighted view when "Analyze" is clicked.
st.title("Gene Sequence Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Always bind `sequence` so the Analyze branch cannot raise NameError when
# NCBI mode is selected but no accession has been entered yet.
sequence = None

if sequence_input == "Enter sequence":
    sequence = st.text_area("Paste your DNA sequence here", height=150)
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

if sequence:
    # Pasted text often contains line breaks and spaces; drop them so they
    # are not counted as bases and so region coordinates line up with the
    # string that is highlighted below.
    sequence = "".join(sequence.split())

if st.button("Analyze"):
    if sequence:
        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # Figures made via pyplot stay registered until closed; close this one
        # so repeated analyses do not accumulate open figures.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")
        # Mark positions first so overlapping regions are wrapped only once.
        in_region = [False] * len(sequence)
        for start, end in regulatory_regions:
            for i in range(start, min(end, len(sequence))):
                in_region[i] = True

        # The sequence is user-supplied and rendered as raw HTML, so every
        # character is escaped before being embedded in the markup.
        highlighted_seq = []
        for base, marked in zip(sequence, in_region):
            safe_base = html.escape(base)
            if marked:
                highlighted_seq.append(
                    f"<span style='background-color: yellow'>{safe_base}</span>"
                )
            else:
                highlighted_seq.append(safe_base)

        st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")