File size: 5,348 Bytes
28578a5
 
bed5689
 
8d7db0b
bed5689
 
 
 
 
 
 
 
 
 
 
 
28578a5
8d7db0b
 
 
 
 
ddb223f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bed5689
28578a5
 
 
 
 
 
8d7db0b
28578a5
 
 
 
 
ddb223f
bed5689
28578a5
bed5689
 
 
 
 
 
 
 
 
 
 
28578a5
 
5c01f06
28578a5
bed5689
 
 
 
 
 
 
 
 
 
 
 
28578a5
 
 
bed5689
 
 
28578a5
bed5689
 
 
 
 
 
 
 
 
 
 
28578a5
ddb223f
bed5689
 
 
 
 
 
 
 
 
 
 
28578a5
 
bed5689
28578a5
 
 
 
 
 
bed5689
28578a5
bed5689
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import html
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq

# Set your email for NCBI Entrez
# This module-level setting is picked up by the Entrez.efetch call in
# fetch_sequence_from_ncbi. Replace the address with your own contact
# email before deploying this app.
Entrez.email = "nate@wands.ai"  

def fetch_sequence_from_ncbi(accession):
    """Download a nucleotide record from NCBI and return its sequence.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id`` against
            the ``nucleotide`` database. Surrounding whitespace is ignored.

    Returns:
        The sequence of the single FASTA record as a ``str``, or ``None``
        when the accession is blank or the download/parse fails for any
        reason (this function is deliberately best-effort and never raises
        ordinary errors to the caller).
    """
    if isinstance(accession, str):
        accession = accession.strip()
    # A blank identifier can never resolve; skip the network round-trip.
    if not accession:
        return None

    try:
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the underlying connection even if parsing fails.
            handle.close()
        return str(record.seq)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit (and other BaseException-derived
        # control-flow signals) still propagate.
        return None

def calculate_gc_content(seq):
    """Return the percentage of G and C bases in *seq*.

    Counting is case-insensitive, so lower-case sequences are measured the
    same way as upper-case ones.

    Args:
        seq: Any sequence-like object exposing ``len()`` and ``.count()``
            (the file passes both ``str`` and ``Bio.Seq.Seq`` values).

    Returns:
        GC percentage in the range 0-100, or ``0`` for an empty sequence.
    """
    total_count = len(seq)
    if total_count == 0:
        return 0
    # Count each case explicitly instead of calling ``.upper()`` so the
    # function only relies on ``.count()`` being available.
    gc_count = sum(seq.count(base) for base in "GCgc")
    return (gc_count / total_count) * 100

def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
    """Locate stretches whose sliding-window GC content exceeds a threshold.

    A window of ``window_size`` bases is slid along the sequence one base at
    a time. Consecutive windows whose GC percentage is strictly greater than
    ``gc_threshold`` are merged into one region spanning every base covered
    by those windows.

    Args:
        seq: The sequence (``str`` or anything whose ``str()`` is the
            sequence text, e.g. ``Bio.Seq.Seq``). Case is ignored.
        window_size: Number of bases per window; must be positive.
        gc_threshold: GC percentage a window must exceed to qualify.

    Returns:
        A ``(regulatory_regions, gc_content)`` tuple where
        ``regulatory_regions`` is a list of ``(start, end)`` base indices
        (``end`` exclusive) and ``gc_content`` is the list of per-window GC
        percentages (one entry per window start position). Both lists are
        empty when the sequence is shorter than one window.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    bases = str(seq).upper()
    window_total = len(bases) - window_size + 1
    if window_total <= 0:
        return [], []

    # 1 where the base is G or C, else 0; lets the window count be updated
    # incrementally instead of recounting every window from scratch.
    is_gc = [1 if base in ("G", "C") else 0 for base in bases]

    gc_in_window = sum(is_gc[:window_size])
    gc_content = [(gc_in_window / window_size) * 100]
    for left in range(1, window_total):
        # Slide right by one: drop the base that left, add the one that entered.
        gc_in_window += is_gc[left + window_size - 1] - is_gc[left - 1]
        gc_content.append((gc_in_window / window_size) * 100)

    regulatory_regions = []
    region_start = None
    for i, gc in enumerate(gc_content):
        if gc > gc_threshold:
            if region_start is None:
                region_start = i
        elif region_start is not None:
            # Window i is the first one below the threshold, so the last
            # qualifying window started at i - 1 and covers bases up to
            # (i - 1) + window_size, exclusive.
            regulatory_regions.append((region_start, i - 1 + window_size))
            region_start = None

    # A region still open after the last window runs to the end of the sequence.
    if region_start is not None:
        regulatory_regions.append((region_start, len(bases)))

    return regulatory_regions, gc_content

def analyze_dark_matter(sequence):
    """Compute summary statistics and candidate regulatory regions for a sequence.

    Args:
        sequence: DNA sequence text. Case is ignored: the sequence is
            upper-cased before analysis so lower-case input yields the same
            motif counts as upper-case input.

    Returns:
        A 6-tuple ``(length, gc_content, tata_box, caat_box,
        regulatory_regions, gc_distribution)``:
        the sequence length, overall GC percentage, number of ``TATAAA``
        occurrences, number of ``CCAAT`` occurrences, and the two values
        returned by ``find_potential_regulatory_regions`` (called with its
        default window size and threshold).
    """
    # Upper-casing keeps the length and base positions unchanged while making
    # the literal motif searches below match lower-case input as well.
    seq = Seq(str(sequence).upper())

    # Basic statistics
    length = len(seq)
    gc_content = calculate_gc_content(seq)

    # Look for common regulatory motifs (non-overlapping occurrence counts)
    tata_box = seq.count("TATAAA")
    caat_box = seq.count("CCAAT")

    # Find potential regulatory regions based on GC content
    regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)

    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution

def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Build a line plot of per-window GC content with a threshold marker.

    Args:
        gc_distribution: Sequence of GC percentages, one per window start
            position (as returned by ``find_potential_regulatory_regions``).
        gc_threshold: GC percentage at which the dashed red reference line
            is drawn. Defaults to 60, matching the default threshold used
            for region detection.

    Returns:
        The matplotlib ``Figure`` containing the plot. The caller owns the
        figure and is responsible for displaying and closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    # ``:g`` keeps the default rendering as "60%" while still formatting
    # non-integer thresholds sensibly.
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold:g}%)',
    )
    ax.legend()
    return fig

# Streamlit app
# Lets the user paste a DNA sequence or fetch one from NCBI, then shows
# length, GC content, motif counts, GC-rich regions and a highlighted view.
st.title("Gene Sequence Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Always bind `sequence` so the Analyze handler below never sees an unbound
# name (e.g. "Fetch from NCBI" selected while the accession box is empty).
sequence = None

if sequence_input == "Enter sequence":
    pasted_text = st.text_area("Paste your DNA sequence here", height=150) or ""
    # Pasted sequences are commonly wrapped across lines and may carry a
    # FASTA ">" header line; drop headers and all whitespace so that neither
    # is counted as bases or shifts the reported positions.
    sequence = "".join(
        "".join(line.split())
        for line in pasted_text.splitlines()
        if not line.lstrip().startswith(">")
    )
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

if st.button("Analyze"):
    if sequence:
        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated analyses in a long-running session do not accumulate figures.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")
        # Mark highlighted positions first: regions can overlap, and a mask
        # avoids wrapping the same base in nested <span> elements.
        highlight = [False] * len(sequence)
        for start, end in regulatory_regions:
            for i in range(start, min(end, len(sequence))):
                highlight[i] = True

        # Emit one chunk per run of equally-flagged bases. The text is
        # user-supplied and rendered with unsafe_allow_html, so it is
        # HTML-escaped before being placed in the markup.
        html_parts = []
        pos = 0
        while pos < len(sequence):
            run_end = pos
            while run_end < len(sequence) and highlight[run_end] == highlight[pos]:
                run_end += 1
            chunk = html.escape(sequence[pos:run_end])
            if highlight[pos]:
                chunk = f"<span style='background-color: yellow'>{chunk}</span>"
            html_parts.append(chunk)
            pos = run_end

        st.markdown("".join(html_parts), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")