Spaces:

WANDSAI
/

GenSeq

Sleeping

App Files Files Community

Accelernate committed on Jul 12, 2024

Commit

bed5689

verified ·

1 Parent(s): ca3e4c2

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -12

app.py CHANGED Viewed

@@ -1,6 +1,20 @@
 import streamlit as st
 import numpy as np
 from Bio.Seq import Seq
 def calculate_gc_content(seq):
     gc_count = seq.count('G') + seq.count('C')
@@ -27,7 +41,7 @@ def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
     if in_region:
         regulatory_regions.append((start, len(seq)))
-    return regulatory_regions
 def analyze_dark_matter(sequence):
     seq = Seq(sequence)
@@ -41,34 +55,75 @@ def analyze_dark_matter(sequence):
     caat_box = seq.count("CCAAT")
     # Find potential regulatory regions based on GC content
-    regulatory_regions = find_potential_regulatory_regions(seq)
-    return length, gc_content, tata_box, caat_box, regulatory_regions
 # Streamlit app
 st.title("Genomic Dark Matter Analyzer")
-sequence = st.text_area("Paste your DNA sequence here", height=150)
 if st.button("Analyze"):
     if sequence:
-        length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
-        st.write(f"Sequence Length: {length}")
-        st.write(f"Overall GC Content: {gc_content:.2f}%")
-        st.write(f"TATA Box motifs: {tata_box}")
-        st.write(f"CAAT Box motifs: {caat_box}")
         st.subheader("Potential Regulatory Regions (based on GC content):")
-        for start, end in regulatory_regions:
-            st.write(f"Region from base {start} to {end}")
         # Visualize the sequence with highlighted regions
         highlighted_seq = list(sequence)
         for start, end in regulatory_regions:
             for i in range(start, min(end, len(highlighted_seq))):
                 highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"
         st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
     else:
-        st.write("Please enter a DNA sequence.")

 import streamlit as st
 import numpy as np
+import matplotlib.pyplot as plt
+from Bio import Entrez, SeqIO
 from Bio.Seq import Seq
+from io import StringIO
+# Set your email for NCBI Entrez
+Entrez.email = "nate@wands.ai"
+def fetch_sequence_from_ncbi(accession):
+    """Download a nucleotide record from NCBI and return its sequence as text.
+
+    Args:
+        accession: Identifier passed to ``Entrez.efetch`` as the record id.
+
+    Returns:
+        The record's sequence as a ``str``, or ``None`` if the request or the
+        FASTA parsing fails.
+    """
+    try:
+        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
+        try:
+            record = SeqIO.read(handle, "fasta")
+        finally:
+            # Release the network handle even when parsing fails.
+            handle.close()
+        return str(record.seq)
+    except Exception:
+        # Best-effort lookup: the caller treats None as "fetch failed".
+        # Catching Exception instead of using a bare except lets
+        # KeyboardInterrupt and SystemExit propagate.
+        return None
 def calculate_gc_content(seq):
     gc_count = seq.count('G') + seq.count('C')
     if in_region:
         regulatory_regions.append((start, len(seq)))
+    return regulatory_regions, gc_content
 def analyze_dark_matter(sequence):
     seq = Seq(sequence)
     caat_box = seq.count("CCAAT")
     # Find potential regulatory regions based on GC content
+    regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)
+    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
+def plot_gc_distribution(gc_distribution, gc_threshold=60):
+    """Build a line plot of GC content with the threshold drawn as a dashed line.
+
+    Args:
+        gc_distribution: Sequence of GC-content values to plot, in order.
+        gc_threshold: Value at which the horizontal reference line is drawn
+            and which is shown in the legend. Defaults to 60, the value that
+            was previously hard-coded here.
+
+    Returns:
+        The matplotlib figure; the caller renders it and should close it.
+    """
+    fig, ax = plt.subplots(figsize=(10, 4))
+    ax.plot(gc_distribution)
+    ax.set_xlabel('Sequence Position')
+    ax.set_ylabel('GC Content (%)')
+    ax.set_title('GC Content Distribution')
+    # Line position and legend text share one value so they cannot drift apart.
+    ax.axhline(y=gc_threshold, color='r', linestyle='--', label=f'GC Threshold ({gc_threshold}%)')
+    ax.legend()
+    return fig
 # Streamlit app
 st.title("Genomic Dark Matter Analyzer")
+# Start with "no sequence" so the later `if sequence:` check is always safe.
+# Without this, choosing "Fetch from NCBI" and leaving the accession empty
+# leaves the name unbound and clicking Analyze raises NameError.
+sequence = None
+sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))
+if sequence_input == "Enter sequence":
+    sequence = st.text_area("Paste your DNA sequence here", height=150)
+else:
+    accession = st.text_input("Enter NCBI accession number")
+    if accession:
+        # fetch_sequence_from_ncbi returns None on failure; report that here.
+        sequence = fetch_sequence_from_ncbi(accession)
+        if sequence:
+            st.success(f"Successfully fetched sequence for {accession}")
+        else:
+            st.error("Failed to fetch sequence. Please check the accession number.")
 # Run the analysis and render every result section when the button is pressed.
 if st.button("Analyze"):
     if sequence:
+        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)
+        st.subheader("Analysis Results")
+        st.write(f"**Sequence Length:** {length} base pairs")
+        st.write("*Description: This is the total number of nucleotides in the sequence.*")
+        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
+        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")
+        st.write(f"**TATA Box motifs:** {tata_box}")
+        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")
+        st.write(f"**CAAT Box motifs:** {caat_box}")
+        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")
         st.subheader("Potential Regulatory Regions (based on GC content):")
+        if regulatory_regions:
+            for start, end in regulatory_regions:
+                st.write(f"Region from base {start} to {end}")
+        else:
+            st.write("No potential regulatory regions identified based on GC content.")
+        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")
+        st.subheader("GC Content Distribution")
+        fig = plot_gc_distribution(gc_distribution)
+        st.pyplot(fig)
+        # Streamlit re-executes this script on every interaction; close the
+        # figure once rendered so pyplot does not keep accumulating figures.
+        plt.close(fig)
+        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")
         # Visualize the sequence with highlighted regions
+        st.subheader("Sequence Visualization")
         highlighted_seq = list(sequence)
         for start, end in regulatory_regions:
             # Clamp to the sequence length so a region end never indexes past it.
             for i in range(start, min(end, len(highlighted_seq))):
                 highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"
         # NOTE(review): the raw sequence is emitted as HTML without escaping.
         st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
+        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
     else:
+        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")