Spaces:

WANDSAI
/

GenSeq

Sleeping

App Files Files Community

GenSeq / app.py

Accelernate

Update app.py

5c01f06 verified over 1 year ago

raw

history blame contribute delete

5.35 kB

	import html
	from io import StringIO

	import matplotlib.pyplot as plt
	import numpy as np
	import streamlit as st
	from Bio import Entrez, SeqIO
	from Bio.Seq import Seq

	# Set your email for NCBI Entrez.
	# This module-level setting is applied once at import time and is used by
	# every Entrez.efetch call made below; replace the address with your own
	# contact email when deploying this app.
	Entrez.email = "nate@wands.ai"

	def fetch_sequence_from_ncbi(accession):
	    """Download a nucleotide record from NCBI and return its sequence text.

	    Args:
	        accession: Identifier passed to ``Entrez.efetch`` as ``id`` for a
	            lookup in the ``nucleotide`` database.

	    Returns:
	        The record's sequence as a ``str``, or ``None`` when the request or
	        the FASTA parsing fails.
	    """
	    handle = None
	    try:
	        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
	        record = SeqIO.read(handle, "fasta")
	        return str(record.seq)
	    except Exception:
	        # Best-effort lookup: callers treat None as "could not fetch".
	        # Catching Exception (not a bare except) lets KeyboardInterrupt and
	        # SystemExit propagate instead of being silently swallowed.
	        return None
	    finally:
	        # Release the network handle whether or not parsing succeeded.
	        if handle is not None:
	            handle.close()

	def calculate_gc_content(seq):
	    """Return the percentage of G and C bases in *seq*, ignoring letter case.

	    Args:
	        seq: A string-like sequence supporting ``len()`` and ``.count()``.

	    Returns:
	        GC percentage in the range 0-100; ``0.0`` for an empty sequence.
	    """
	    total_count = len(seq)
	    if total_count == 0:
	        # Avoid dividing by zero on empty input.
	        return 0.0
	    # Count both cases so lowercase (e.g. soft-masked or hand-typed) bases
	    # are not silently treated as non-GC.
	    gc_count = sum(seq.count(base) for base in "GCgc")
	    return (gc_count / total_count) * 100

	def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
	    """Locate stretches whose sliding-window GC content exceeds a threshold.

	    A window of ``window_size`` bases is moved along ``seq`` one base at a
	    time. Consecutive windows whose GC percentage is strictly greater than
	    ``gc_threshold`` are merged into one region.

	    Args:
	        seq: Sequence supporting ``len()`` and slicing.
	        window_size: Number of bases per window.
	        gc_threshold: GC percentage a window must exceed to qualify.

	    Returns:
	        A ``(regulatory_regions, gc_content)`` tuple. ``regulatory_regions``
	        is a list of ``(start, end)`` pairs where ``start`` is the first base
	        of the first qualifying window and ``end`` is one past the last base
	        of the last qualifying window. ``gc_content`` holds the GC percentage
	        of every window, indexed by window start. Both lists are empty when
	        ``seq`` is shorter than ``window_size``.
	    """
	    gc_content = [
	        calculate_gc_content(seq[i:i + window_size])
	        for i in range(len(seq) - window_size + 1)
	    ]

	    regulatory_regions = []
	    region_start = None  # window index where the open region began, if any
	    for i, gc in enumerate(gc_content):
	        if gc > gc_threshold:
	            if region_start is None:
	                region_start = i
	        elif region_start is not None:
	            # Window i is the first one at or below the threshold, so the
	            # last qualifying window is i - 1 and it covers bases up to
	            # (i - 1) + window_size (exclusive). This matches the end used
	            # for a region that runs to the end of the sequence below.
	            regulatory_regions.append((region_start, i - 1 + window_size))
	            region_start = None

	    if region_start is not None:
	        # The final window qualified, so the region extends to the last base.
	        regulatory_regions.append((region_start, len(seq)))

	    return regulatory_regions, gc_content

	def analyze_dark_matter(sequence):
	    """Summarise a DNA string: size, GC share, promoter motifs, GC-rich regions.

	    Args:
	        sequence: DNA text; it is wrapped in a ``Seq`` before analysis.

	    Returns:
	        A 6-tuple ``(length, gc_content, tata_box, caat_box,
	        regulatory_regions, gc_distribution)`` where the motif entries are
	        the counts of ``TATAAA`` and ``CCAAT`` and the last two items come
	        from ``find_potential_regulatory_regions`` with its default settings.
	    """
	    dna = Seq(sequence)

	    # GC-rich regions plus the per-window GC profile they were derived from.
	    regions, window_gc = find_potential_regulatory_regions(dna)

	    # Occurrence counts of the two promoter motifs, in reporting order.
	    tata_hits, caat_hits = (dna.count(motif) for motif in ("TATAAA", "CCAAT"))

	    return (
	        len(dna),
	        calculate_gc_content(dna),
	        tata_hits,
	        caat_hits,
	        regions,
	        window_gc,
	    )

	def plot_gc_distribution(gc_distribution, gc_threshold=60):
	    """Build a line plot of per-window GC content with a threshold marker.

	    Args:
	        gc_distribution: Sequence of GC percentages, one per window, plotted
	            against its index.
	        gc_threshold: Percentage at which the dashed red reference line is
	            drawn. Defaults to 60, the value that was previously hard-coded,
	            so existing single-argument callers get the same figure.

	    Returns:
	        The matplotlib ``Figure`` containing the plot. The caller owns the
	        figure and is responsible for closing it.
	    """
	    fig, ax = plt.subplots(figsize=(10, 4))
	    ax.plot(gc_distribution)
	    ax.set_xlabel('Sequence Position')
	    ax.set_ylabel('GC Content (%)')
	    ax.set_title('GC Content Distribution')
	    # Keep the line and its legend label in sync with the chosen threshold.
	    ax.axhline(y=gc_threshold, color='r', linestyle='--', label=f'GC Threshold ({gc_threshold}%)')
	    ax.legend()
	    return fig

	# Streamlit app: collect a DNA sequence (pasted or fetched from NCBI), then
	# report basic statistics, GC-rich regions, a GC plot and a highlighted view.
	st.title("Gene Sequence Analyzer")

	sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

	# Start from "no sequence" so the Analyze branch never reads an unassigned
	# name when the NCBI option is selected but no accession has been typed.
	sequence = None

	if sequence_input == "Enter sequence":
	    sequence = st.text_area("Paste your DNA sequence here", height=150)
	else:
	    accession = st.text_input("Enter NCBI accession number")
	    if accession:
	        sequence = fetch_sequence_from_ncbi(accession)
	        if sequence:
	            st.success(f"Successfully fetched sequence for {accession}")
	        else:
	            st.error("Failed to fetch sequence. Please check the accession number.")

	if sequence:
	    # Pasted text commonly contains line breaks, spaces and lowercase bases.
	    # Remove whitespace and upper-case once here so that the length, the
	    # motif counts, the window positions and the highlighted view all refer
	    # to the same string.
	    sequence = "".join(sequence.split()).upper()

	if st.button("Analyze"):
	    if sequence:
	        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)

	        st.subheader("Analysis Results")

	        st.write(f"Sequence Length: {length} base pairs")
	        st.write("Description: This is the total number of nucleotides in the sequence.")

	        st.write(f"Overall GC Content: {gc_content:.2f}%")
	        st.write("Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.")

	        st.write(f"TATA Box motifs: {tata_box}")
	        st.write("Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.")

	        st.write(f"CAAT Box motifs: {caat_box}")
	        st.write("Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.")

	        st.subheader("Potential Regulatory Regions (based on GC content):")
	        if regulatory_regions:
	            for start, end in regulatory_regions:
	                st.write(f"Region from base {start} to {end}")
	        else:
	            st.write("No potential regulatory regions identified based on GC content.")
	        st.write("Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.")

	        st.subheader("GC Content Distribution")
	        fig = plot_gc_distribution(gc_distribution)
	        st.pyplot(fig)
	        # Streamlit re-runs this script on every interaction; close the
	        # figure so pyplot does not keep one more open figure per run.
	        plt.close(fig)
	        st.write("Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.")

	        # Visualize the sequence with highlighted regions
	        st.subheader("Sequence Visualization")
	        # Collect positions first so a base covered by overlapping regions
	        # is wrapped in exactly one <span> rather than nested ones.
	        highlighted_positions = set()
	        for start, end in regulatory_regions:
	            highlighted_positions.update(range(start, min(end, len(sequence))))

	        # The sequence is user-supplied and rendered with unsafe_allow_html,
	        # so every character is HTML-escaped before being emitted.
	        highlighted_seq = [
	            f"<span style='background-color: yellow'>{html.escape(base)}</span>"
	            if position in highlighted_positions
	            else html.escape(base)
	            for position, base in enumerate(sequence)
	        ]

	        st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
	        st.write("Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.")
	    else:
	        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")