Spaces:

yashm
/

OpenGene_V1.1

Sleeping

App Files Files Community

OpenGene_V1.1 / app.py

yashm

Update app.py

b68b0f3 verified over 1 year ago

raw

history blame contribute delete

10.9 kB

	# Import libraries
	import streamlit as st
	import pandas as pd
	from Bio import SeqIO
	from Bio.SeqUtils.ProtParam import ProteinAnalysis
	from Bio.Graphics.GenomeDiagram import Diagram, Track, FeatureSet
	from reportlab.lib import colors
	from reportlab.lib.units import cm
	from io import StringIO
	from collections import Counter
	import numpy as np
	import altair as alt
	import os
	import re
	import plotly.express as px

	# Scratch directory, relative to the current working directory, that receives
	# the generated genome-diagram SVG before it is displayed. It is created at
	# start-up; exist_ok=True makes this a no-op when the directory already exists.
	temp_dir = "temp"
	os.makedirs(temp_dir, exist_ok=True)

	# Function to parse GenBank file
	def parse_genbank(uploaded_file):
	    """Parse an uploaded single-record GenBank file into summary data.

	    Args:
	        uploaded_file: Object exposing ``getvalue()`` that returns the file's
	            bytes; the bytes are decoded as UTF-8.

	    Returns:
	        A 7-tuple ``(organism, gene_info, cds_info, gc_content,
	        sequence_length, feature_types, sequence)`` where

	        * ``organism`` is the record's ``organism`` annotation, or
	          ``'Unknown Organism'`` when it is absent;
	        * ``gene_info`` is a list of dicts with keys ``Gene``, ``Length``,
	          ``Location`` and ``Sequence``, one per ``gene`` feature;
	        * ``cds_info`` is a list of dicts with keys ``Gene``, ``Protein``,
	          ``Length`` and ``Location``, one per ``CDS`` feature
	          (missing qualifiers are reported as ``'N/A'``);
	        * ``gc_content`` is the G+C percentage of the whole sequence
	          (``0.0`` for an empty sequence);
	        * ``sequence_length`` is the number of bases;
	        * ``feature_types`` is a ``Counter`` of feature type names;
	        * ``sequence`` is the full sequence as a string.

	    Raises:
	        UnicodeDecodeError: If the upload is not valid UTF-8.
	        ValueError: Propagated from ``SeqIO.read`` (per the Biopython docs it
	            is raised unless the handle holds exactly one record).
	    """
	    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
	    record = SeqIO.read(stringio, "genbank")
	    organism = record.annotations.get('organism', 'Unknown Organism')
	    features = record.features
	    feature_types = Counter(feature.type for feature in features)

	    genes = [feature for feature in features if feature.type == "gene"]
	    cds = [feature for feature in features if feature.type == "CDS"]

	    gene_info = [{
	        'Gene': gene_feature.qualifiers.get('gene', ['N/A'])[0],
	        'Length': len(gene_feature),
	        'Location': str(gene_feature.location),
	        'Sequence': str(gene_feature.extract(record.seq))
	    } for gene_feature in genes]

	    cds_info = [{
	        'Gene': cds_feature.qualifiers.get('gene', ['N/A'])[0],
	        'Protein': cds_feature.qualifiers.get('translation', ['N/A'])[0],
	        'Length': len(cds_feature),
	        'Location': str(cds_feature.location)
	    } for cds_feature in cds]

	    # Materialise the sequence once instead of re-converting it per use.
	    sequence = str(record.seq)
	    sequence_length = len(sequence)

	    # Count case-insensitively so soft-masked (lowercase) bases are included,
	    # and avoid dividing by zero for a record without sequence data.
	    upper_sequence = sequence.upper()
	    gc_count = upper_sequence.count('G') + upper_sequence.count('C')
	    gc_content = gc_count / sequence_length * 100 if sequence_length else 0.0

	    return organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence

	# Function to calculate GC content over genome
	def calculate_gc_content(sequence, window_size=1000):
	    """Return the G+C percentage of consecutive, non-overlapping windows.

	    Windows start at offsets ``0, window_size, 2 * window_size, ...``; only
	    complete windows are reported, so a trailing fragment shorter than
	    ``window_size`` is ignored and a sequence shorter than one window yields
	    an empty list. Counting is case-sensitive (uppercase ``G``/``C`` only).

	    Args:
	        sequence: Nucleotide sequence as a string.
	        window_size: Number of bases per window; must be positive.

	    Returns:
	        A list of floats, one percentage per complete window.

	    Raises:
	        ValueError: If ``window_size`` is not positive.
	    """
	    if window_size <= 0:
	        raise ValueError("window_size must be a positive integer")

	    gc_content = []
	    for start in range(0, len(sequence) - window_size + 1, window_size):
	        # Slice once per window rather than once per counted base.
	        window = sequence[start:start + window_size]
	        gc_content.append((window.count('G') + window.count('C')) / window_size * 100)
	    return gc_content

	# Function to calculate k-mers
	def calculate_kmers(sequence, k):
	    """Tally every overlapping length-``k`` substring of ``sequence``.

	    Args:
	        sequence: Sequence to scan.
	        k: Length of each k-mer.

	    Returns:
	        A ``Counter`` mapping each k-mer to its number of occurrences, in
	        order of first appearance.
	    """
	    tally = Counter()
	    last_start = len(sequence) - k
	    for start in range(last_start + 1):
	        tally[sequence[start:start + k]] += 1
	    return tally

	# Function to add molecular weight and isoelectric point to CDS information
	def add_protein_features(cds_info):
	    """Annotate each CDS record with molecular weight and isoelectric point.

	    The dicts in ``cds_info`` are modified in place: the keys
	    ``'Molecular Weight'`` and ``'Isoelectric Point'`` are always added.
	    Their value is ``'N/A'`` when the record has no translation
	    (``cds['Protein'] == 'N/A'``) or when the metric cannot be computed for
	    the translation.

	    Args:
	        cds_info: List of dicts, each with a ``'Protein'`` key.

	    Returns:
	        The same ``cds_info`` list that was passed in.
	    """
	    for cds in cds_info:
	        molecular_weight = 'N/A'
	        isoelectric_point = 'N/A'
	        protein = cds['Protein']

	        if protein != 'N/A':
	            prot_analysis = ProteinAnalysis(protein)
	            # Translations may contain residues with no defined mass (for
	            # example 'X'); Biopython is expected to reject those with a
	            # ValueError. One bad translation must not abort the whole table,
	            # so each metric falls back to 'N/A' independently.
	            try:
	                molecular_weight = prot_analysis.molecular_weight()
	            except (ValueError, KeyError):
	                molecular_weight = 'N/A'
	            try:
	                isoelectric_point = prot_analysis.isoelectric_point()
	            except (ValueError, KeyError):
	                isoelectric_point = 'N/A'

	        cds['Molecular Weight'] = molecular_weight
	        cds['Isoelectric Point'] = isoelectric_point
	    return cds_info

	# Updated Function to generate genome diagram
	def create_genome_diagram(genbank_content, output_file_path, colors_dict, diagram_type="linear", diagram_size=(30, 10)):
	    """Draw the record's features as a genome diagram and save it as SVG.

	    One track is created per key of ``colors_dict`` (track numbers start at 1
	    and follow the dict's order). Every feature whose type is a key of
	    ``colors_dict`` is drawn as an arrow on the matching track; features of
	    any other type are skipped.

	    Args:
	        genbank_content: Text of a single-record GenBank file.
	        output_file_path: Destination path of the SVG file.
	        colors_dict: Mapping of feature type (e.g. ``'CDS'``) to a hex colour
	            string accepted by ``reportlab.lib.colors.HexColor``.
	        diagram_type: ``"circular"`` (case-insensitive) for a circular
	            diagram; any other value produces a linear one.
	        diagram_size: ``(width, height)`` of the page in centimetres.

	    Returns:
	        None. The diagram is written to ``output_file_path``.
	    """
	    from Bio.Graphics.GenomeDiagram import Diagram

	    record = SeqIO.read(StringIO(genbank_content), "genbank")
	    gd_diagram = Diagram(record.id)

	    # Create one track, holding one feature set, per configured feature type.
	    feature_sets = {}
	    for idx, feature_type in enumerate(colors_dict, start=1):
	        track = gd_diagram.new_track(
	            idx, name=feature_type, scale=False, greytrack=False, height=0.5
	        )
	        feature_sets[feature_type] = track.new_set()

	    for feature in record.features:
	        feature_type = feature.type
	        if feature_type not in colors_dict:
	            continue
	        # Read the strand from the location; the SeqFeature-level ``strand``
	        # shortcut is reported as deprecated in recent Biopython releases.
	        strand = feature.location.strand if feature.location is not None else None
	        feature_sets[feature_type].add_feature(
	            feature,
	            color=colors.HexColor(colors_dict[feature_type]),
	            label=True,
	            label_size=6,  # Small labels to limit overlap
	            label_angle=0,
	            label_position="start",  # Position label at the start of the feature
	            label_strand=1 if strand == 1 else -1,  # Position labels according to strand
	            sigil="ARROW",  # Use arrows to represent features
	            arrowshaft_height=1.0,
	            arrowhead_length=1.0,
	        )

	    # Convert the requested size from centimetres to reportlab units.
	    pagesize = (diagram_size[0] * cm, diagram_size[1] * cm)

	    if diagram_type.lower() == "circular":
	        gd_diagram.draw(
	            format="circular",
	            circular=True,
	            pagesize=pagesize,
	            start=0,
	            end=len(record),
	            circle_core=0.7
	        )
	    else:
	        gd_diagram.draw(
	            format="linear",
	            pagesize=pagesize,
	            fragments=1,
	            start=0,
	            end=len(record),
	            tracklines=False  # Remove track lines to reduce clutter
	        )
	    gd_diagram.write(output_file_path, "SVG")

	# Function to search for a motif or pattern within the DNA sequence
	def search_motif(sequence, motif):
	    """Locate every non-overlapping regex match of ``motif`` in ``sequence``.

	    Args:
	        sequence: Text to search.
	        motif: Regular-expression pattern (a plain string matches literally
	            unless it contains regex metacharacters).

	    Returns:
	        A list of ``(start, end)`` tuples, in order of occurrence; ``end`` is
	        exclusive. The list is empty when nothing matches.

	    Raises:
	        re.error: If ``motif`` is not a valid regular expression.
	    """
	    pattern = re.compile(motif)
	    hits = []
	    for found in pattern.finditer(sequence):
	        hits.append(found.span())
	    return hits

	# Function to calculate codon usage frequencies
	def calculate_codon_usage(sequence):
	    """Compute the percentage frequency of each triplet read in frame 0.

	    The sequence is split into consecutive, non-overlapping triplets starting
	    at index 0; an incomplete trailing fragment is not counted.

	    Args:
	        sequence: Nucleotide sequence as a string.

	    Returns:
	        A dict mapping each observed triplet to its share of all triplets, in
	        percent, ordered by first appearance. Empty when the sequence holds no
	        complete triplet.
	    """
	    tally = Counter()
	    for start in range(0, len(sequence) - 2, 3):
	        triplet = sequence[start:start + 3]
	        if len(triplet) == 3:  # Only complete codons are counted
	            tally[triplet] += 1

	    total_codons = sum(tally.values())

	    codon_freq_percent = {}
	    for codon, count in tally.items():
	        codon_freq_percent[codon] = (count / total_codons) * 100
	    return codon_freq_percent

	# Streamlit UI setup
	# Page flow: upload a GenBank file, pick analysis settings in the sidebar, then
	# render summary statistics, charts, the genome diagram, motif hits and codon
	# usage. A copyright footer is rendered at the end of the page.
	# NOTE(review): the nesting below was reconstructed from a copy of the file
	# that had lost its indentation; confirm that the K-mer chart belongs in the
	# right-hand column and that the genome diagram is meant to span the full width.
	st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
	uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

	if uploaded_file is not None:
	    # A malformed or non-UTF-8 upload should produce a readable message rather
	    # than a traceback. UnicodeDecodeError is a subclass of ValueError, so one
	    # handler covers both decoding and parsing failures.
	    try:
	        organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
	    except ValueError as exc:
	        st.error(f"Could not parse the uploaded GenBank file: {exc}")
	        st.stop()

	    cds_info = add_protein_features(cds_info)
	    gene_df = pd.DataFrame(gene_info)
	    cds_df = pd.DataFrame(cds_info)

	    # Sidebar
	    with st.sidebar:
	        st.title('Genomic Data Dashboard')
	        st.write(f'Organism: {organism}')
	        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
	        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)
	        motif = st.text_input("Enter motif or pattern to search")

	        st.markdown("### Genome Diagram Settings")
	        diagram_type = st.radio("Select Diagram Type:", ["Linear", "Circular"], index=0)
	        diagram_width = st.number_input("Diagram Width (cm):", min_value=1, max_value=50, value=30)
	        diagram_height = st.number_input("Diagram Height (cm):", min_value=1, max_value=50, value=10)

	        # Sidebar options for diagram customization
	        st.markdown("### Feature Colors")
	        color_gene = st.color_picker("Pick a color for genes", '#ff9999')
	        color_cds = st.color_picker("Pick a color for CDS", '#66b3ff')
	        color_trna = st.color_picker("Pick a color for tRNA", '#99ff99')
	        color_rrna = st.color_picker("Pick a color for rRNA", '#ffcc99')

	    # Main content
	    col1, col2 = st.columns(2)
	    with col1:
	        st.markdown('### General Information')
	        st.write(f'Organism: {organism}')
	        st.write(f'Sequence Length: {sequence_length} bp')
	        st.write(f'GC Content: {gc_content:.2f}%')
	        st.write(f'Number of Genes: {len(gene_df)}')
	        st.write(f'Number of Coding Sequences (CDS): {len(cds_df)}')
	        st.markdown('### Feature Counts')
	        for feature_type, count in feature_types.items():
	            st.write(f"{feature_type}: {count}")

	    with col2:
	        st.markdown('### GC Content Over Genome')
	        gc_content_over_genome = calculate_gc_content(sequence, window_size)
	        if gc_content_over_genome:
	            gc_chart = alt.Chart(pd.DataFrame({
	                'GC Content': gc_content_over_genome,
	                'Position': np.arange(len(gc_content_over_genome)) * window_size
	            })).mark_line().encode(
	                x='Position:Q',
	                y='GC Content:Q'
	            ).properties(height=200)
	            st.altair_chart(gc_chart, use_container_width=True)
	        else:
	            # No complete window fits, so there is nothing to plot.
	            st.info("The sequence is shorter than the selected window size; no GC content profile to display.")

	        st.markdown('### K-mer Analysis')
	        kmers = calculate_kmers(sequence, k)
	        kmer_df = pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20)
	        st.bar_chart(kmer_df)

	    # Construct the colors dictionary
	    feature_colors = {
	        'gene': color_gene,
	        'CDS': color_cds,
	        'tRNA': color_trna,
	        'rRNA': color_rrna
	    }

	    # Generate and display genome diagram with user-selected feature colors
	    # NOTE(review): the output file name is fixed, so sessions running at the
	    # same time would write to the same path; confirm whether that matters here.
	    output_file_path_svg = os.path.join(temp_dir, "genome_diagram.svg")
	    create_genome_diagram(
	        uploaded_file.getvalue().decode("utf-8"),
	        output_file_path_svg,
	        feature_colors,
	        diagram_type=diagram_type.lower(),
	        diagram_size=(diagram_width, diagram_height)
	    )
	    st.image(output_file_path_svg, caption='Genome Diagram')

	    # Motif Search
	    if motif:
	        # The motif is user-typed and interpreted as a regular expression, so
	        # an invalid pattern is reported instead of crashing the page.
	        try:
	            matches = search_motif(sequence, motif)
	        except re.error as exc:
	            st.error(f"Invalid motif pattern: {exc}")
	        else:
	            st.markdown(f"### Motif Search Results for '{motif}'")
	            if matches:
	                st.write("Matches found at positions:")
	                matches_df = pd.DataFrame(matches, columns=["Start", "End"])
	                st.dataframe(matches_df)
	            else:
	                st.write("No matches found.")

	    # Codon Usage Analysis
	    st.markdown("### Codon Usage Analysis")
	    codon_usage_freq = calculate_codon_usage(sequence)
	    codon_usage_df = pd.DataFrame.from_dict(codon_usage_freq, orient='index', columns=['Frequency (%)'])
	    codon_usage_df.index.name = 'Codon'
	    codon_usage_df.reset_index(inplace=True)
	    st.dataframe(codon_usage_df)

	    # Interactive Visualization
	    st.markdown("### Interactive Visualization")
	    fig = px.bar(
	        codon_usage_df,
	        x='Codon',
	        y='Frequency (%)',
	        labels={'Codon': 'Codon', 'Frequency (%)': 'Frequency (%)'},
	        title="Codon Usage Frequency"
	    )
	    st.plotly_chart(fig)

	    # Additional Information
	    with st.expander("View All Genes"):
	        st.dataframe(gene_df)
	    with st.expander("View All Coding Sequences"):
	        if cds_df.empty:
	            # A record without CDS features gives a DataFrame with no columns,
	            # so selecting the display columns would raise KeyError.
	            st.write("No coding sequences found.")
	        else:
	            st.dataframe(cds_df[['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']])
	else:
	    st.warning("Please upload a GenBank file.")

	# Add copyright information section at the end of the main page
	st.markdown("""
	---
	Copyright Notice: © 2024 Dr. Yash Munnalal Gupta. All rights reserved.

	For inquiries or permissions, contact: [yash.610@live.com](mailto:yash.610@live.com)
	""", unsafe_allow_html=True)