Spaces:

yashm
/

OpenGene

Sleeping

App Files Files Community

OpenGene / app.py

yashm

Update app.py

30d50dc verified almost 2 years ago

raw

history blame contribute delete

7.18 kB

	# Import libraries
	import streamlit as st
	import pandas as pd
	from Bio import SeqIO
	from Bio.SeqUtils.ProtParam import ProteinAnalysis
	from Bio.Graphics import GenomeDiagram
	from reportlab.lib.colors import Color, lightblue, blue
	from reportlab.lib import colors
	from reportlab.lib.units import cm
	from io import StringIO
	from collections import Counter
	import numpy as np
	import altair as alt
	import os

	# Ensure the 'temp' directory exists for saving temporary files.
	# The path is relative, so it resolves against the process's current working
	# directory. `temp_dir` is read later when building the SVG output path for
	# the genome diagram.
	temp_dir = "temp"
	# exist_ok=True makes this a no-op when the directory is already present.
	os.makedirs(temp_dir, exist_ok=True)

	# Function to parse GenBank file
	def parse_genbank(uploaded_file):
	    """Parse a single-record GenBank upload into summary data.

	    Args:
	        uploaded_file: Object whose ``getvalue()`` returns the raw bytes of a
	            GenBank file (e.g. the value returned by ``st.file_uploader``).
	            The bytes are decoded as UTF-8.

	    Returns:
	        A 7-tuple ``(organism, gene_info, cds_info, gc_content,
	        sequence_length, feature_types, sequence)``:

	        * organism: the record's ``organism`` annotation, or ``'Unknown'``
	          when the record carries no such annotation.
	        * gene_info: one dict per ``gene`` feature with keys ``'Gene'``,
	          ``'Length'`` and ``'Location'``.
	        * cds_info: one dict per ``CDS`` feature with keys ``'Gene'``,
	          ``'Protein'``, ``'Length'`` and ``'Location'``. Missing
	          qualifiers are reported as ``'N/A'``.
	        * gc_content: percentage of ``G``/``C`` characters in the whole
	          sequence (``0.0`` for an empty sequence).
	        * sequence_length: number of characters in the sequence.
	        * feature_types: ``Counter`` mapping feature type to occurrences.
	        * sequence: the record's sequence as a plain ``str``.

	    Raises:
	        ValueError: propagated from ``SeqIO.read`` when the content is not
	            exactly one GenBank record, or from the UTF-8 decode
	            (``UnicodeDecodeError`` is a ``ValueError`` subclass).
	    """
	    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
	    record = SeqIO.read(stringio, "genbank")
	    # Not every GenBank record has an organism annotation; avoid a KeyError.
	    organism = record.annotations.get('organism', 'Unknown')
	    features = record.features
	    feature_types = Counter(feature.type for feature in features)

	    genes = [feature for feature in features if feature.type == "gene"]
	    cds_features = [feature for feature in features if feature.type == "CDS"]

	    gene_info = [
	        {
	            'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
	            'Length': len(gene),
	            'Location': str(gene.location),
	        }
	        for gene in genes
	    ]

	    # Distinct loop variable: the original reused the list's own name here.
	    cds_info = [
	        {
	            'Gene': cds_feature.qualifiers.get('gene', ['N/A'])[0],
	            'Protein': cds_feature.qualifiers.get('translation', ['N/A'])[0],
	            'Length': len(cds_feature),
	            'Location': str(cds_feature.location),
	        }
	        for cds_feature in cds_features
	    ]

	    # Convert the sequence once and reuse it for counting, length and return.
	    sequence = str(record.seq)
	    sequence_length = len(sequence)
	    if sequence_length:
	        gc_content = (sequence.count('G') + sequence.count('C')) / sequence_length * 100
	    else:
	        # An empty sequence has no meaningful GC fraction; avoid dividing by zero.
	        gc_content = 0.0

	    return organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence

	# Additional functions (calculate_gc_content, calculate_kmers, add_protein_features)
	# Function to calculate GC content over genome
	def calculate_gc_content(sequence, window_size=1000):
	    """Return the GC percentage of consecutive, non-overlapping windows.

	    The sequence is cut into windows of exactly ``window_size`` characters
	    starting at offset 0. A trailing remainder shorter than ``window_size``
	    is ignored, so a sequence shorter than one window yields an empty list.
	    Counting is case-insensitive.

	    Args:
	        sequence: Nucleotide sequence supporting slicing, ``upper()`` and
	            ``count()`` (e.g. a ``str``).
	        window_size: Number of characters per window; must be positive.

	    Returns:
	        A list of floats, one GC percentage (0-100) per full window.

	    Raises:
	        ValueError: if ``window_size`` is not positive.
	    """
	    if window_size <= 0:
	        # A negative size used to return [] silently; zero already failed in range().
	        raise ValueError("window_size must be a positive integer")

	    gc_content = []
	    for start in range(0, len(sequence) - window_size + 1, window_size):
	        # Slice once per window instead of once per counted base.
	        window = sequence[start:start + window_size].upper()
	        gc_content.append((window.count('G') + window.count('C')) / window_size * 100)
	    return gc_content

	# Function to calculate k-mers
	def calculate_kmers(sequence, k):
	    """Count every overlapping substring of length ``k`` in ``sequence``.

	    Args:
	        sequence: Sequence supporting ``len()`` and slicing (e.g. a ``str``).
	        k: Length of each k-mer; must be at least 1.

	    Returns:
	        A ``Counter`` mapping each k-mer to its number of occurrences. It is
	        empty when ``k`` exceeds the sequence length.

	    Raises:
	        ValueError: if ``k`` is smaller than 1.
	    """
	    if k < 1:
	        # k <= 0 would otherwise count empty or wrongly-sliced strings.
	        raise ValueError("k must be a positive integer")
	    # Feed the Counter from a generator so no intermediate list is built.
	    return Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))

	# Function to add molecular weight and isoelectric point to CDS information
	def add_protein_features(cds_info):
	    """Add molecular weight and isoelectric point to each CDS entry.

	    Each dict in ``cds_info`` is updated in place with the keys
	    ``'Molecular Weight'`` and ``'Isoelectric Point'``. Both are ``'N/A'``
	    when the entry's ``'Protein'`` value is ``'N/A'`` or when the protein
	    cannot be analysed (for example a translation containing ambiguous
	    residues, which ``ProteinAnalysis.molecular_weight`` rejects).

	    Args:
	        cds_info: List of dicts, each with a ``'Protein'`` key holding an
	            amino-acid string or ``'N/A'``.

	    Returns:
	        The same ``cds_info`` list, for convenient chaining.
	    """
	    for cds in cds_info:
	        weight = 'N/A'
	        isoelectric_point = 'N/A'
	        if cds['Protein'] != 'N/A':
	            try:
	                prot_analysis = ProteinAnalysis(cds['Protein'])
	                weight = prot_analysis.molecular_weight()
	                isoelectric_point = prot_analysis.isoelectric_point()
	            except (ValueError, KeyError):
	                # One unanalysable translation must not abort the whole
	                # table; report both values as unavailable for this entry.
	                weight = 'N/A'
	                isoelectric_point = 'N/A'
	        cds['Molecular Weight'] = weight
	        cds['Isoelectric Point'] = isoelectric_point
	    return cds_info

	# Function to generate genome diagram
	def create_genome_diagram(genbank_content, output_file_path, colors_dict):
	    """Draw a circular diagram of a GenBank record and write it as SVG.

	    Args:
	        genbank_content: Text of a single-record GenBank file.
	        output_file_path: Destination path for the SVG file.
	        colors_dict: Mapping from feature type (e.g. ``'gene'``, ``'CDS'``)
	            to a hex colour string. Features whose type is not a key of
	            this mapping are not drawn.

	    Returns:
	        None. The diagram is written to ``output_file_path``.
	    """
	    record = SeqIO.read(StringIO(genbank_content), "genbank")
	    gd_diagram = GenomeDiagram.Diagram(record.id)
	    gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
	    gd_feature_set = gd_track_for_features.new_set()

	    for feature in record.features:
	        feature_type = feature.type
	        if feature_type in colors_dict:
	            color = colors.HexColor(colors_dict[feature_type])
	            gd_feature_set.add_feature(
	                feature, color=color, label=True, label_size=10, label_angle=0
	            )

	    # Page size is 20 cm x 20 cm. The original `20cm` is not valid Python;
	    # the reportlab unit has to be applied by multiplication.
	    gd_diagram.draw(
	        format="circular",
	        circular=True,
	        pagesize=(20 * cm, 20 * cm),
	        start=0,
	        end=len(record),
	        circle_core=0.7,
	    )
	    gd_diagram.write(output_file_path, "SVG")



	# Streamlit UI setup
	# Page configuration and the upload widget that drives the rest of the script.
	st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
	uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

	if uploaded_file is not None:
	    # SeqIO.read raises ValueError unless the file holds exactly one record,
	    # and a failed UTF-8 decode is a ValueError too. Show a readable message
	    # and stop this run instead of surfacing a traceback.
	    try:
	        (organism, gene_info, cds_info, gc_content,
	         sequence_length, feature_types, sequence) = parse_genbank(uploaded_file)
	    except ValueError as exc:
	        st.error(f"Could not parse the uploaded GenBank file: {exc}")
	        st.stop()

	    cds_info = add_protein_features(cds_info)
	    gene_df = pd.DataFrame(gene_info)
	    cds_df = pd.DataFrame(cds_info)

	    # Sidebar: analysis parameters followed by diagram colour options
	    with st.sidebar:
	        st.title('Genomic Data Dashboard')
	        st.write(f'Organism: {organism}')
	        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
	        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)

	        # Colours used for each feature type in the genome diagram
	        color_gene = st.color_picker("Pick a color for genes", '#ff9999')
	        color_cds = st.color_picker("Pick a color for CDS", '#66b3ff')
	        color_trna = st.color_picker("Pick a color for tRNA", '#99ff99')
	        color_rrna = st.color_picker("Pick a color for rRNA", '#ffcc99')
	        # TODO: option to select what to display on the diagram
	        #display_options = st.multiselect("Select features to display:", ['gene', 'tRNA', 'CDS', 'rRNA'], default=['gene', 'CDS'])

	    # Main content: summary on the left, charts on the right
	    col1, col2 = st.columns(2)
	    with col1:
	        st.markdown('### General Information')
	        st.write(f'Organism: {organism}')
	        st.write(f'Sequence Length: {sequence_length} bp')
	        st.write(f'GC Content: {gc_content:.2f}%')
	        st.write(f'Number of Genes: {len(gene_df)}')
	        st.write(f'Number of Coding Sequences (CDS): {len(cds_df)}')
	        st.markdown('### Feature Counts')
	        for feature_type, count in feature_types.items():
	            st.write(f"{feature_type}: {count}")

	    with col2:
	        st.markdown('### GC Content Over Genome')
	        gc_content_over_genome = calculate_gc_content(sequence, window_size)
	        # One row per window; Position is the window's start offset.
	        gc_frame = pd.DataFrame({
	            'GC Content': gc_content_over_genome,
	            'Position': np.arange(len(gc_content_over_genome)) * window_size,
	        })
	        gc_chart = alt.Chart(gc_frame).mark_line().encode(
	            x='Position:Q',
	            y='GC Content:Q'
	        ).properties(height=200)
	        st.altair_chart(gc_chart, use_container_width=True)

	        st.markdown('### K-mer Analysis')
	        kmers = calculate_kmers(sequence, k)
	        # Show only the 20 most frequent k-mers.
	        kmer_frame = pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency'])
	        st.bar_chart(kmer_frame.sort_values('Frequency', ascending=False).head(20))

	    # Colors dictionary for genome diagram
	    feature_colors = {
	        'gene': color_gene,
	        'CDS': color_cds,
	        'tRNA': color_trna,
	        'rRNA': color_rrna,
	        # Add more as needed
	    }

	    # Generate and display genome diagram with user-selected feature colors
	    output_file_path_svg = os.path.join(temp_dir, "genome_diagram.svg")
	    create_genome_diagram(uploaded_file.getvalue().decode("utf-8"), output_file_path_svg, feature_colors)
	    st.image(output_file_path_svg, caption='Genome Diagram')

	    # Additional Information
	    with st.expander("View All Genes"):
	        st.dataframe(gene_df)
	    with st.expander("View All Coding Sequences"):
	        cds_columns = ['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']
	        if cds_df.empty:
	            # A file without CDS features gives a frame with no columns;
	            # selecting them would raise KeyError, so show an empty table.
	            st.dataframe(pd.DataFrame(columns=cds_columns))
	        else:
	            st.dataframe(cds_df[cds_columns])
	else:
	    st.warning("Please upload a GenBank file.")