|
|
|
|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
from Bio import SeqIO |
|
|
from Bio.SeqUtils.ProtParam import ProteinAnalysis |
|
|
from Bio.Graphics import GenomeDiagram |
|
|
from reportlab.lib.colors import Color, lightblue, blue |
|
|
from reportlab.lib import colors |
|
|
from reportlab.lib.units import cm |
|
|
from io import StringIO |
|
|
from collections import Counter |
|
|
import numpy as np |
|
|
import altair as alt |
|
|
import os |
|
|
|
|
|
|
|
|
# Scratch directory (relative to the current working directory) that receives
# the generated genome diagram. exist_ok=True makes the call a no-op when the
# directory is already present.
temp_dir = "temp"
os.makedirs(name=temp_dir, exist_ok=True)
|
|
|
|
|
|
|
|
def parse_genbank(uploaded_file):
    """Parse an uploaded single-record GenBank file into summary data.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the file content
            as UTF-8 encoded bytes.

    Returns:
        A 7-tuple ``(organism, gene_info, cds_info, gc_content,
        sequence_length, feature_types, sequence)``:

        * organism: value of the record's ``organism`` annotation, or
          ``'N/A'`` when the record has none.
        * gene_info: one dict per ``gene`` feature with keys ``Gene``,
          ``Length`` and ``Location``.
        * cds_info: one dict per ``CDS`` feature with keys ``Gene``,
          ``Protein`` (the ``translation`` qualifier or ``'N/A'``),
          ``Length`` and ``Location``.
        * gc_content: percentage of ``G`` and ``C`` characters in the
          sequence (``0.0`` for an empty sequence).
        * sequence_length: number of characters in the sequence.
        * feature_types: ``Counter`` mapping feature type to occurrence count.
        * sequence: the record sequence as a plain string.

    Raises:
        ValueError: Propagated from decoding or from ``SeqIO.read`` when the
            content cannot be read as a single GenBank record.
    """
    handle = StringIO(uploaded_file.getvalue().decode("utf-8"))
    record = SeqIO.read(handle, "genbank")

    # A missing organism annotation should not abort parsing; use the same
    # 'N/A' placeholder the qualifier lookups below use.
    organism = record.annotations.get('organism', 'N/A')
    features = record.features
    feature_types = Counter(feature.type for feature in features)

    genes = [feature for feature in features if feature.type == "gene"]
    cds_features = [feature for feature in features if feature.type == "CDS"]

    gene_info = [
        {
            'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
            'Length': len(gene),
            'Location': str(gene.location),
        }
        for gene in genes
    ]

    cds_info = [
        {
            'Gene': cds_feature.qualifiers.get('gene', ['N/A'])[0],
            'Protein': cds_feature.qualifiers.get('translation', ['N/A'])[0],
            'Length': len(cds_feature),
            'Location': str(cds_feature.location),
        }
        for cds_feature in cds_features
    ]

    # Convert the sequence to a string once and reuse it for every metric.
    sequence = str(record.seq)
    sequence_length = len(sequence)
    if sequence_length:
        gc_content = (sequence.count('G') + sequence.count('C')) / sequence_length * 100
    else:
        # Avoid ZeroDivisionError on a record without sequence data.
        gc_content = 0.0

    return organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence
|
|
|
|
|
|
|
|
|
|
|
def calculate_gc_content(sequence, window_size=1000):
    """Return the GC percentage of each consecutive, non-overlapping window.

    Args:
        sequence: Nucleotide sequence as a string. Matching is
            case-insensitive, so ``'g'``/``'c'`` count as well.
        window_size: Number of characters per window; must be positive.

    Returns:
        A list of floats in the range 0-100, one per complete window, in
        sequence order. A trailing stretch shorter than ``window_size`` is
        not reported, so a sequence shorter than one window yields ``[]``.

    Raises:
        ValueError: If ``window_size`` is zero or negative.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    gc_content = []
    for start in range(0, len(sequence) - window_size + 1, window_size):
        # Slice once per window and normalise case before counting.
        window = sequence[start:start + window_size].upper()
        gc_count = window.count('G') + window.count('C')
        gc_content.append(gc_count / window_size * 100)
    return gc_content
|
|
|
|
|
|
|
|
def calculate_kmers(sequence, k):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Args:
        sequence: The string to scan.
        k: Length of each substring.

    Returns:
        A ``Counter`` mapping each length-``k`` substring to the number of
        start positions at which it occurs. The counter is empty when
        ``sequence`` is shorter than ``k``.
    """
    counts = Counter()
    window_count = len(sequence) - k + 1
    for start in range(window_count):
        counts[sequence[start:start + k]] += 1
    return counts
|
|
|
|
|
|
|
|
def add_protein_features(cds_info):
    """Add molecular weight and isoelectric point to each CDS entry.

    Args:
        cds_info: List of dicts, each holding a ``'Protein'`` key with either
            an amino-acid string or the placeholder ``'N/A'``.

    Returns:
        The same list object. Every dict is updated in place with
        ``'Molecular Weight'`` and ``'Isoelectric Point'`` keys; the value is
        ``'N/A'`` when it cannot be computed.
    """
    for entry in cds_info:
        protein = entry['Protein']
        if protein == 'N/A':
            entry['Molecular Weight'] = 'N/A'
            entry['Isoelectric Point'] = 'N/A'
            continue

        analysis = ProteinAnalysis(protein)
        try:
            entry['Molecular Weight'] = analysis.molecular_weight()
        except ValueError:
            # Biopython rejects residues without a defined mass (e.g. 'X');
            # one such translation must not abort processing of the rest.
            entry['Molecular Weight'] = 'N/A'
        entry['Isoelectric Point'] = analysis.isoelectric_point()
    return cds_info
|
|
|
|
|
|
|
|
def create_genome_diagram(genbank_content, output_file_path, colors_dict):
    """Draw the record's features as a circular diagram and save it as SVG.

    Args:
        genbank_content: Text of a single-record GenBank file.
        output_file_path: Destination path for the SVG output.
        colors_dict: Maps feature type to a colour string accepted by
            ``colors.HexColor``. Features whose type is not a key of this
            mapping are left out of the diagram.
    """
    record = SeqIO.read(StringIO(genbank_content), "genbank")

    diagram = GenomeDiagram.Diagram(record.id)
    track = diagram.new_track(1, name="Annotated Features")
    feature_set = track.new_set()

    for feature in record.features:
        kind = feature.type
        if kind not in colors_dict:
            continue
        feature_set.add_feature(
            feature,
            color=colors.HexColor(colors_dict[kind]),
            label=True,
            label_size=10,
            label_angle=0,
        )

    page = (20*cm, 20*cm)
    diagram.draw(format="circular", circular=True, pagesize=page, start=0, end=len(record), circle_core=0.7)
    diagram.write(output_file_path, "SVG")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Page script: upload a GenBank file, then show summary statistics, a GC
# content profile, the most frequent k-mers, a genome diagram and the
# gene / CDS tables.
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

if uploaded_file is not None:
    try:
        (organism, gene_info, cds_info, gc_content,
         sequence_length, feature_types, sequence) = parse_genbank(uploaded_file)
    except ValueError as exc:
        # Covers undecodable bytes (UnicodeDecodeError is a ValueError) and
        # content SeqIO.read cannot load as exactly one GenBank record.
        st.error(f"Could not read the uploaded GenBank file: {exc}")
        st.stop()

    cds_info = add_protein_features(cds_info)
    gene_df = pd.DataFrame(gene_info)
    cds_df = pd.DataFrame(cds_info)

    # Sidebar: analysis parameters followed by the diagram colour pickers.
    with st.sidebar:
        st.title('Genomic Data Dashboard')
        st.write(f'Organism: {organism}')
        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)

        color_gene = st.color_picker("Pick a color for genes", '#ff9999')
        color_cds = st.color_picker("Pick a color for CDS", '#66b3ff')
        color_trna = st.color_picker("Pick a color for tRNA", '#99ff99')
        color_rrna = st.color_picker("Pick a color for rRNA", '#ffcc99')

    col1, col2 = st.columns(2)
    with col1:
        st.markdown('### General Information')
        st.write(f'**Organism:** {organism}')
        st.write(f'**Sequence Length:** {sequence_length} bp')
        st.write(f'**GC Content:** {gc_content:.2f}%')
        st.write(f'**Number of Genes:** {len(gene_df)}')
        st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')
        st.markdown('### Feature Counts')
        for feature_type, count in feature_types.items():
            st.write(f"**{feature_type}:** {count}")

    with col2:
        st.markdown('### GC Content Over Genome')
        gc_content_over_genome = calculate_gc_content(sequence, window_size)
        # Each value is plotted at the start position of its window.
        gc_frame = pd.DataFrame({
            'GC Content': gc_content_over_genome,
            'Position': np.arange(len(gc_content_over_genome)) * window_size,
        })
        gc_chart = alt.Chart(gc_frame).mark_line().encode(
            x='Position:Q',
            y='GC Content:Q'
        ).properties(height=200)
        st.altair_chart(gc_chart, use_container_width=True)

        st.markdown('### K-mer Analysis')
        kmers = calculate_kmers(sequence, k)
        # Only the 20 most frequent k-mers are charted.
        st.bar_chart(pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20))

    feature_colors = {
        'gene': color_gene,
        'CDS': color_cds,
        'tRNA': color_trna,
        'rRNA': color_rrna,
    }

    # NOTE(review): the output file name is fixed, so every run writes to the
    # same path inside temp_dir.
    output_file_path_svg = os.path.join(temp_dir, "genome_diagram.svg")
    create_genome_diagram(uploaded_file.getvalue().decode("utf-8"), output_file_path_svg, feature_colors)
    st.image(output_file_path_svg, caption='Genome Diagram')

    with st.expander("View All Genes"):
        st.dataframe(gene_df)
    with st.expander("View All Coding Sequences"):
        if cds_df.empty:
            # A DataFrame built from an empty list has no columns, so the
            # column selection below would raise KeyError.
            st.info("No coding sequences (CDS) found in this record.")
        else:
            st.dataframe(cds_df[['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']])
else:
    st.warning("Please upload a GenBank file.")
|
|
|