File size: 7,183 Bytes
51b8fa9 2ed9be8 b39ac01 62bc997 2ed9be8 674c0a1 07777e9 2ed9be8 51b8fa9 6402171 674c0a1 a0b2313 51b8fa9 07777e9 2ed9be8 51b8fa9 2ed9be8 07777e9 2ed9be8 07777e9 51b8fa9 af5dcbb 07777e9 2ed9be8 a2c2fa5 04fd093 a2c2fa5 04fd093 b39ac01 04fd093 9171b34 a2c2fa5 2ed9be8 07777e9 51b8fa9 07777e9 51b8fa9 07777e9 2ed9be8 51b8fa9 2ed9be8 07777e9 51b8fa9 62bc997 a2c2fa5 9171b34 30d50dc 62bc997 07777e9 51b8fa9 07777e9 2ed9be8 07777e9 51b8fa9 07777e9 2ed9be8 07777e9 51b8fa9 9171b34 a2c2fa5 9171b34 a2c2fa5 9171b34 a2c2fa5 b39ac01 a2c2fa5 d79da87 62bc997 07777e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# Import libraries
import streamlit as st
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Graphics import GenomeDiagram
from reportlab.lib.colors import Color, lightblue, blue
from reportlab.lib import colors
from reportlab.lib.units import cm
from io import StringIO
from collections import Counter
import numpy as np
import altair as alt
import os
# Ensure the 'temp' directory exists for saving temporary files.
# "temp" is a relative path, so it resolves against the current working
# directory of the process. exist_ok=True keeps this from raising when the
# directory is already present (e.g. on every rerun of the script).
temp_dir = "temp"
os.makedirs(temp_dir, exist_ok=True)
# Function to parse GenBank file
def parse_genbank(uploaded_file):
    """Parse a single-record GenBank upload into summary data.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw file
            bytes; the bytes are decoded as UTF-8.

    Returns:
        A 7-tuple ``(organism, gene_info, cds_info, gc_content,
        sequence_length, feature_types, sequence)``:

        * ``organism``: the record's ``organism`` annotation, or ``'N/A'``
          when the record does not carry one.
        * ``gene_info``: one dict per ``gene`` feature with the keys
          ``'Gene'``, ``'Length'`` and ``'Location'``.
        * ``cds_info``: one dict per ``CDS`` feature with the keys
          ``'Gene'``, ``'Protein'``, ``'Length'`` and ``'Location'``.
          Missing qualifiers are reported as ``'N/A'``.
        * ``gc_content``: percentage of ``G`` and ``C`` characters in the
          whole sequence (``0.0`` for an empty sequence).
        * ``sequence_length``: number of characters in the sequence.
        * ``feature_types``: ``Counter`` mapping feature type to count.
        * ``sequence``: the record sequence as a plain ``str``.

    Raises:
        UnicodeDecodeError: If the upload is not valid UTF-8.
        ValueError: Propagated from ``SeqIO.read`` when the content does not
            hold exactly one GenBank record.
    """
    text = uploaded_file.getvalue().decode("utf-8")
    record = SeqIO.read(StringIO(text), "genbank")

    # Not every record carries an organism annotation; fall back to the same
    # 'N/A' placeholder used for missing qualifiers instead of raising.
    organism = record.annotations.get('organism', 'N/A')

    features = record.features
    feature_types = Counter(feature.type for feature in features)
    gene_features = [feature for feature in features if feature.type == "gene"]
    cds_features = [feature for feature in features if feature.type == "CDS"]

    gene_info = [
        {
            'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
            'Length': len(gene),
            'Location': str(gene.location),
        }
        for gene in gene_features
    ]
    cds_info = [
        {
            'Gene': cds_feature.qualifiers.get('gene', ['N/A'])[0],
            'Protein': cds_feature.qualifiers.get('translation', ['N/A'])[0],
            'Length': len(cds_feature),
            'Location': str(cds_feature.location),
        }
        for cds_feature in cds_features
    ]

    # Materialise the sequence once; it is reused for counting, for the
    # length and as a return value.
    sequence = str(record.seq)
    sequence_length = len(sequence)
    gc_count = sequence.count('G') + sequence.count('C')
    # Guard the division so a record with an empty sequence still parses.
    gc_content = gc_count / sequence_length * 100 if sequence_length else 0.0

    return organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence
# Additional functions (calculate_gc_content, calculate_kmers, add_protein_features)
# Function to calculate GC content over genome
def calculate_gc_content(sequence, window_size=1000):
    """Return the GC percentage of consecutive, non-overlapping windows.

    Windows start at offsets ``0, window_size, 2 * window_size, ...``. Only
    full windows are measured: a trailing fragment shorter than
    ``window_size`` is ignored, and a sequence shorter than one window
    yields an empty list.

    Args:
        sequence: Nucleotide sequence as a string. Matching is
            case-insensitive, so ``'g'``/``'c'`` count as GC.
        window_size: Number of characters per window; must be positive.

    Returns:
        A list of floats, one percentage (0-100) per full window.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    gc_content = []
    for start in range(0, len(sequence) - window_size + 1, window_size):
        # Slice once per window and normalise case before counting.
        window = sequence[start:start + window_size].upper()
        gc_count = window.count('G') + window.count('C')
        gc_content.append(gc_count / window_size * 100)
    return gc_content
# Function to calculate k-mers
def calculate_kmers(sequence, k):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Args:
        sequence: The string to scan.
        k: Length of each substring (k-mer).

    Returns:
        A ``Counter`` mapping each k-mer to its number of occurrences, with
        keys in order of first appearance.
    """
    kmers = Counter()
    window_count = len(sequence) - k + 1
    for start in range(window_count):
        kmers[sequence[start:start + k]] += 1
    return kmers
# Function to add molecular weight and isoelectric point to CDS information
def add_protein_features(cds_info):
    """Add molecular weight and isoelectric point to each CDS entry.

    Every dict in ``cds_info`` is updated in place with the keys
    ``'Molecular Weight'`` and ``'Isoelectric Point'``. Both are set to
    ``'N/A'`` when the entry has no usable protein sequence: the
    ``'Protein'`` key is missing, empty or ``'N/A'``, or the sequence
    cannot be analysed.

    Args:
        cds_info: List of dicts, each optionally holding a ``'Protein'``
            amino-acid string.

    Returns:
        The same ``cds_info`` list, with the two keys added to every entry.
    """
    for entry in cds_info:
        protein = entry.get('Protein', 'N/A')
        molecular_weight = 'N/A'
        isoelectric_point = 'N/A'
        if protein and protein != 'N/A':
            try:
                analysis = ProteinAnalysis(protein)
                molecular_weight = analysis.molecular_weight()
                isoelectric_point = analysis.isoelectric_point()
            except ValueError:
                # Biopython's molecular_weight() raises ValueError for
                # letters outside the unambiguous protein alphabet (e.g.
                # 'X'). Report such translations as unavailable rather than
                # aborting the whole table.
                molecular_weight = 'N/A'
                isoelectric_point = 'N/A'
        entry['Molecular Weight'] = molecular_weight
        entry['Isoelectric Point'] = isoelectric_point
    return cds_info
# Function to generate genome diagram
def create_genome_diagram(genbank_content, output_file_path, colors_dict):
    """Render a circular feature diagram of a GenBank record as an SVG file.

    Args:
        genbank_content: Text of a GenBank file holding a single record.
        output_file_path: Path the SVG is written to.
        colors_dict: Maps a feature type (e.g. ``'gene'``) to a hex colour
            string. Features whose type is not a key are left out.
    """
    genbank_record = SeqIO.read(StringIO(genbank_content), "genbank")

    diagram = GenomeDiagram.Diagram(genbank_record.id)
    feature_track = diagram.new_track(1, name="Annotated Features")
    feature_set = feature_track.new_set()

    for feature in genbank_record.features:
        # Only feature types that were given a colour are drawn.
        if feature.type not in colors_dict:
            continue
        fill = colors.HexColor(colors_dict[feature.type])
        feature_set.add_feature(
            feature,
            color=fill,
            label=True,
            label_size=10,
            label_angle=0,
        )

    diagram.draw(
        format="circular",
        circular=True,
        pagesize=(20*cm, 20*cm),
        start=0,
        end=len(genbank_record),
        circle_core=0.7,
    )
    diagram.write(output_file_path, "SVG")
# Streamlit UI setup
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

if uploaded_file is not None:
    # Show a readable message instead of a traceback for a bad upload:
    # ValueError covers undecodable bytes and files without exactly one
    # record, KeyError a missing annotation, and ZeroDivisionError an
    # empty sequence.
    try:
        (organism, gene_info, cds_info, gc_content, sequence_length,
         feature_types, sequence) = parse_genbank(uploaded_file)
    except (ValueError, KeyError, ZeroDivisionError) as exc:
        st.error(f"Could not parse the uploaded GenBank file: {exc}")
        st.stop()

    cds_info = add_protein_features(cds_info)

    # Explicit column lists keep both frames well-formed when the record has
    # no gene or CDS features. A DataFrame built from an empty list has no
    # columns, and the column selection in the CDS expander would then raise
    # KeyError.
    gene_df = pd.DataFrame(gene_info, columns=['Gene', 'Length', 'Location'])
    cds_df = pd.DataFrame(
        cds_info,
        columns=['Gene', 'Protein', 'Length', 'Location',
                 'Molecular Weight', 'Isoelectric Point'],
    )

    # Sidebar: analysis parameters followed by diagram colours
    with st.sidebar:
        st.title('Genomic Data Dashboard')
        st.write(f'Organism: {organism}')
        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)

        # Sidebar options for diagram customization
        color_gene = st.color_picker("Pick a color for genes", '#ff9999')
        color_cds = st.color_picker("Pick a color for CDS", '#66b3ff')
        color_trna = st.color_picker("Pick a color for tRNA", '#99ff99')
        color_rrna = st.color_picker("Pick a color for rRNA", '#ffcc99')
        # TODO: optionally let the user choose which feature types are drawn
        # (e.g. a multiselect over 'gene', 'tRNA', 'CDS', 'rRNA').

    # Main content
    col1, col2 = st.columns(2)
    with col1:
        st.markdown('### General Information')
        st.write(f'**Organism:** {organism}')
        st.write(f'**Sequence Length:** {sequence_length} bp')
        st.write(f'**GC Content:** {gc_content:.2f}%')
        st.write(f'**Number of Genes:** {len(gene_df)}')
        st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')
        st.markdown('### Feature Counts')
        for feature_type, count in feature_types.items():
            st.write(f"**{feature_type}:** {count}")

    with col2:
        st.markdown('### GC Content Over Genome')
        gc_content_over_genome = calculate_gc_content(sequence, window_size)
        if gc_content_over_genome:
            gc_frame = pd.DataFrame({
                'GC Content': gc_content_over_genome,
                # Each value is plotted at the start offset of its window.
                'Position': np.arange(len(gc_content_over_genome)) * window_size,
            })
            gc_chart = alt.Chart(gc_frame).mark_line().encode(
                x='Position:Q',
                y='GC Content:Q'
            ).properties(height=200)
            st.altair_chart(gc_chart, use_container_width=True)
        else:
            # No full window fits, so explain the gap instead of drawing an
            # empty chart.
            st.info("The sequence is shorter than the selected window size; no GC profile to display.")

        st.markdown('### K-mer Analysis')
        kmers = calculate_kmers(sequence, k)
        kmer_frame = pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency'])
        # Show only the 20 most frequent k-mers.
        st.bar_chart(kmer_frame.sort_values('Frequency', ascending=False).head(20))

    # Colors dictionary for genome diagram
    feature_colors = {
        'gene': color_gene,
        'CDS': color_cds,
        'tRNA': color_trna,
        'rRNA': color_rrna,
        # Add more as needed
    }

    # Generate and display genome diagram with user-selected feature colors
    output_file_path_svg = os.path.join(temp_dir, "genome_diagram.svg")
    create_genome_diagram(uploaded_file.getvalue().decode("utf-8"), output_file_path_svg, feature_colors)
    st.image(output_file_path_svg, caption='Genome Diagram')

    # Additional Information
    with st.expander("View All Genes"):
        st.dataframe(gene_df)
    with st.expander("View All Coding Sequences"):
        st.dataframe(cds_df[['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']])
else:
    st.warning("Please upload a GenBank file.")
|