Update app.py
Browse files
app.py
CHANGED
|
@@ -1,84 +1,113 @@
|
|
| 1 |
# Import libraries
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
-
from Bio import SeqIO
|
| 5 |
-
import altair as alt
|
| 6 |
-
import numpy as np
|
| 7 |
from io import StringIO
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
# Function to parse GenBank file
|
| 10 |
# Function to parse GenBank file
|
| 11 |
def parse_genbank(uploaded_file):
|
| 12 |
-
# Convert binary to text for SeqIO
|
| 13 |
stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
|
| 14 |
record = SeqIO.read(stringio, "genbank")
|
| 15 |
organism = record.annotations['organism']
|
| 16 |
features = record.features
|
|
|
|
|
|
|
| 17 |
genes, cds = [], []
|
| 18 |
for feature in features:
|
| 19 |
if feature.type == "gene":
|
| 20 |
genes.append(feature)
|
| 21 |
elif feature.type == "CDS":
|
| 22 |
cds.append(feature)
|
| 23 |
-
|
| 24 |
gene_info = [{
|
| 25 |
'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
|
| 26 |
'Length': len(gene),
|
| 27 |
'Location': str(gene.location)} for gene in genes]
|
|
|
|
| 28 |
cds_info = [{
|
| 29 |
'Gene': cds.qualifiers.get('gene', ['N/A'])[0],
|
| 30 |
'Protein': cds.qualifiers.get('translation', ['N/A'])[0],
|
| 31 |
'Length': len(cds),
|
| 32 |
'Location': str(cds.location)} for cds in cds]
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
|
|
|
|
| 40 |
if uploaded_file is not None:
|
| 41 |
-
organism, gene_info, cds_info, gc_content, sequence_length = parse_genbank(uploaded_file)
|
|
|
|
| 42 |
gene_df = pd.DataFrame(gene_info)
|
| 43 |
cds_df = pd.DataFrame(cds_info)
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
st.
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
st.write(f'Organism: {organism}')
|
| 52 |
-
# You can add more interactive widgets here as needed
|
| 53 |
|
| 54 |
-
# Main content
|
| 55 |
-
col1, col2 = st.columns(2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
with
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
if gene_selected:
|
| 69 |
-
selected_gene = gene_df[gene_df['Gene'] == gene_selected]
|
| 70 |
-
if not selected_gene.empty:
|
| 71 |
-
st.write(f"**Gene Details:** {selected_gene.to_dict('records')[0]}")
|
| 72 |
-
selected_cds = cds_df[cds_df['Gene'] == gene_selected]
|
| 73 |
-
if not selected_cds.empty:
|
| 74 |
-
st.write(f"**CDS Details:** {selected_cds.to_dict('records')[0]}")
|
| 75 |
-
|
| 76 |
-
# Display data tables (optional)
|
| 77 |
-
with st.expander("View All Genes"):
|
| 78 |
-
st.dataframe(gene_df)
|
| 79 |
-
with st.expander("View All Coding Sequences"):
|
| 80 |
-
st.dataframe(cds_df)
|
| 81 |
-
|
| 82 |
-
# You can extend the app with more functionalities like visualizations,
|
| 83 |
-
# k-mer analysis, or other genomic metrics based on your requirements.
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Import libraries
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
+
from Bio import SeqIO, SeqUtils
|
|
|
|
|
|
|
| 5 |
from io import StringIO
|
| 6 |
+
from collections import Counter
|
| 7 |
+
import numpy as np
|
| 8 |
+
import altair as alt
|
| 9 |
|
|
|
|
| 10 |
# Function to parse GenBank file
|
| 11 |
def parse_genbank(uploaded_file):
    """Parse an uploaded single-record GenBank file into summary data.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw file
            content as UTF-8 encoded bytes (here, the value returned by
            ``st.file_uploader``).

    Returns:
        A 7-tuple ``(organism, gene_info, cds_info, gc_content,
        sequence_length, feature_types, sequence)``:

        * ``organism`` -- the record's ``organism`` annotation, or ``'N/A'``
          when the record has none.
        * ``gene_info`` -- one dict per ``gene`` feature with the keys
          ``'Gene'``, ``'Length'`` and ``'Location'``.
        * ``cds_info`` -- one dict per ``CDS`` feature with the keys
          ``'Gene'``, ``'Protein'``, ``'Length'`` and ``'Location'``.
        * ``gc_content`` -- percentage (0-100) of G and C bases in the
          record, counted case-insensitively; ``0.0`` for an empty sequence.
        * ``sequence_length`` -- number of bases in the record.
        * ``feature_types`` -- ``Counter`` mapping feature type to count.
        * ``sequence`` -- the record sequence as a plain string.

    Raises:
        UnicodeDecodeError: If the uploaded bytes are not valid UTF-8.
        ValueError: NOTE(review): ``SeqIO.read`` is documented to raise this
            unless the file holds exactly one record -- confirm against the
            installed Biopython version.
    """
    # SeqIO needs a text handle, so decode the uploaded bytes first.
    handle = StringIO(uploaded_file.getvalue().decode("utf-8"))
    record = SeqIO.read(handle, "genbank")

    # Fall back to the same 'N/A' placeholder used for missing qualifiers.
    organism = record.annotations.get('organism', 'N/A')
    features = record.features
    feature_types = Counter(feature.type for feature in features)

    genes, cds = [], []
    for feature in features:
        if feature.type == "gene":
            genes.append(feature)
        elif feature.type == "CDS":
            cds.append(feature)

    gene_info = [
        {
            'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
            'Length': len(gene),
            'Location': str(gene.location),
        }
        for gene in genes
    ]

    # The loop variable is deliberately not called ``cds`` so it does not
    # shadow the list it iterates over.
    cds_info = [
        {
            'Gene': feature.qualifiers.get('gene', ['N/A'])[0],
            'Protein': feature.qualifiers.get('translation', ['N/A'])[0],
            'Length': len(feature),
            'Location': str(feature.location),
        }
        for feature in cds
    ]

    # Convert the sequence once and count on an upper-cased copy so that
    # soft-masked (lowercase) bases still contribute to the GC percentage.
    sequence = str(record.seq)
    normalized = sequence.upper()
    gc_count = normalized.count('G') + normalized.count('C')
    gc_content = gc_count / len(sequence) * 100 if sequence else 0.0

    return organism, gene_info, cds_info, gc_content, len(sequence), feature_types, sequence
|
| 39 |
|
| 40 |
+
# Function to calculate GC content over genome
|
| 41 |
+
def calculate_gc_content(sequence, window_size=1000):
    """Return the GC percentage of each consecutive full window of *sequence*.

    The sequence is cut into non-overlapping windows of ``window_size``
    characters starting at index 0. A trailing fragment shorter than
    ``window_size`` is ignored, so a sequence shorter than one window yields
    an empty list. Bases are counted case-insensitively.

    Args:
        sequence: Nucleotide sequence as a string.
        window_size: Number of characters per window; must be at least 1.

    Returns:
        A list of floats in the range 0-100, one per full window, in order.

    Raises:
        ValueError: If ``window_size`` is less than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Upper-case once so lowercase (soft-masked) bases are counted too.
    normalized = sequence.upper()
    windows = (
        normalized[start:start + window_size]
        for start in range(0, len(normalized) - window_size + 1, window_size)
    )
    return [
        (window.count('G') + window.count('C')) / window_size * 100
        for window in windows
    ]
|
| 47 |
+
|
| 48 |
+
# Function to calculate k-mers
|
| 49 |
+
def calculate_kmers(sequence, k):
    """Count every overlapping substring of length *k* in *sequence*.

    Args:
        sequence: Sequence to scan, as a string. Counting is case-sensitive.
        k: Length of each k-mer; must be at least 1.

    Returns:
        A ``Counter`` mapping each k-mer to its number of occurrences. It is
        empty when the sequence is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Feed the Counter from a generator so the k-mers are never all held in
    # an intermediate list.
    return Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))
|
| 52 |
|
| 53 |
+
# Function to add molecular weight and isoelectric point to CDS information
|
| 54 |
+
def add_protein_features(cds_info):
    """Add molecular weight and isoelectric point to each CDS record.

    Every dict in *cds_info* is updated in place with the keys
    ``'Molecular Weight'`` and ``'Isoelectric Point'``. Records whose
    ``'Protein'`` value is the placeholder ``'N/A'`` get ``'N/A'`` for both.

    Args:
        cds_info: List of dicts, each holding a ``'Protein'`` key with the
            translated amino-acid sequence or ``'N/A'``.

    Returns:
        The same list object that was passed in, after being updated.
    """
    # ``from Bio import SeqUtils`` does not load the IsoelectricPoint
    # submodule, so reaching it as ``SeqUtils.IsoelectricPoint`` is not
    # reliable; import the class explicitly.
    from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint

    for cds in cds_info:
        protein = cds['Protein']
        if protein == 'N/A':
            cds['Molecular Weight'] = 'N/A'
            cds['Isoelectric Point'] = 'N/A'
            continue

        try:
            cds['Molecular Weight'] = SeqUtils.molecular_weight(protein, seq_type='protein')
        except ValueError:
            # NOTE(review): molecular_weight is documented to reject letters
            # it has no weight for (e.g. ambiguous 'X'); keep the row and
            # mark the value as unavailable instead of failing every record.
            cds['Molecular Weight'] = 'N/A'

        cds['Isoelectric Point'] = IsoelectricPoint(protein).pi()

    return cds_info
|
| 63 |
+
|
| 64 |
+
# Streamlit UI
# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

if uploaded_file is not None:
    try:
        organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
    except (ValueError, KeyError) as exc:
        # ValueError also covers UnicodeDecodeError (its subclass); show a
        # readable message instead of a raw traceback and end this run.
        st.error(f"Could not parse the uploaded GenBank file: {exc}")
        st.stop()

    cds_info = add_protein_features(cds_info)
    gene_df = pd.DataFrame(gene_info)
    cds_df = pd.DataFrame(cds_info)

    # Sidebar: analysis parameters chosen by the user.
    with st.sidebar:
        st.title('Genomic Data Dashboard')
        st.write(f'Organism: {organism}')
        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)

    # Main content
    col1, col2 = st.columns(2)

    with col1:
        st.markdown('### General Information')
        st.write(f'**Organism:** {organism}')
        st.write(f'**Sequence Length:** {sequence_length} bp')
        st.write(f'**GC Content:** {gc_content:.2f}%')
        st.write(f'**Number of Genes:** {len(gene_df)}')
        st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')
        st.markdown('### Feature Counts')
        for feature_type, count in feature_types.items():
            st.write(f"**{feature_type}:** {count}")

    with col2:
        st.markdown('### GC Content Over Genome')
        gc_content_over_genome = calculate_gc_content(sequence, window_size)
        # Each value is plotted at the start coordinate of its window.
        gc_frame = pd.DataFrame({
            'GC Content': gc_content_over_genome,
            'Position': np.arange(len(gc_content_over_genome)) * window_size,
        })
        gc_chart = alt.Chart(gc_frame).mark_line().encode(
            x='Position:Q',
            y='GC Content:Q'
        ).properties(height=200)
        st.altair_chart(gc_chart, use_container_width=True)

        st.markdown('### K-mer Analysis')
        kmers = calculate_kmers(sequence, k)
        # Only the 20 most frequent k-mers are charted.
        st.bar_chart(pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20))

    # Additional Information
    with st.expander("View All Genes"):
        st.dataframe(gene_df)
    with st.expander("View All Coding Sequences"):
        # reindex (rather than cds_df[[...]]) keeps this working when the
        # record has no CDS features and cds_df therefore has no columns.
        st.dataframe(cds_df.reindex(columns=['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']))
else:
    st.warning("Please upload a GenBank file.")
|