Update app.py
Browse files
app.py
CHANGED
|
@@ -1,12 +1,20 @@
|
|
| 1 |
# Import libraries
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
-
from Bio import SeqIO
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from io import StringIO
|
| 6 |
from collections import Counter
|
| 7 |
import numpy as np
|
| 8 |
import altair as alt
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# Function to parse GenBank file
|
| 12 |
def parse_genbank(uploaded_file):
|
|
@@ -15,55 +23,30 @@ def parse_genbank(uploaded_file):
|
|
| 15 |
organism = record.annotations['organism']
|
| 16 |
features = record.features
|
| 17 |
feature_types = Counter([feature.type for feature in features])
|
| 18 |
-
|
| 19 |
genes, cds = [], []
|
| 20 |
for feature in features:
|
| 21 |
if feature.type == "gene":
|
| 22 |
genes.append(feature)
|
| 23 |
elif feature.type == "CDS":
|
| 24 |
cds.append(feature)
|
| 25 |
-
|
| 26 |
-
gene_info = [{
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
'Location': str(cds.location)} for cds in cds]
|
| 36 |
-
|
| 37 |
gc_content = (str(record.seq).count('G') + str(record.seq).count('C')) / len(record.seq) * 100
|
| 38 |
-
|
| 39 |
return organism, gene_info, cds_info, gc_content, len(record.seq), feature_types, str(record.seq)
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
def calculate_gc_content(sequence, window_size=1000):
|
| 43 |
-
gc_content = [
|
| 44 |
-
(sequence[i:i+window_size].count('G') + sequence[i:i+window_size].count('C')) / window_size * 100
|
| 45 |
-
for i in range(0, len(sequence) - window_size + 1, window_size)
|
| 46 |
-
]
|
| 47 |
-
return gc_content
|
| 48 |
-
|
| 49 |
-
# Function to calculate k-mers
|
| 50 |
-
def calculate_kmers(sequence, k):
|
| 51 |
-
kmers = Counter([sequence[i:i+k] for i in range(len(sequence) - k + 1)])
|
| 52 |
-
return kmers
|
| 53 |
-
|
| 54 |
-
# Function to add molecular weight and isoelectric point to CDS information
|
| 55 |
-
def add_protein_features(cds_info):
|
| 56 |
-
for cds in cds_info:
|
| 57 |
-
if cds['Protein'] != 'N/A':
|
| 58 |
-
prot_analysis = ProteinAnalysis(cds['Protein'])
|
| 59 |
-
cds['Molecular Weight'] = prot_analysis.molecular_weight()
|
| 60 |
-
cds['Isoelectric Point'] = prot_analysis.isoelectric_point()
|
| 61 |
-
else:
|
| 62 |
-
cds['Molecular Weight'] = 'N/A'
|
| 63 |
-
cds['Isoelectric Point'] = 'N/A'
|
| 64 |
-
return cds_info
|
| 65 |
|
| 66 |
-
# Function to
|
| 67 |
def create_genome_diagram(genbank_content, output_file_path):
|
| 68 |
record = SeqIO.read(StringIO(genbank_content), "genbank")
|
| 69 |
gd_diagram = GenomeDiagram.Diagram(record.id)
|
|
@@ -82,17 +65,16 @@ def create_genome_diagram(genbank_content, output_file_path):
|
|
| 82 |
gd_diagram.draw(format="circular", circular=True, pagesize=(20*cm, 20*cm), start=0, end=len(record), circle_core=0.7)
|
| 83 |
gd_diagram.write(output_file_path, "PNG")
|
| 84 |
|
| 85 |
-
|
| 86 |
-
# Streamlit UI
|
| 87 |
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
|
| 88 |
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
|
| 89 |
|
| 90 |
if uploaded_file is not None:
|
| 91 |
organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
|
| 92 |
-
cds_info = add_protein_features(cds_info)
|
| 93 |
gene_df = pd.DataFrame(gene_info)
|
| 94 |
cds_df = pd.DataFrame(cds_info)
|
| 95 |
-
|
| 96 |
# Sidebar
|
| 97 |
with st.sidebar:
|
| 98 |
st.title('Genomic Data Dashboard')
|
|
@@ -115,7 +97,7 @@ if uploaded_file is not None:
|
|
| 115 |
|
| 116 |
with col2:
|
| 117 |
st.markdown('### GC Content Over Genome')
|
| 118 |
-
gc_content_over_genome = calculate_gc_content(sequence, window_size)
|
| 119 |
gc_chart = alt.Chart(pd.DataFrame({'GC Content': gc_content_over_genome, 'Position': np.arange(len(gc_content_over_genome)) * window_size})).mark_line().encode(
|
| 120 |
x='Position:Q',
|
| 121 |
y='GC Content:Q'
|
|
@@ -123,16 +105,14 @@ if uploaded_file is not None:
|
|
| 123 |
st.altair_chart(gc_chart, use_container_width=True)
|
| 124 |
|
| 125 |
st.markdown('### K-mer Analysis')
|
| 126 |
-
kmers = calculate_kmers(sequence, k)
|
| 127 |
st.bar_chart(pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20))
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
| 132 |
|
| 133 |
-
# Display genome diagram
|
| 134 |
-
st.image(output_file_path, caption='Genome Diagram')
|
| 135 |
-
|
| 136 |
# Additional Information
|
| 137 |
with st.expander("View All Genes"):
|
| 138 |
st.dataframe(gene_df)
|
|
|
|
| 1 |
# Import libraries
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
+
from Bio import SeqIO
|
| 5 |
+
from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
| 6 |
+
from Bio.Graphics import GenomeDiagram
|
| 7 |
+
from reportlab.lib import colors
|
| 8 |
+
from reportlab.lib.units import cm
|
| 9 |
from io import StringIO
|
| 10 |
from collections import Counter
|
| 11 |
import numpy as np
|
| 12 |
import altair as alt
|
| 13 |
+
import os
|
| 14 |
+
|
| 15 |
+
# Ensure the 'temp' directory exists for saving temporary files
|
| 16 |
+
temp_dir = "temp"
|
| 17 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 18 |
|
| 19 |
# Function to parse GenBank file
|
| 20 |
def parse_genbank(uploaded_file):
|
|
|
|
| 23 |
organism = record.annotations['organism']
|
| 24 |
features = record.features
|
| 25 |
feature_types = Counter([feature.type for feature in features])
|
| 26 |
+
|
| 27 |
genes, cds = [], []
|
| 28 |
for feature in features:
|
| 29 |
if feature.type == "gene":
|
| 30 |
genes.append(feature)
|
| 31 |
elif feature.type == "CDS":
|
| 32 |
cds.append(feature)
|
| 33 |
+
|
| 34 |
+
gene_info = [{'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
|
| 35 |
+
'Length': len(gene),
|
| 36 |
+
'Location': str(gene.location)} for gene in genes]
|
| 37 |
+
|
| 38 |
+
cds_info = [{'Gene': cds.qualifiers.get('gene', ['N/A'])[0],
|
| 39 |
+
'Protein': cds.qualifiers.get('translation', ['N/A'])[0],
|
| 40 |
+
'Length': len(cds),
|
| 41 |
+
'Location': str(cds.location)} for cds in cds]
|
| 42 |
+
|
|
|
|
|
|
|
| 43 |
gc_content = (str(record.seq).count('G') + str(record.seq).count('C')) / len(record.seq) * 100
|
| 44 |
+
|
| 45 |
return organism, gene_info, cds_info, gc_content, len(record.seq), feature_types, str(record.seq)
|
| 46 |
|
| 47 |
+
# Additional functions (calculate_gc_content, calculate_kmers, add_protein_features) as before
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
+
# Function to generate genome diagram
|
| 50 |
def create_genome_diagram(genbank_content, output_file_path):
|
| 51 |
record = SeqIO.read(StringIO(genbank_content), "genbank")
|
| 52 |
gd_diagram = GenomeDiagram.Diagram(record.id)
|
|
|
|
| 65 |
gd_diagram.draw(format="circular", circular=True, pagesize=(20*cm, 20*cm), start=0, end=len(record), circle_core=0.7)
|
| 66 |
gd_diagram.write(output_file_path, "PNG")
|
| 67 |
|
| 68 |
+
# Streamlit UI setup
|
|
|
|
| 69 |
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
|
| 70 |
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
|
| 71 |
|
| 72 |
if uploaded_file is not None:
|
| 73 |
organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
|
| 74 |
+
cds_info = add_protein_features(cds_info) # Ensure this function is defined as per previous instructions
|
| 75 |
gene_df = pd.DataFrame(gene_info)
|
| 76 |
cds_df = pd.DataFrame(cds_info)
|
| 77 |
+
|
| 78 |
# Sidebar
|
| 79 |
with st.sidebar:
|
| 80 |
st.title('Genomic Data Dashboard')
|
|
|
|
| 97 |
|
| 98 |
with col2:
|
| 99 |
st.markdown('### GC Content Over Genome')
|
| 100 |
+
gc_content_over_genome = calculate_gc_content(sequence, window_size) # Ensure this function is defined as per previous instructions
|
| 101 |
gc_chart = alt.Chart(pd.DataFrame({'GC Content': gc_content_over_genome, 'Position': np.arange(len(gc_content_over_genome)) * window_size})).mark_line().encode(
|
| 102 |
x='Position:Q',
|
| 103 |
y='GC Content:Q'
|
|
|
|
| 105 |
st.altair_chart(gc_chart, use_container_width=True)
|
| 106 |
|
| 107 |
st.markdown('### K-mer Analysis')
|
| 108 |
+
kmers = calculate_kmers(sequence, k) # Ensure this function is defined as per previous instructions
|
| 109 |
st.bar_chart(pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20))
|
| 110 |
|
| 111 |
+
# Generate and display genome diagram
|
| 112 |
+
output_file_path = os.path.join(temp_dir, "genome_diagram.png")
|
| 113 |
+
create_genome_diagram(uploaded_file.getvalue().decode("utf-8"), output_file_path)
|
| 114 |
+
st.image(output_file_path, caption='Genome Diagram')
|
| 115 |
|
|
|
|
|
|
|
|
|
|
| 116 |
# Additional Information
|
| 117 |
with st.expander("View All Genes"):
|
| 118 |
st.dataframe(gene_df)
|