File size: 7,183 Bytes
51b8fa9
 
 
2ed9be8
 
 
b39ac01
62bc997
2ed9be8
674c0a1
07777e9
 
 
2ed9be8
 
 
 
 
51b8fa9
6402171
 
674c0a1
a0b2313
51b8fa9
 
07777e9
2ed9be8
51b8fa9
 
 
 
 
 
2ed9be8
 
 
 
 
 
 
 
 
 
07777e9
2ed9be8
07777e9
51b8fa9
af5dcbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07777e9
2ed9be8
a2c2fa5
04fd093
 
 
 
 
 
a2c2fa5
 
 
 
04fd093
 
b39ac01
04fd093
9171b34
a2c2fa5
2ed9be8
07777e9
51b8fa9
07777e9
51b8fa9
07777e9
2ed9be8
51b8fa9
 
2ed9be8
07777e9
 
 
 
 
 
51b8fa9
62bc997
 
a2c2fa5
 
 
 
9171b34
30d50dc
62bc997
 
07777e9
 
 
 
 
 
 
 
 
 
 
 
51b8fa9
07777e9
 
2ed9be8
07777e9
 
 
 
 
51b8fa9
07777e9
2ed9be8
07777e9
51b8fa9
9171b34
a2c2fa5
9171b34
 
 
 
a2c2fa5
 
9171b34
 
a2c2fa5
b39ac01
a2c2fa5
d79da87
62bc997
07777e9
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Import libraries
import streamlit as st
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Graphics import GenomeDiagram
from reportlab.lib.colors import Color, lightblue, blue
from reportlab.lib import colors
from reportlab.lib.units import cm
from io import StringIO
from collections import Counter
import numpy as np
import altair as alt
import os

# Directory, relative to the current working directory, that holds files
# written while rendering the page (the genome diagram SVG is saved here).
temp_dir = "temp"
# exist_ok=True makes this a no-op when the directory already exists, so
# running the script repeatedly does not raise FileExistsError.
os.makedirs(temp_dir, exist_ok=True)

# Function to parse GenBank file
def parse_genbank(uploaded_file):
    """Parse an uploaded single-record GenBank file into summary data.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw file
            content as UTF-8 encoded bytes (for example the value returned
            by ``st.file_uploader``).

    Returns:
        A 7-tuple ``(organism, gene_info, cds_info, gc_content,
        sequence_length, feature_types, sequence)``:

        * ``organism``: the record's ``organism`` annotation, or ``'N/A'``
          when the record does not carry one.
        * ``gene_info``: one dict per ``gene`` feature with the keys
          ``'Gene'``, ``'Length'`` and ``'Location'``.
        * ``cds_info``: one dict per ``CDS`` feature with the keys
          ``'Gene'``, ``'Protein'``, ``'Length'`` and ``'Location'``.
          Missing ``gene``/``translation`` qualifiers become ``'N/A'``.
        * ``gc_content``: percentage of ``G`` and ``C`` characters in the
          whole sequence (``0.0`` for an empty sequence).
        * ``sequence_length``: number of characters in the sequence.
        * ``feature_types``: ``Counter`` mapping feature type to its count.
        * ``sequence``: the record sequence as a plain ``str``.

    Raises:
        UnicodeDecodeError: If the uploaded bytes are not valid UTF-8.
        ValueError: Propagated from ``SeqIO.read`` when the content does not
            hold exactly one GenBank record.
    """
    genbank_text = uploaded_file.getvalue().decode("utf-8")
    record = SeqIO.read(StringIO(genbank_text), "genbank")

    # Not every record carries an organism annotation; fall back to the same
    # placeholder used for missing qualifiers instead of raising KeyError.
    organism = record.annotations.get('organism', 'N/A')

    features = record.features
    feature_types = Counter(feature.type for feature in features)

    gene_features = [feature for feature in features if feature.type == "gene"]
    cds_features = [feature for feature in features if feature.type == "CDS"]

    gene_info = [{'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
                  'Length': len(gene),
                  'Location': str(gene.location)} for gene in gene_features]

    cds_info = [{'Gene': cds_feature.qualifiers.get('gene', ['N/A'])[0],
                 'Protein': cds_feature.qualifiers.get('translation', ['N/A'])[0],
                 'Length': len(cds_feature),
                 'Location': str(cds_feature.location)} for cds_feature in cds_features]

    # Convert the sequence to a string once and reuse it for counting,
    # measuring and returning.
    sequence = str(record.seq)
    sequence_length = len(sequence)
    if sequence_length:
        gc_content = (sequence.count('G') + sequence.count('C')) / sequence_length * 100
    else:
        # An empty sequence has no GC fraction; avoid dividing by zero.
        gc_content = 0.0

    return organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence

# Analysis helpers: calculate_gc_content, calculate_kmers, add_protein_features
# Function to calculate GC content over genome
def calculate_gc_content(sequence, window_size=1000):
    """Return the GC percentage of each consecutive full window of a sequence.

    The sequence is cut into non-overlapping windows of ``window_size``
    characters starting at index 0. A trailing remainder shorter than
    ``window_size`` is not reported, so a sequence shorter than one window
    yields an empty list.

    Args:
        sequence: Nucleotide sequence as a string. Upper- and lower-case
            ``G``/``C`` are both counted.
        window_size: Number of characters per window; must be positive.

    Returns:
        A list of floats, one per full window, each the percentage (0-100)
        of ``G``/``C`` characters in that window.

    Raises:
        ValueError: If ``window_size`` is zero or negative.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    gc_content = []
    for start in range(0, len(sequence) - window_size + 1, window_size):
        # Slice once per window and normalise case so lower-case
        # (soft-masked) bases are counted as well.
        window = sequence[start:start + window_size].upper()
        gc_content.append((window.count('G') + window.count('C')) / window_size * 100)
    return gc_content

# Function to calculate k-mers
def calculate_kmers(sequence, k):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Args:
        sequence: Sequence to scan.
        k: Length of each substring (k-mer).

    Returns:
        A ``Counter`` mapping each k-mer to the number of times it occurs.
        It is empty when ``sequence`` is shorter than ``k``.
    """
    window_starts = range(len(sequence) - k + 1)
    kmer_counts = Counter()
    kmer_counts.update(sequence[start:start + k] for start in window_starts)
    return kmer_counts

# Function to add molecular weight and isoelectric point to CDS information
def add_protein_features(cds_info):
    """Add molecular weight and isoelectric point to each CDS entry.

    Every dict in ``cds_info`` is updated in place with the keys
    ``'Molecular Weight'`` and ``'Isoelectric Point'``. Entries whose
    ``'Protein'`` value is the placeholder ``'N/A'`` get ``'N/A'`` for both.

    Args:
        cds_info: List of dicts, each with a ``'Protein'`` key holding an
            amino-acid sequence string or ``'N/A'``.

    Returns:
        The same ``cds_info`` list that was passed in, after modification.
    """
    for entry in cds_info:
        protein = entry['Protein']
        if protein == 'N/A':
            entry['Molecular Weight'] = 'N/A'
            entry['Isoelectric Point'] = 'N/A'
            continue

        analysis = ProteinAnalysis(protein)
        try:
            entry['Molecular Weight'] = analysis.molecular_weight()
        except ValueError:
            # Biopython raises ValueError when the sequence contains a letter
            # with no defined mass (for example the ambiguity code 'X').
            # Report the weight as unavailable instead of aborting the
            # whole table.
            entry['Molecular Weight'] = 'N/A'
        entry['Isoelectric Point'] = analysis.isoelectric_point()
    return cds_info

# Function to generate genome diagram
def create_genome_diagram(genbank_content, output_file_path, colors_dict):
    """Render the features of a GenBank record as a circular SVG diagram.

    Args:
        genbank_content: Text of a single-record GenBank file.
        output_file_path: Path the SVG output is written to.
        colors_dict: Mapping from feature type (e.g. ``'gene'``) to a colour
            string accepted by ``colors.HexColor``. Features whose type is
            not a key of this mapping are left out of the diagram.

    Returns:
        None. The diagram is written to ``output_file_path``.
    """
    genbank_handle = StringIO(genbank_content)
    record = SeqIO.read(genbank_handle, "genbank")

    # One diagram with a single track that holds one feature set.
    diagram = GenomeDiagram.Diagram(record.id)
    feature_track = diagram.new_track(1, name="Annotated Features")
    feature_set = feature_track.new_set()

    for feature in record.features:
        if feature.type not in colors_dict:
            continue
        fill_color = colors.HexColor(colors_dict[feature.type])
        feature_set.add_feature(
            feature,
            color=fill_color,
            label=True,
            label_size=10,
            label_angle=0,
        )

    page_side = 20*cm
    diagram.draw(
        format="circular",
        circular=True,
        pagesize=(page_side, page_side),
        start=0,
        end=len(record),
        circle_core=0.7,
    )
    diagram.write(output_file_path, "SVG")



# Streamlit UI setup
# Page configuration and the upload widget that drives the rest of the page.
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

if uploaded_file is not None:
    try:
        organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
    except ValueError as parse_error:
        # Covers content that is not valid UTF-8 (UnicodeDecodeError is a
        # ValueError subclass) and content that SeqIO.read rejects. Show a
        # readable message instead of a traceback and end this script run.
        st.error(f"Could not read the upload as a single-record GenBank file: {parse_error}")
        st.stop()

    cds_info = add_protein_features(cds_info)

    # Naming the columns keeps them present when a record has no gene or CDS
    # features, so the column selection further down cannot raise KeyError
    # on an empty table.
    gene_df = pd.DataFrame(gene_info, columns=['Gene', 'Length', 'Location'])
    cds_df = pd.DataFrame(
        cds_info,
        columns=['Gene', 'Protein', 'Length', 'Location', 'Molecular Weight', 'Isoelectric Point'],
    )

    # Sidebar: analysis parameters followed by diagram colour pickers.
    with st.sidebar:
        st.title('Genomic Data Dashboard')
        st.write(f'Organism: {organism}')
        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)

        color_gene = st.color_picker("Pick a color for genes", '#ff9999')
        color_cds = st.color_picker("Pick a color for CDS", '#66b3ff')
        color_trna = st.color_picker("Pick a color for tRNA", '#99ff99')
        color_rrna = st.color_picker("Pick a color for rRNA", '#ffcc99')
        # TODO: optionally let the user choose which feature types are drawn.

    # Main content: summary on the left, charts on the right.
    col1, col2 = st.columns(2)
    with col1:
        st.markdown('### General Information')
        st.write(f'**Organism:** {organism}')
        st.write(f'**Sequence Length:** {sequence_length} bp')
        st.write(f'**GC Content:** {gc_content:.2f}%')
        st.write(f'**Number of Genes:** {len(gene_df)}')
        st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')
        st.markdown('### Feature Counts')
        for feature_type, count in feature_types.items():
            st.write(f"**{feature_type}:** {count}")

    with col2:
        st.markdown('### GC Content Over Genome')
        gc_content_over_genome = calculate_gc_content(sequence, window_size)
        # Each value belongs to the window starting at index * window_size.
        gc_df = pd.DataFrame({
            'GC Content': gc_content_over_genome,
            'Position': np.arange(len(gc_content_over_genome)) * window_size,
        })
        gc_chart = alt.Chart(gc_df).mark_line().encode(
            x='Position:Q',
            y='GC Content:Q'
        ).properties(height=200)
        st.altair_chart(gc_chart, use_container_width=True)

        st.markdown('### K-mer Analysis')
        kmers = calculate_kmers(sequence, k)
        kmer_df = pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency'])
        # Only the 20 most frequent k-mers are plotted.
        top_kmers = kmer_df.sort_values('Frequency', ascending=False).head(20)
        st.bar_chart(top_kmers)

    # Colours for the genome diagram; feature types not listed here are not drawn.
    feature_colors = {
        'gene': color_gene,
        'CDS': color_cds,
        'tRNA': color_trna,
        'rRNA': color_rrna,
    }

    # Generate and display the genome diagram with the user-selected colours.
    # NOTE(review): the output path is the same on every run, so concurrent
    # sessions would write to the same file -- confirm whether that matters
    # for the deployment.
    output_file_path_svg = os.path.join(temp_dir, "genome_diagram.svg")
    create_genome_diagram(uploaded_file.getvalue().decode("utf-8"), output_file_path_svg, feature_colors)
    st.image(output_file_path_svg, caption='Genome Diagram')

    # Full tables, collapsed by default.
    with st.expander("View All Genes"):
        st.dataframe(gene_df)
    with st.expander("View All Coding Sequences"):
        st.dataframe(cds_df[['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']])
else:
    st.warning("Please upload a GenBank file.")