Spaces:

yashm
/

OpenGene

Sleeping

App Files Files Community

yashm committed on Mar 9, 2024

Commit

07777e9

verified ·

1 Parent(s): 6402171

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -48

app.py CHANGED Viewed

@@ -1,84 +1,113 @@
 # Import libraries
 import streamlit as st
 import pandas as pd
-from Bio import SeqIO
-import altair as alt
-import numpy as np
 from io import StringIO
-# Function to parse GenBank file
 # Function to parse GenBank file
 def parse_genbank(uploaded_file):
-    # Convert binary to text for SeqIO
     stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
     record = SeqIO.read(stringio, "genbank")
     organism = record.annotations['organism']
     features = record.features
     genes, cds = [], []
     for feature in features:
         if feature.type == "gene":
             genes.append(feature)
         elif feature.type == "CDS":
             cds.append(feature)
-    gc_content = (record.seq.count('G') + record.seq.count('C')) / len(record.seq) * 100
     gene_info = [{
         'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
         'Length': len(gene),
         'Location': str(gene.location)} for gene in genes]
     cds_info = [{
         'Gene': cds.qualifiers.get('gene', ['N/A'])[0],
         'Protein': cds.qualifiers.get('translation', ['N/A'])[0],
         'Length': len(cds),
         'Location': str(cds.location)} for cds in cds]
-    return organism, gene_info, cds_info, gc_content, len(record.seq)
-# Page setup
-st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
-# Upload GenBank file
 uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
 if uploaded_file is not None:
-    organism, gene_info, cds_info, gc_content, sequence_length = parse_genbank(uploaded_file)
     gene_df = pd.DataFrame(gene_info)
     cds_df = pd.DataFrame(cds_info)
-else:
-    st.warning("Please upload a GenBank file.")
-    st.stop()
-# Sidebar information
-with st.sidebar:
-    st.title('Genomic Data Dashboard')
-    st.write(f'Organism: {organism}')
-    # You can add more interactive widgets here as needed
-# Main content
-col1, col2 = st.columns(2)
-with col1:
-    st.markdown('### General Information')
-    st.write(f'**Organism:** {organism}')
-    st.write(f'**Sequence Length:** {sequence_length} bp')
-    st.write(f'**GC Content:** {gc_content:.2f}%')
-    st.write(f'**Number of Genes:** {len(gene_df)}')
-    st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')
-with col2:
-    st.markdown('### Genes and Proteins')
-    gene_selected = st.selectbox('Select a gene to view details:', options=gene_df['Gene'])
-    if gene_selected:
-        selected_gene = gene_df[gene_df['Gene'] == gene_selected]
-        if not selected_gene.empty:
-            st.write(f"**Gene Details:** {selected_gene.to_dict('records')[0]}")
-        selected_cds = cds_df[cds_df['Gene'] == gene_selected]
-        if not selected_cds.empty:
-            st.write(f"**CDS Details:** {selected_cds.to_dict('records')[0]}")
-# Display data tables (optional)
-with st.expander("View All Genes"):
-    st.dataframe(gene_df)
-with st.expander("View All Coding Sequences"):
-    st.dataframe(cds_df)
-# You can extend the app with more functionalities like visualizations,
-# k-mer analysis, or other genomic metrics based on your requirements.

 # Import libraries
 import streamlit as st
 import pandas as pd
+from Bio import SeqIO, SeqUtils
 from io import StringIO
+from collections import Counter
+import numpy as np
+import altair as alt
 # Function to parse GenBank file
 def parse_genbank(uploaded_file):
     stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
     record = SeqIO.read(stringio, "genbank")
     organism = record.annotations['organism']
     features = record.features
+    feature_types = Counter([feature.type for feature in features])
     genes, cds = [], []
     for feature in features:
         if feature.type == "gene":
             genes.append(feature)
         elif feature.type == "CDS":
             cds.append(feature)
     gene_info = [{
         'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
         'Length': len(gene),
         'Location': str(gene.location)} for gene in genes]
     cds_info = [{
         'Gene': cds.qualifiers.get('gene', ['N/A'])[0],
         'Protein': cds.qualifiers.get('translation', ['N/A'])[0],
         'Length': len(cds),
         'Location': str(cds.location)} for cds in cds]
+    gc_content = (str(record.seq).count('G') + str(record.seq).count('C')) / len(record.seq) * 100
+    return organism, gene_info, cds_info, gc_content, len(record.seq), feature_types, str(record.seq)
+# Function to calculate GC content over genome
def calculate_gc_content(sequence, window_size=1000):
    """Return the GC percentage of consecutive, non-overlapping windows.

    The sequence is cut into windows of exactly ``window_size`` characters
    starting at offset 0. A trailing remainder shorter than ``window_size``
    is dropped, so a sequence shorter than one window yields ``[]``.
    Only the uppercase characters ``'G'`` and ``'C'`` are counted.

    Args:
        sequence: The nucleotide sequence as a string.
        window_size: Number of characters per window; must be at least 1.

    Returns:
        A list of floats, one GC percentage (0-100) per full window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    gc_content = []
    for start in range(0, len(sequence) - window_size + 1, window_size):
        # Slice once per window and reuse it for both counts.
        window = sequence[start:start + window_size]
        gc_content.append((window.count('G') + window.count('C')) / window_size * 100)
    return gc_content
+# Function to calculate k-mers
def calculate_kmers(sequence, k):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Args:
        sequence: The sequence to scan, as a string.
        k: Length of each k-mer; must be at least 1.

    Returns:
        A ``Counter`` mapping each k-mer to its number of occurrences.
        The counter is empty when ``k`` exceeds ``len(sequence)``.

    Raises:
        ValueError: If ``k`` is smaller than 1. Without this check ``k=0``
            would count empty strings and a negative ``k`` would count
            arbitrary slices.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Feed the Counter from a generator so no intermediate list is built.
    return Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))
+# Function to add molecular weight and isoelectric point to CDS information
def _protein_properties(protein):
    """Return ``(molecular_weight, isoelectric_point)`` for a protein string."""
    # Imported explicitly: the file only does `from Bio import SeqUtils`,
    # which does not by itself guarantee that the `IsoelectricPoint`
    # submodule is available as an attribute of the package.
    from Bio.SeqUtils import molecular_weight
    from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint

    try:
        weight = molecular_weight(protein, seq_type='protein')
    except ValueError:
        # Translations containing ambiguous residues (e.g. 'X') have no
        # defined weight; report a placeholder instead of aborting the
        # whole table.
        weight = 'N/A'
    return weight, IsoelectricPoint(protein).pi()


def add_protein_features(cds_info):
    """Add molecular weight and isoelectric point to each CDS entry.

    Each dict in ``cds_info`` is updated in place with the keys
    ``'Molecular Weight'`` and ``'Isoelectric Point'``. Entries whose
    ``'Protein'`` value is ``'N/A'`` receive ``'N/A'`` for both keys.

    Args:
        cds_info: List of dicts, each containing a ``'Protein'`` key.

    Returns:
        The same ``cds_info`` list that was passed in.
    """
    for entry in cds_info:
        if entry['Protein'] != 'N/A':
            weight, isoelectric_point = _protein_properties(entry['Protein'])
        else:
            weight, isoelectric_point = 'N/A', 'N/A'
        entry['Molecular Weight'] = weight
        entry['Isoelectric Point'] = isoelectric_point
    return cds_info
# Streamlit UI
# Page configuration is issued before any other Streamlit call in the script.
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
if uploaded_file is not None:
    organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
    cds_info = add_protein_features(cds_info)
    gene_df = pd.DataFrame(gene_info)
    cds_df = pd.DataFrame(cds_info)

    # Sidebar: title, organism and the two analysis parameters.
    with st.sidebar:
        st.title('Genomic Data Dashboard')
        st.write(f'Organism: {organism}')
        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)

    # Main content: summary on the left, charts on the right.
    col1, col2 = st.columns(2)
    with col1:
        st.markdown('### General Information')
        st.write(f'**Organism:** {organism}')
        st.write(f'**Sequence Length:** {sequence_length} bp')
        st.write(f'**GC Content:** {gc_content:.2f}%')
        st.write(f'**Number of Genes:** {len(gene_df)}')
        st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')
        st.markdown('### Feature Counts')
        for feature_type, count in feature_types.items():
            st.write(f"**{feature_type}:** {count}")
    with col2:
        st.markdown('### GC Content Over Genome')
        gc_content_over_genome = calculate_gc_content(sequence, window_size)
        # Each window's x position is its start offset in the sequence.
        gc_frame = pd.DataFrame({
            'GC Content': gc_content_over_genome,
            'Position': np.arange(len(gc_content_over_genome)) * window_size,
        })
        gc_chart = alt.Chart(gc_frame).mark_line().encode(
            x='Position:Q',
            y='GC Content:Q'
        ).properties(height=200)
        st.altair_chart(gc_chart, use_container_width=True)
        st.markdown('### K-mer Analysis')
        kmers = calculate_kmers(sequence, k)
        # Show only the 20 most frequent k-mers.
        st.bar_chart(pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20))

    # Additional Information
    with st.expander("View All Genes"):
        st.dataframe(gene_df)
    with st.expander("View All Coding Sequences"):
        # A record without CDS features produces a DataFrame with no columns,
        # so plain column selection would raise KeyError. reindex yields an
        # empty table with the expected headers in that case and is
        # equivalent to the selection otherwise.
        cds_columns = ['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']
        st.dataframe(cds_df.reindex(columns=cds_columns))
else:
    st.warning("Please upload a GenBank file.")