yashm committed on
Commit
07777e9
·
verified ·
1 Parent(s): 6402171

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -48
app.py CHANGED
@@ -1,84 +1,113 @@
1
  # Import libraries
2
  import streamlit as st
3
  import pandas as pd
4
- from Bio import SeqIO
5
- import altair as alt
6
- import numpy as np
7
  from io import StringIO
 
 
 
8
 
9
- # Function to parse GenBank file
10
def parse_genbank(uploaded_file):
    """Parse an uploaded single-record GenBank file into summary data.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw file
            bytes (here, the value returned by ``st.file_uploader``).

    Returns:
        A 5-tuple ``(organism, gene_info, cds_info, gc_content,
        sequence_length)``:

        * ``organism`` - the record's ``organism`` annotation, or ``'N/A'``
          when the record does not carry one.
        * ``gene_info`` - one dict per ``gene`` feature with the keys
          ``'Gene'``, ``'Length'`` and ``'Location'``.
        * ``cds_info`` - one dict per ``CDS`` feature with the keys
          ``'Gene'``, ``'Protein'``, ``'Length'`` and ``'Location'``.
        * ``gc_content`` - percentage of G and C characters in the
          sequence (``0.0`` for an empty sequence).
        * ``sequence_length`` - number of characters in the sequence.

    Raises:
        UnicodeDecodeError: If the uploaded bytes are not valid UTF-8.
        ValueError: Propagated from ``SeqIO.read``, which expects the
            handle to contain exactly one record.
    """
    # SeqIO needs a text handle, while the upload is delivered as bytes.
    handle = StringIO(uploaded_file.getvalue().decode("utf-8"))
    record = SeqIO.read(handle, "genbank")

    # Fall back to the same placeholder the qualifiers use instead of
    # raising KeyError when the annotation is absent.
    organism = record.annotations.get('organism', 'N/A')

    gene_features = [f for f in record.features if f.type == "gene"]
    cds_features = [f for f in record.features if f.type == "CDS"]

    gene_info = [{
        'Gene': feature.qualifiers.get('gene', ['N/A'])[0],
        'Length': len(feature),
        'Location': str(feature.location)} for feature in gene_features]

    cds_info = [{
        'Gene': feature.qualifiers.get('gene', ['N/A'])[0],
        'Protein': feature.qualifiers.get('translation', ['N/A'])[0],
        'Length': len(feature),
        'Location': str(feature.location)} for feature in cds_features]

    sequence_length = len(record.seq)
    # Guard the division so an empty sequence reports 0 % rather than
    # raising ZeroDivisionError.
    if sequence_length:
        gc_count = record.seq.count('G') + record.seq.count('C')
        gc_content = gc_count / sequence_length * 100
    else:
        gc_content = 0.0

    return organism, gene_info, cds_info, gc_content, sequence_length
 
 
 
34
 
35
# Page setup
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")

# Upload GenBank file; nothing below can render without one, so stop early.
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
if uploaded_file is None:
    st.warning("Please upload a GenBank file.")
    st.stop()

organism, gene_info, cds_info, gc_content, sequence_length = parse_genbank(uploaded_file)
gene_df = pd.DataFrame(gene_info)
cds_df = pd.DataFrame(cds_info)

# Sidebar information
with st.sidebar:
    st.title('Genomic Data Dashboard')
    st.write(f'Organism: {organism}')
    # You can add more interactive widgets here as needed

# Main content
col1, col2 = st.columns(2)

with col1:
    st.markdown('### General Information')
    st.write(f'**Organism:** {organism}')
    st.write(f'**Sequence Length:** {sequence_length} bp')
    st.write(f'**GC Content:** {gc_content:.2f}%')
    st.write(f'**Number of Genes:** {len(gene_df)}')
    st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')

with col2:
    st.markdown('### Genes and Proteins')
    # A record without gene features produces a DataFrame with no columns,
    # so gene_df['Gene'] would raise KeyError; fall back to no options.
    gene_options = gene_df['Gene'] if 'Gene' in gene_df.columns else []
    gene_selected = st.selectbox('Select a gene to view details:', options=gene_options)
    if gene_selected:
        selected_gene = gene_df[gene_df['Gene'] == gene_selected]
        if not selected_gene.empty:
            st.write(f"**Gene Details:** {selected_gene.to_dict('records')[0]}")
        # Same empty-frame hazard for records that have genes but no CDS.
        if 'Gene' in cds_df.columns:
            selected_cds = cds_df[cds_df['Gene'] == gene_selected]
            if not selected_cds.empty:
                st.write(f"**CDS Details:** {selected_cds.to_dict('records')[0]}")

# Display data tables (optional)
with st.expander("View All Genes"):
    st.dataframe(gene_df)
with st.expander("View All Coding Sequences"):
    st.dataframe(cds_df)

# You can extend the app with more functionalities like visualizations,
# k-mer analysis, or other genomic metrics based on your requirements.
84
 
 
 
 
 
 
 
 
 
1
  # Import libraries
2
  import streamlit as st
3
  import pandas as pd
4
+ from Bio import SeqIO, SeqUtils
 
 
5
  from io import StringIO
6
+ from collections import Counter
7
+ import numpy as np
8
+ import altair as alt
9
 
 
10
def parse_genbank(uploaded_file):
    """Parse an uploaded single-record GenBank file into summary data.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw file
            bytes (here, the value returned by ``st.file_uploader``).

    Returns:
        A 7-tuple ``(organism, gene_info, cds_info, gc_content,
        sequence_length, feature_types, sequence)``:

        * ``organism`` - the record's ``organism`` annotation, or ``'N/A'``
          when the record does not carry one.
        * ``gene_info`` - one dict per ``gene`` feature with the keys
          ``'Gene'``, ``'Length'`` and ``'Location'``.
        * ``cds_info`` - one dict per ``CDS`` feature with the keys
          ``'Gene'``, ``'Protein'``, ``'Length'`` and ``'Location'``.
        * ``gc_content`` - percentage of G and C characters in the
          sequence (``0.0`` for an empty sequence).
        * ``sequence_length`` - number of characters in the sequence.
        * ``feature_types`` - ``Counter`` mapping feature type to count.
        * ``sequence`` - the record's sequence as a plain ``str``.

    Raises:
        UnicodeDecodeError: If the uploaded bytes are not valid UTF-8.
        ValueError: Propagated from ``SeqIO.read``, which expects the
            handle to contain exactly one record.
    """
    # SeqIO needs a text handle, while the upload is delivered as bytes.
    handle = StringIO(uploaded_file.getvalue().decode("utf-8"))
    record = SeqIO.read(handle, "genbank")

    # Fall back to the same placeholder the qualifiers use instead of
    # raising KeyError when the annotation is absent.
    organism = record.annotations.get('organism', 'N/A')

    features = record.features
    feature_types = Counter(feature.type for feature in features)

    gene_features = [f for f in features if f.type == "gene"]
    cds_features = [f for f in features if f.type == "CDS"]

    gene_info = [{
        'Gene': feature.qualifiers.get('gene', ['N/A'])[0],
        'Length': len(feature),
        'Location': str(feature.location)} for feature in gene_features]

    cds_info = [{
        'Gene': feature.qualifiers.get('gene', ['N/A'])[0],
        'Protein': feature.qualifiers.get('translation', ['N/A'])[0],
        'Length': len(feature),
        'Location': str(feature.location)} for feature in cds_features]

    # Convert the sequence once and reuse it for counting, length and return.
    sequence = str(record.seq)
    sequence_length = len(sequence)
    # Guard the division so an empty sequence reports 0 % rather than
    # raising ZeroDivisionError.
    if sequence_length:
        gc_count = sequence.count('G') + sequence.count('C')
        gc_content = gc_count / sequence_length * 100
    else:
        gc_content = 0.0

    return organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence
39
 
40
def calculate_gc_content(sequence, window_size=1000):
    """Compute GC percentage over consecutive, non-overlapping windows.

    Only complete windows are reported: a trailing remainder shorter than
    ``window_size`` is ignored, so a sequence shorter than one window
    yields an empty list.

    Args:
        sequence: Nucleotide sequence as a string; G/C are counted
            case-insensitively.
        window_size: Number of characters per window; must be positive.

    Returns:
        A list of floats, one GC percentage (0-100) per complete window,
        in sequence order.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    gc_content = []
    for start in range(0, len(sequence) - window_size + 1, window_size):
        # Slice once per window and normalise case so soft-masked
        # (lower-case) bases are counted too.
        window = sequence[start:start + window_size].upper()
        gc_count = window.count('G') + window.count('C')
        gc_content.append(gc_count / window_size * 100)
    return gc_content
47
+
48
def calculate_kmers(sequence, k):
    """Count every overlapping k-mer in ``sequence``.

    Args:
        sequence: Sequence string to scan.
        k: Length of each k-mer; must be at least 1.

    Returns:
        A ``Counter`` mapping each length-``k`` substring to the number of
        times it occurs. Empty when ``sequence`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would count empty strings and negative k would count
        # ragged slices; neither is a meaningful k-mer table.
        raise ValueError("k must be a positive integer")
    # Feed the Counter lazily instead of building an intermediate list.
    return Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))
52
 
53
def _protein_properties(protein):
    """Return ``(molecular_weight, isoelectric_point)`` for a protein string."""
    # Import the submodule explicitly: `from Bio import SeqUtils` on its own
    # is not guaranteed to make `SeqUtils.IsoelectricPoint` available.
    from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint

    try:
        weight = SeqUtils.molecular_weight(protein, seq_type='protein')
    except ValueError:
        # molecular_weight rejects residues it has no weight for (e.g. an
        # ambiguous 'X'); report a placeholder rather than aborting the run.
        weight = 'N/A'
    return weight, IsoelectricPoint(protein).pi()


def add_protein_features(cds_info):
    """Add molecular weight and isoelectric point to each CDS record.

    Args:
        cds_info: List of dicts, each with a ``'Protein'`` key holding the
            amino-acid translation or the placeholder ``'N/A'``.

    Returns:
        The same list object; every dict is updated in place with the keys
        ``'Molecular Weight'`` and ``'Isoelectric Point'``. Both are
        ``'N/A'`` when no translation is available, and the weight alone
        is ``'N/A'`` when it cannot be computed for the translation.
    """
    for record in cds_info:
        if record['Protein'] != 'N/A':
            weight, isoelectric_point = _protein_properties(record['Protein'])
        else:
            weight = isoelectric_point = 'N/A'
        record['Molecular Weight'] = weight
        record['Isoelectric Point'] = isoelectric_point
    return cds_info
63
+
64
# Streamlit UI: page setup and file upload, then the dashboard itself.
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

if uploaded_file is not None:
    try:
        (organism, gene_info, cds_info, gc_content, sequence_length,
         feature_types, sequence) = parse_genbank(uploaded_file)
    except ValueError as exc:
        # Covers undecodable bytes (UnicodeDecodeError is a ValueError) and
        # files SeqIO cannot read as a single record; show a message
        # instead of a raw traceback.
        st.error(f"Could not parse the uploaded GenBank file: {exc}")
        st.stop()

    cds_info = add_protein_features(cds_info)
    gene_df = pd.DataFrame(gene_info)
    cds_df = pd.DataFrame(cds_info)

    # Sidebar
    with st.sidebar:
        st.title('Genomic Data Dashboard')
        st.write(f'Organism: {organism}')
        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)

    # Main content
    col1, col2 = st.columns(2)
    with col1:
        st.markdown('### General Information')
        st.write(f'**Organism:** {organism}')
        st.write(f'**Sequence Length:** {sequence_length} bp')
        st.write(f'**GC Content:** {gc_content:.2f}%')
        st.write(f'**Number of Genes:** {len(gene_df)}')
        st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')
        st.markdown('### Feature Counts')
        for feature_type, count in feature_types.items():
            st.write(f"**{feature_type}:** {count}")

    with col2:
        st.markdown('### GC Content Over Genome')
        gc_content_over_genome = calculate_gc_content(sequence, window_size)
        gc_frame = pd.DataFrame({
            'GC Content': gc_content_over_genome,
            # Each value is plotted at the start coordinate of its window.
            'Position': np.arange(len(gc_content_over_genome)) * window_size,
        })
        gc_chart = alt.Chart(gc_frame).mark_line().encode(
            x='Position:Q',
            y='GC Content:Q'
        ).properties(height=200)
        st.altair_chart(gc_chart, use_container_width=True)

        st.markdown('### K-mer Analysis')
        kmers = calculate_kmers(sequence, k)
        # most_common picks the 20 largest counts without sorting the whole
        # k-mer table.
        top_kmers = pd.DataFrame(kmers.most_common(20), columns=['K-mer', 'Frequency']).set_index('K-mer')
        st.bar_chart(top_kmers)

    # Additional Information
    with st.expander("View All Genes"):
        st.dataframe(gene_df)
    with st.expander("View All Coding Sequences"):
        # reindex (unlike [[...]]) tolerates a record with no CDS features,
        # where cds_df has no columns at all.
        st.dataframe(cds_df.reindex(columns=['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']))
else:
    st.warning("Please upload a GenBank file.")