yashm commited on
Commit
2ed9be8
·
verified ·
1 Parent(s): 04fd093

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -53
app.py CHANGED
@@ -1,12 +1,20 @@
1
  # Import libraries
2
  import streamlit as st
3
  import pandas as pd
4
- from Bio import SeqIO, SeqUtils
 
 
 
 
5
  from io import StringIO
6
  from collections import Counter
7
  import numpy as np
8
  import altair as alt
9
- from Bio.SeqUtils.ProtParam import ProteinAnalysis
 
 
 
 
10
 
11
  # Function to parse GenBank file
12
  def parse_genbank(uploaded_file):
@@ -15,55 +23,30 @@ def parse_genbank(uploaded_file):
15
  organism = record.annotations['organism']
16
  features = record.features
17
  feature_types = Counter([feature.type for feature in features])
18
-
19
  genes, cds = [], []
20
  for feature in features:
21
  if feature.type == "gene":
22
  genes.append(feature)
23
  elif feature.type == "CDS":
24
  cds.append(feature)
25
-
26
- gene_info = [{
27
- 'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
28
- 'Length': len(gene),
29
- 'Location': str(gene.location)} for gene in genes]
30
-
31
- cds_info = [{
32
- 'Gene': cds.qualifiers.get('gene', ['N/A'])[0],
33
- 'Protein': cds.qualifiers.get('translation', ['N/A'])[0],
34
- 'Length': len(cds),
35
- 'Location': str(cds.location)} for cds in cds]
36
-
37
  gc_content = (str(record.seq).count('G') + str(record.seq).count('C')) / len(record.seq) * 100
38
-
39
  return organism, gene_info, cds_info, gc_content, len(record.seq), feature_types, str(record.seq)
40
 
41
- # Function to calculate GC content over genome
42
- def calculate_gc_content(sequence, window_size=1000):
43
- gc_content = [
44
- (sequence[i:i+window_size].count('G') + sequence[i:i+window_size].count('C')) / window_size * 100
45
- for i in range(0, len(sequence) - window_size + 1, window_size)
46
- ]
47
- return gc_content
48
-
49
- # Function to calculate k-mers
50
- def calculate_kmers(sequence, k):
51
- kmers = Counter([sequence[i:i+k] for i in range(len(sequence) - k + 1)])
52
- return kmers
53
-
54
- # Function to add molecular weight and isoelectric point to CDS information
55
- def add_protein_features(cds_info):
56
- for cds in cds_info:
57
- if cds['Protein'] != 'N/A':
58
- prot_analysis = ProteinAnalysis(cds['Protein'])
59
- cds['Molecular Weight'] = prot_analysis.molecular_weight()
60
- cds['Isoelectric Point'] = prot_analysis.isoelectric_point()
61
- else:
62
- cds['Molecular Weight'] = 'N/A'
63
- cds['Isoelectric Point'] = 'N/A'
64
- return cds_info
65
 
66
- # Function to add genome_diagram
67
  def create_genome_diagram(genbank_content, output_file_path):
68
  record = SeqIO.read(StringIO(genbank_content), "genbank")
69
  gd_diagram = GenomeDiagram.Diagram(record.id)
@@ -82,17 +65,16 @@ def create_genome_diagram(genbank_content, output_file_path):
82
  gd_diagram.draw(format="circular", circular=True, pagesize=(20*cm, 20*cm), start=0, end=len(record), circle_core=0.7)
83
  gd_diagram.write(output_file_path, "PNG")
84
 
85
-
86
- # Streamlit UI
87
  st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
88
  uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
89
 
90
  if uploaded_file is not None:
91
  organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
92
- cds_info = add_protein_features(cds_info)
93
  gene_df = pd.DataFrame(gene_info)
94
  cds_df = pd.DataFrame(cds_info)
95
-
96
  # Sidebar
97
  with st.sidebar:
98
  st.title('Genomic Data Dashboard')
@@ -115,7 +97,7 @@ if uploaded_file is not None:
115
 
116
  with col2:
117
  st.markdown('### GC Content Over Genome')
118
- gc_content_over_genome = calculate_gc_content(sequence, window_size)
119
  gc_chart = alt.Chart(pd.DataFrame({'GC Content': gc_content_over_genome, 'Position': np.arange(len(gc_content_over_genome)) * window_size})).mark_line().encode(
120
  x='Position:Q',
121
  y='GC Content:Q'
@@ -123,16 +105,14 @@ if uploaded_file is not None:
123
  st.altair_chart(gc_chart, use_container_width=True)
124
 
125
  st.markdown('### K-mer Analysis')
126
- kmers = calculate_kmers(sequence, k)
127
  st.bar_chart(pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20))
128
 
129
- # Generate genome diagram
130
- output_file_path = os.path.join(st.session_state.get("temp_dir", "."), "genome_diagram.png")
131
- create_genome_diagram(uploaded_file.getvalue().decode("utf-8"), output_file_path)
 
132
 
133
- # Display genome diagram
134
- st.image(output_file_path, caption='Genome Diagram')
135
-
136
  # Additional Information
137
  with st.expander("View All Genes"):
138
  st.dataframe(gene_df)
 
1
  # Import libraries
2
  import streamlit as st
3
  import pandas as pd
4
+ from Bio import SeqIO
5
+ from Bio.SeqUtils.ProtParam import ProteinAnalysis
6
+ from Bio.Graphics import GenomeDiagram
7
+ from reportlab.lib import colors
8
+ from reportlab.lib.units import cm
9
  from io import StringIO
10
  from collections import Counter
11
  import numpy as np
12
  import altair as alt
13
+ import os
14
+
15
+ # Ensure the 'temp' directory exists for saving temporary files
16
+ temp_dir = "temp"
17
+ os.makedirs(temp_dir, exist_ok=True)
18
 
19
  # Function to parse GenBank file
20
  def parse_genbank(uploaded_file):
 
23
  organism = record.annotations['organism']
24
  features = record.features
25
  feature_types = Counter([feature.type for feature in features])
26
+
27
  genes, cds = [], []
28
  for feature in features:
29
  if feature.type == "gene":
30
  genes.append(feature)
31
  elif feature.type == "CDS":
32
  cds.append(feature)
33
+
34
+ gene_info = [{'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
35
+ 'Length': len(gene),
36
+ 'Location': str(gene.location)} for gene in genes]
37
+
38
+ cds_info = [{'Gene': cds.qualifiers.get('gene', ['N/A'])[0],
39
+ 'Protein': cds.qualifiers.get('translation', ['N/A'])[0],
40
+ 'Length': len(cds),
41
+ 'Location': str(cds.location)} for cds in cds]
42
+
 
 
43
  gc_content = (str(record.seq).count('G') + str(record.seq).count('C')) / len(record.seq) * 100
44
+
45
  return organism, gene_info, cds_info, gc_content, len(record.seq), feature_types, str(record.seq)
46
 
47
+ # Additional functions (calculate_gc_content, calculate_kmers, add_protein_features) as before
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ # Function to generate genome diagram
50
  def create_genome_diagram(genbank_content, output_file_path):
51
  record = SeqIO.read(StringIO(genbank_content), "genbank")
52
  gd_diagram = GenomeDiagram.Diagram(record.id)
 
65
  gd_diagram.draw(format="circular", circular=True, pagesize=(20*cm, 20*cm), start=0, end=len(record), circle_core=0.7)
66
  gd_diagram.write(output_file_path, "PNG")
67
 
68
+ # Streamlit UI setup
 
69
  st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
70
  uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])
71
 
72
  if uploaded_file is not None:
73
  organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
74
+ cds_info = add_protein_features(cds_info) # Ensure this function is defined as per previous instructions
75
  gene_df = pd.DataFrame(gene_info)
76
  cds_df = pd.DataFrame(cds_info)
77
+
78
  # Sidebar
79
  with st.sidebar:
80
  st.title('Genomic Data Dashboard')
 
97
 
98
  with col2:
99
  st.markdown('### GC Content Over Genome')
100
+ gc_content_over_genome = calculate_gc_content(sequence, window_size) # Ensure this function is defined as per previous instructions
101
  gc_chart = alt.Chart(pd.DataFrame({'GC Content': gc_content_over_genome, 'Position': np.arange(len(gc_content_over_genome)) * window_size})).mark_line().encode(
102
  x='Position:Q',
103
  y='GC Content:Q'
 
105
  st.altair_chart(gc_chart, use_container_width=True)
106
 
107
  st.markdown('### K-mer Analysis')
108
+ kmers = calculate_kmers(sequence, k) # Ensure this function is defined as per previous instructions
109
  st.bar_chart(pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20))
110
 
111
+ # Generate and display genome diagram
112
+ output_file_path = os.path.join(temp_dir, "genome_diagram.png")
113
+ create_genome_diagram(uploaded_file.getvalue().decode("utf-8"), output_file_path)
114
+ st.image(output_file_path, caption='Genome Diagram')
115
 
 
 
 
116
  # Additional Information
117
  with st.expander("View All Genes"):
118
  st.dataframe(gene_df)