lyimo committed on
Commit
129decb
·
verified ·
1 Parent(s): af7dc6d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -0
app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ from Bio import pairwise2
4
+ import re
5
+ from collections import defaultdict
6
+ import pandas as pd
7
+ import plotly.express as px
8
+ import io
9
+
10
def read_fasta_from_upload(uploaded_file):
    """
    Read the nucleotide sequence from a FASTA file uploaded through Streamlit.

    Header lines (those starting with '>') are skipped and every remaining
    line is joined into a single upper-case string with all whitespace
    removed, so Windows (CRLF) line endings, tabs and blank lines never leak
    into the sequence. If the file holds several records, their sequences
    are concatenated in file order.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the raw
            file content as bytes.

    Returns:
        The sequence as an upper-case string, or None when the content
        cannot be decoded or holds no sequence data. In both failure cases
        an error message is shown with ``st.error``.
    """
    try:
        # 'utf-8-sig' also accepts plain UTF-8 and drops a leading BOM, which
        # would otherwise hide the '>' of the first header line.
        content = uploaded_file.getvalue().decode('utf-8-sig')
    except (AttributeError, UnicodeDecodeError) as e:
        st.error(f"Error reading uploaded file: {str(e)}")
        return None

    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces removes spaces, tabs and stray carriage returns in one pass.
    sequence = ''.join(
        ''.join(line.split())
        for line in content.splitlines()
        if not line.lstrip().startswith('>')
    )

    if not sequence:
        st.error("Error reading uploaded file: no sequence data found")
        return None
    return sequence.upper()
22
+
23
def extract_gene_region(genome_seq, gene_start, gene_end, flank=200):
    """
    Extract a gene together with flanking sequence on both sides.

    ``gene_start`` and ``gene_end`` are treated as 1-based, inclusive genome
    coordinates (the convention the mutation scan in this file uses when it
    reports positions). The window is clipped to the bounds of the genome.

    Args:
        genome_seq: Full genome sequence as a string.
        gene_start: 1-based position of the first base of the gene.
        gene_end: 1-based position of the last base of the gene.
        flank: Number of extra bases to include before and after the gene.
            Defaults to 200.

    Returns:
        A ``(region, start)`` tuple where ``region`` is the extracted
        sequence and ``start`` is the 0-based index in ``genome_seq`` of the
        region's first base (so the 1-based genome position of the n-th base
        of ``region`` is ``start + n``). Returns ``(None, None)`` and shows
        an error with ``st.error`` when the arguments have unusable types.
    """
    try:
        # The 1-based gene_start sits at index gene_start - 1; stepping back
        # `flank` more bases gives the same amount of context on both sides.
        start = max(0, gene_start - 1 - flank)
        # A slice end is exclusive, so gene_end + flank already covers the
        # 1-based positions up to and including gene_end + flank.
        end = min(len(genome_seq), gene_end + flank)
        return genome_seq[start:end], start
    except TypeError as e:
        st.error(f"Error extracting gene region: {str(e)}")
        return None, None
35
+
36
def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
    """
    Align two sequences globally and list the differences inside a gene.

    The sequences are aligned with ``pairwise2.align.globalms`` (match 2,
    mismatch -3, gap open -10, gap extend -0.5) and the first returned
    alignment is scanned column by column.

    Args:
        ref_seq: Reference sequence (gene plus any flanking context).
        query_seq: Query sequence covering the same region.
        gene_start: 1-based genome position of the first base of the gene.
        gene_end: 1-based genome position of the last base of the gene.
        offset: 0-based genome index of the first base of ``ref_seq``; added
            to positions within ``ref_seq`` to obtain genome positions.

    Returns:
        A list of dicts, one per differing alignment column whose genome
        position lies within ``gene_start..gene_end``. Keys: 'position',
        'gene_position', 'ref_base', 'query_base', 'type' ('SNP' or
        'INDEL'), 'codon_position' and 'context' (a dict with 'ref' and
        'query' strings showing up to five aligned characters either side
        of the bracketed column). Returns an empty list when no alignment is
        produced or when the analysis fails; the problem is reported through
        ``st.warning`` / ``st.error``.
    """
    try:
        alignments = pairwise2.align.globalms(ref_seq, query_seq,
                                              match=2,
                                              mismatch=-3,
                                              open=-10,
                                              extend=-0.5)

        if not alignments:
            st.warning("No alignments found")
            return []

        best = alignments[0]
        return _scan_alignment_for_mutations(best[0], best[1],
                                             gene_start, gene_end, offset)
    except Exception as e:
        # Deliberately broad: this is the boundary to the UI, and any failure
        # in the alignment library should surface as a message, not a crash.
        st.error(f"Error in mutation analysis: {str(e)}")
        return []


def _scan_alignment_for_mutations(ref_aligned, query_aligned, gene_start, gene_end, offset):
    """Collect the differing columns of two gapped, aligned strings that fall inside the gene."""
    mutations = []
    # Count of reference bases consumed so far, i.e. the 1-based position of
    # the current column within the ungapped reference. Gap columns in the
    # reference (insertions in the query) keep the previous base's position.
    ref_pos = 0

    for i, (ref_base, query_base) in enumerate(zip(ref_aligned, query_aligned)):
        if ref_base != '-':
            ref_pos += 1

        if ref_base == query_base:
            continue

        adj_pos = offset + ref_pos
        if not gene_start <= adj_pos <= gene_end:
            continue

        gene_position = adj_pos - gene_start + 1
        mutations.append({
            'position': adj_pos,
            'gene_position': gene_position,
            'ref_base': ref_base,
            'query_base': query_base if query_base != '-' else 'None',
            'type': 'SNP' if ref_base != '-' and query_base != '-' else 'INDEL',
            # The reading frame is counted from the first base of the gene,
            # not from the start of the flanked region that was aligned.
            # NOTE(review): this assumes the gene is read in the direction of
            # increasing coordinates; no strand information is available
            # here - confirm for reverse-strand genes.
            'codon_position': (gene_position - 1) % 3 + 1,
            'context': {
                'ref': ref_aligned[max(0, i - 5):i] + '[' + ref_base + ']' + ref_aligned[i + 1:i + 6],
                'query': query_aligned[max(0, i - 5):i] + '[' + query_base + ']' + query_aligned[i + 1:i + 6]
            }
        })

    return mutations
82
+
83
# Drug-resistance genes of M. tuberculosis offered for analysis.
# Maps each gene name to its 'start' / 'end' genome position and the
# 'description' shown next to the name in the gene selector.
IMPORTANT_GENES = {
    'rpoB': dict(
        start=759807,
        end=763325,
        description='RNA polymerase β subunit (Rifampicin resistance)',
    ),
    'katG': dict(
        start=2153889,
        end=2156111,
        description='Catalase-peroxidase (Isoniazid resistance)',
    ),
    'inhA': dict(
        start=1674202,
        end=1675011,
        description='Enoyl-ACP reductase (Isoniazid resistance)',
    ),
    'gyrA': dict(
        start=7302,
        end=9818,
        description='DNA gyrase subunit A (Fluoroquinolone resistance)',
    ),
}
90
+
91
def create_mutation_dataframe(mutations):
    """
    Turn a list of mutation dicts into a table for display and export.

    Each mutation contributes one row; the 'context' entry is not carried
    over. An empty (or missing) list yields an empty DataFrame without
    columns.
    """
    # (output column, key in the mutation dict), in display order.
    column_sources = (
        ('Position', 'position'),
        ('Gene Position', 'gene_position'),
        ('Type', 'type'),
        ('Reference', 'ref_base'),
        ('Query', 'query_base'),
        ('Codon Position', 'codon_position'),
    )

    if mutations:
        rows = [
            {column: record[key] for column, key in column_sources}
            for record in mutations
        ]
        return pd.DataFrame(rows)

    return pd.DataFrame()
109
+
110
def plot_mutation_distribution(df):
    """
    Build a scatter plot of mutation type against genome position.

    Points are placed at their 'Position' on the x-axis and their 'Type' on
    the y-axis, and are coloured by 'Type'. Returns the figure, or None when
    the table is empty.
    """
    if not df.empty:
        axis_labels = {'Position': 'Genome Position', 'Type': 'Mutation Type'}
        return px.scatter(
            df,
            x='Position',
            y='Type',
            color='Type',
            title='Mutation Distribution',
            labels=axis_labels,
        )
    return None
124
+
125
def main():
    """
    Render the Streamlit page and run the comparison on request.

    The page shows two FASTA uploaders and a gene selector. Once both files
    are present an "Analyze Genomes" button appears; pressing it reads both
    genomes, extracts the selected gene region from each, looks for
    mutations and displays summary metrics, a plot, a table and a CSV
    download. Each failure stage is reported with its own error message.
    """
    st.title("M. tuberculosis Genome Comparison Tool")

    st.markdown("""
    This tool compares two M. tuberculosis genomes and identifies mutations in important genes.
    Upload your reference genome (typically H37Rv) and your query genome (wild type/clinical isolate) in FASTA format.
    """)

    # Side-by-side uploaders for the two genomes.
    left_col, right_col = st.columns(2)
    with left_col:
        reference_file = st.file_uploader("Upload Reference Genome (FASTA)", type=['fasta', 'fa'])
    with right_col:
        query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])

    def describe_gene(name):
        # Label shown in the selector: gene name followed by its description.
        return f"{name} - {IMPORTANT_GENES[name]['description']}"

    selected_gene = st.selectbox(
        "Select gene to analyze",
        options=list(IMPORTANT_GENES),
        format_func=describe_gene,
    )

    # The analyze button is only offered once both files have been uploaded.
    if not (reference_file and query_file):
        return
    if not st.button("Analyze Genomes"):
        return

    with st.spinner("Analyzing genomes..."):
        ref_genome = read_fasta_from_upload(reference_file)
        query_genome = read_fasta_from_upload(query_file)
        if not (ref_genome and query_genome):
            st.error("Error reading genome files")
            return

        gene_info = IMPORTANT_GENES[selected_gene]
        gene_start = gene_info['start']
        gene_end = gene_info['end']

        # The same coordinates are applied to both genomes.
        ref_region, ref_start = extract_gene_region(ref_genome, gene_start, gene_end)
        query_region, _ = extract_gene_region(query_genome, gene_start, gene_end)
        if not (ref_region and query_region):
            st.error("Error extracting gene regions")
            return

        mutations = find_mutations_with_context(
            ref_region, query_region,
            gene_start, gene_end,
            ref_start
        )

        st.subheader("Analysis Results")

        st.markdown("### Summary Statistics")
        snp_count = sum(1 for m in mutations if m['type'] == 'SNP')
        indel_count = sum(1 for m in mutations if m['type'] == 'INDEL')

        total_col, snp_col, indel_col = st.columns(3)
        total_col.metric("Total Mutations", len(mutations))
        snp_col.metric("SNPs", snp_count)
        indel_col.metric("INDELs", indel_count)

        df = create_mutation_dataframe(mutations)
        if df.empty:
            st.info(f"No mutations found in {selected_gene}")
            return

        st.plotly_chart(plot_mutation_distribution(df))

        st.markdown("### Detailed Mutation Analysis")
        st.dataframe(df)

        csv_text = df.to_csv(index=False)
        st.download_button(
            "Download Results CSV",
            csv_text,
            "mutations.csv",
            "text/csv",
            key='download-csv'
        )
211
+
212
# Build the page only when this file is run as the main script, so that
# importing the module does not start the app.
if __name__ == "__main__":
    main()