Spaces:

lyimo
/

dnaseq

Build error

App Files Files Community

lyimo committed on Jan 6, 2025

Commit

129decb

verified ·

1 Parent(s): af7dc6d

Create app.py

Browse files

Files changed (1) hide show

app.py +213 -0

app.py ADDED Viewed

	@@ -0,0 +1,213 @@

+# app.py
+import streamlit as st
+from Bio import pairwise2
+import re
+from collections import defaultdict
+import pandas as pd
+import plotly.express as px
+import io
def read_fasta_from_upload(uploaded_file):
    """
    Read the first record of a FASTA file from a Streamlit upload.

    The first line is treated as the FASTA header and discarded. The lines
    that follow are joined into one sequence until the next header line
    (a line starting with '>') or the end of the file, so only the first
    record of a multi-record file is returned.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the file content
            as UTF-8 encoded bytes.

    Returns:
        The upper-cased sequence with all whitespace removed, or ``None``
        (after reporting the problem with ``st.error``) when the content
        cannot be read/decoded or holds no sequence data.
    """
    try:
        text = uploaded_file.getvalue().decode('utf-8')
    except Exception as e:
        # UI boundary: report any read/decode failure instead of crashing.
        st.error(f"Error reading uploaded file: {str(e)}")
        return None

    # splitlines() handles \n, \r\n and \r alike, so files saved on Windows
    # do not leave stray carriage returns inside the sequence.
    lines = text.strip().splitlines()

    pieces = []
    for line in lines[1:]:
        if line.lstrip().startswith('>'):
            # Start of a second record: stop rather than mixing its header
            # text and bases into the first sequence.
            break
        # str.split() with no argument drops every kind of whitespace
        # (spaces, tabs, leftover line-ending characters).
        pieces.append(''.join(line.split()))

    sequence = ''.join(pieces).upper()
    if not sequence:
        st.error("Error reading uploaded file: no sequence data found after the FASTA header")
        return None
    return sequence
def extract_gene_region(genome_seq, gene_start, gene_end, flank=200):
    """
    Extract a gene region together with flanking context.

    Args:
        genome_seq: Full genome sequence to slice from.
        gene_start: Gene start coordinate.
        gene_end: Gene end coordinate.
        flank: How far to extend the slice on each side of the gene
            coordinates. Defaults to 200.

    Returns:
        A ``(region, start)`` tuple where ``region`` is
        ``genome_seq[start:end]`` with ``start = max(0, gene_start - flank)``
        and ``end = min(len(genome_seq), gene_end + flank)``. ``start`` is
        returned so callers can map positions in the region back to the
        genome. On failure the error is reported with ``st.error`` and
        ``(None, None)`` is returned.
    """
    try:
        # Clamp both ends so a gene near a sequence boundary still yields a
        # valid (possibly shorter) slice.
        start = max(0, gene_start - flank)
        end = min(len(genome_seq), gene_end + flank)
        return genome_seq[start:end], start
    except Exception as e:
        # UI boundary: report instead of crashing the app.
        st.error(f"Error extracting gene region: {str(e)}")
        return None, None
def _scan_alignment_for_mutations(ref_aligned, query_aligned, gene_start, gene_end, offset):
    """
    Walk two aligned sequences column by column and collect the differences
    whose genome position falls inside [gene_start, gene_end].
    """
    mutations = []
    real_pos = 0  # count of reference bases seen so far (gap columns excluded)
    for i, (ref_base, query_base) in enumerate(zip(ref_aligned, query_aligned)):
        if ref_base != '-':
            real_pos += 1
        if ref_base == query_base:
            continue

        adj_pos = offset + real_pos
        if not gene_start <= adj_pos <= gene_end:
            continue

        gene_position = adj_pos - gene_start + 1
        ctx_start = max(0, i - 5)
        mutations.append({
            'position': adj_pos,
            'gene_position': gene_position,
            'ref_base': ref_base,
            'query_base': query_base if query_base != '-' else 'None',
            'type': 'SNP' if ref_base != '-' and query_base != '-' else 'INDEL',
            # Phase within the codon is derived from the gene-relative
            # position, not from the index inside the flank-padded region.
            # NOTE(review): assumes the reading frame starts at gene_start on
            # the forward strand - confirm for reverse-strand genes.
            'codon_position': (gene_position - 1) % 3 + 1,
            # Up to five aligned columns either side, with the differing
            # column wrapped in brackets.
            'context': {
                'ref': ref_aligned[ctx_start:i] + '[' + ref_base + ']' + ref_aligned[i + 1:i + 6],
                'query': query_aligned[ctx_start:i] + '[' + query_base + ']' + query_aligned[i + 1:i + 6],
            },
        })
    return mutations


def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
    """
    Find mutations between two sequences, with surrounding sequence context.

    The sequences are globally aligned with ``pairwise2.align.globalms``
    (match 2, mismatch -3, gap open -10, gap extend -0.5) and the first
    returned alignment is scanned for differing columns.

    Args:
        ref_seq: Reference sequence for the region.
        query_seq: Query sequence for the region.
        gene_start: First genome position to report mutations for.
        gene_end: Last genome position to report mutations for (inclusive).
        offset: Value added to the running count of reference bases to
            obtain the reported genome position. Defaults to 0.

    Returns:
        A list of dicts, one per differing alignment column inside the gene,
        with keys ``position``, ``gene_position``, ``ref_base``,
        ``query_base`` (the string ``'None'`` when the query has a gap),
        ``type`` (``'SNP'`` or ``'INDEL'``), ``codon_position`` (1-3) and
        ``context``. Returns an empty list when no alignment is produced or
        when an error occurs (the error is reported with ``st.error``).
    """
    try:
        alignments = pairwise2.align.globalms(ref_seq, query_seq,
                                              match=2,
                                              mismatch=-3,
                                              open=-10,
                                              extend=-0.5)
        if not alignments:
            st.warning("No alignments found")
            return []
        best = alignments[0]
        ref_aligned, query_aligned = best[0], best[1]
        return _scan_alignment_for_mutations(
            ref_aligned, query_aligned, gene_start, gene_end, offset
        )
    except Exception as e:
        # UI boundary: report instead of crashing the app.
        st.error(f"Error in mutation analysis: {str(e)}")
        return []
# M. tuberculosis genes offered for analysis, keyed by gene name. Each entry
# carries the genome coordinates of the gene and a human-readable description.
# NOTE(review): coordinates are presumably for the H37Rv reference - confirm.
IMPORTANT_GENES = {
    'rpoB': {
        'start': 759807,
        'end': 763325,
        'description': 'RNA polymerase β subunit (Rifampicin resistance)',
    },
    'katG': {
        'start': 2153889,
        'end': 2156111,
        'description': 'Catalase-peroxidase (Isoniazid resistance)',
    },
    'inhA': {
        'start': 1674202,
        'end': 1675011,
        'description': 'Enoyl-ACP reductase (Isoniazid resistance)',
    },
    'gyrA': {
        'start': 7302,
        'end': 9818,
        'description': 'DNA gyrase subunit A (Fluoroquinolone resistance)',
    },
}
def create_mutation_dataframe(mutations):
    """
    Build a pandas DataFrame with one row per mutation record.

    Returns an empty DataFrame when ``mutations`` is empty. Otherwise the
    columns are, in order: Position, Gene Position, Type, Reference, Query,
    Codon Position.
    """
    if not mutations:
        return pd.DataFrame()

    # (output column name, key in the mutation dict), in display order.
    column_sources = (
        ('Position', 'position'),
        ('Gene Position', 'gene_position'),
        ('Type', 'type'),
        ('Reference', 'ref_base'),
        ('Query', 'query_base'),
        ('Codon Position', 'codon_position'),
    )
    rows = [
        {column: record[key] for column, key in column_sources}
        for record in mutations
    ]
    return pd.DataFrame(rows)
def plot_mutation_distribution(df):
    """
    Build a scatter plot of mutation type against genome position.

    Returns ``None`` for an empty DataFrame; otherwise the figure produced by
    ``px.scatter`` with points coloured by mutation type.
    """
    if df.empty:
        return None

    axis_labels = {'Position': 'Genome Position', 'Type': 'Mutation Type'}
    return px.scatter(
        df,
        x='Position',
        y='Type',
        color='Type',
        title='Mutation Distribution',
        labels=axis_labels,
    )
def main():
    """
    Render the Streamlit page: collect the two genome uploads and the gene
    choice, run the comparison when requested, and display the results.
    """
    st.title("M. tuberculosis Genome Comparison Tool")
    st.markdown("""
    This tool compares two M. tuberculosis genomes and identifies mutations in important genes.
    Upload your reference genome (typically H37Rv) and your query genome (wild type/clinical isolate) in FASTA format.
    """)

    # Side-by-side uploaders, one per genome.
    left_col, right_col = st.columns(2)
    with left_col:
        reference_file = st.file_uploader("Upload Reference Genome (FASTA)", type=['fasta', 'fa'])
    with right_col:
        query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])

    def describe_gene(name):
        # Label shown in the gene drop-down.
        return f"{name} - {IMPORTANT_GENES[name]['description']}"

    selected_gene = st.selectbox(
        "Select gene to analyze",
        options=list(IMPORTANT_GENES.keys()),
        format_func=describe_gene,
    )

    # Nothing more to show until both files are present and the user asks
    # for the analysis.
    if not (reference_file and query_file):
        return
    if not st.button("Analyze Genomes"):
        return

    with st.spinner("Analyzing genomes..."):
        ref_genome = read_fasta_from_upload(reference_file)
        query_genome = read_fasta_from_upload(query_file)
        if not (ref_genome and query_genome):
            st.error("Error reading genome files")
            return

        gene_info = IMPORTANT_GENES[selected_gene]
        gene_start = gene_info['start']
        gene_end = gene_info['end']

        # The same coordinates are applied to both genomes.
        ref_region, ref_start = extract_gene_region(ref_genome, gene_start, gene_end)
        query_region, _ = extract_gene_region(query_genome, gene_start, gene_end)
        if not (ref_region and query_region):
            st.error("Error extracting gene regions")
            return

        mutations = find_mutations_with_context(
            ref_region, query_region,
            gene_start, gene_end,
            ref_start
        )

        st.subheader("Analysis Results")

        st.markdown("### Summary Statistics")
        mutation_count = len(mutations)
        snp_count = sum(1 for m in mutations if m['type'] == 'SNP')
        indel_count = sum(1 for m in mutations if m['type'] == 'INDEL')
        total_col, snp_col, indel_col = st.columns(3)
        total_col.metric("Total Mutations", mutation_count)
        snp_col.metric("SNPs", snp_count)
        indel_col.metric("INDELs", indel_count)

        df = create_mutation_dataframe(mutations)
        if df.empty:
            st.info(f"No mutations found in {selected_gene}")
            return

        st.plotly_chart(plot_mutation_distribution(df))

        st.markdown("### Detailed Mutation Analysis")
        st.dataframe(df)

        csv_text = df.to_csv(index=False)
        st.download_button(
            "Download Results CSV",
            csv_text,
            "mutations.csv",
            "text/csv",
            key='download-csv'
        )
# Start the app only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()