Spaces:
Sleeping
Sleeping
| # Import libraries | |
| import streamlit as st | |
| import pandas as pd | |
| from Bio import SeqIO | |
| from Bio.SeqUtils.ProtParam import ProteinAnalysis | |
| from Bio.Graphics.GenomeDiagram import Diagram, Track, FeatureSet | |
| from reportlab.lib import colors | |
| from reportlab.lib.units import cm | |
| from io import StringIO | |
| from collections import Counter | |
| import numpy as np | |
| import altair as alt | |
| import os | |
| import re | |
| import plotly.express as px | |
# Ensure the 'temp' directory exists for saving temporary files.
# It is created relative to the current working directory; exist_ok=True
# makes this safe to run again when the directory is already present.
temp_dir = "temp"
os.makedirs(temp_dir, exist_ok=True)
# Function to parse GenBank file
def parse_genbank(uploaded_file):
    """Read one GenBank record from an uploaded file and summarise it.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the file content
            as bytes; the bytes are decoded as UTF-8.

    Returns:
        A 7-tuple ``(organism, gene_info, cds_info, gc_content, length,
        feature_types, sequence)``:

        * organism -- the record's ``organism`` annotation, or
          ``'Unknown Organism'`` when absent.
        * gene_info -- one dict per ``gene`` feature with the keys
          ``Gene``, ``Length``, ``Location`` and ``Sequence``.
        * cds_info -- one dict per ``CDS`` feature with the keys
          ``Gene``, ``Protein``, ``Length`` and ``Location``.
        * gc_content -- percentage of ``G``/``C`` characters in the sequence.
        * length -- length of the record's sequence.
        * feature_types -- ``Counter`` of feature type names.
        * sequence -- the record's sequence as a plain string.
    """
    handle = StringIO(uploaded_file.getvalue().decode("utf-8"))
    record = SeqIO.read(handle, "genbank")

    organism = record.annotations.get('organism', 'Unknown Organism')
    feature_types = Counter(feature.type for feature in record.features)

    gene_features = [f for f in record.features if f.type == "gene"]
    cds_features = [f for f in record.features if f.type == "CDS"]

    gene_info = []
    for gene in gene_features:
        gene_info.append({
            # Missing qualifiers fall back to the 'N/A' placeholder.
            'Gene': gene.qualifiers.get('gene', ['N/A'])[0],
            'Length': len(gene),
            'Location': str(gene.location),
            'Sequence': str(gene.extract(record.seq)),
        })

    cds_info = []
    for coding in cds_features:
        cds_info.append({
            'Gene': coding.qualifiers.get('gene', ['N/A'])[0],
            'Protein': coding.qualifiers.get('translation', ['N/A'])[0],
            'Length': len(coding),
            'Location': str(coding.location),
        })

    genome = str(record.seq)
    total_length = len(record.seq)
    gc_content = (genome.count('G') + genome.count('C')) / total_length * 100

    return (
        organism,
        gene_info,
        cds_info,
        gc_content,
        total_length,
        feature_types,
        genome,
    )
# Function to calculate GC content over genome
def calculate_gc_content(sequence, window_size=1000):
    """Return the GC percentage of consecutive, non-overlapping windows.

    Args:
        sequence: Sequence to scan; only the characters ``'G'`` and ``'C'``
            are counted.
        window_size: Number of characters per window.

    Returns:
        A list with one percentage (0-100) per complete window, in order of
        position. A trailing stretch shorter than ``window_size`` is not
        included, so a sequence shorter than one window yields ``[]``.
    """
    percentages = []
    last_start = len(sequence) - window_size
    for start in range(0, last_start + 1, window_size):
        window = sequence[start:start + window_size]
        gc_count = window.count('G') + window.count('C')
        percentages.append(gc_count / window_size * 100)
    return percentages
# Function to calculate k-mers
def calculate_kmers(sequence, k):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Args:
        sequence: Sequence to scan.
        k: Length of the substrings to count.

    Returns:
        A ``Counter`` mapping each k-mer to its number of occurrences, with
        keys in order of first appearance. It is empty when ``sequence`` is
        shorter than ``k``.
    """
    counts = Counter()
    for start in range(len(sequence) - k + 1):
        counts[sequence[start:start + k]] += 1
    return counts
# Function to add molecular weight and isoelectric point to CDS information
def add_protein_features(cds_info):
    """Annotate each CDS entry with molecular weight and isoelectric point.

    Each dict in ``cds_info`` is updated in place with the keys
    ``'Molecular Weight'`` and ``'Isoelectric Point'``. Entries whose
    ``'Protein'`` value is the ``'N/A'`` placeholder get ``'N/A'`` for both.

    The same ``'N/A'`` fallback is used when the protein analysis rejects
    the translation (for example because it contains residues that have no
    defined mass), so one unusual CDS does not abort the whole upload.

    Args:
        cds_info: List of dicts, each with a ``'Protein'`` key.

    Returns:
        The same ``cds_info`` list, for convenience.
    """
    for cds in cds_info:
        mol_weight = iso_point = 'N/A'
        protein = cds['Protein']
        if protein != 'N/A':
            try:
                analysis = ProteinAnalysis(protein)
                mol_weight = analysis.molecular_weight()
                iso_point = analysis.isoelectric_point()
            except (ValueError, KeyError):
                # Reset both so a half-computed pair is never reported.
                mol_weight = iso_point = 'N/A'
        cds['Molecular Weight'] = mol_weight
        cds['Isoelectric Point'] = iso_point
    return cds_info
# Updated Function to generate genome diagram
def create_genome_diagram(genbank_content, output_file_path, colors_dict, diagram_type="linear", diagram_size=(30, 10)):
    """Draw the features of a GenBank record and write the result as SVG.

    Relies on the module-level imports of ``SeqIO``, ``StringIO``,
    ``Diagram``, ``colors`` and ``cm``.

    Args:
        genbank_content: GenBank file content as a string.
        output_file_path: Path the SVG file is written to.
        colors_dict: Mapping of feature type name (e.g. ``'gene'``) to a hex
            colour string. Only feature types present in this mapping are
            drawn; each gets its own track, numbered from 1 in mapping order.
        diagram_type: ``"circular"`` (case-insensitive) for a circular
            layout; any other value produces a linear layout.
        diagram_size: ``(width, height)`` of the page in centimetres.

    Returns:
        None. The diagram is written to ``output_file_path``.
    """
    record = SeqIO.read(StringIO(genbank_content), "genbank")
    gd_diagram = Diagram(record.id)

    # Create one track, holding a single feature set, per feature type.
    feature_sets = {}
    for track_number, feature_type in enumerate(colors_dict, start=1):
        track = gd_diagram.new_track(
            track_number, name=feature_type, scale=False, greytrack=False, height=0.5
        )
        feature_sets[feature_type] = track.new_set()

    for feature in record.features:
        feature_type = feature.type
        if feature_type not in colors_dict:
            continue
        feature_sets[feature_type].add_feature(
            feature,
            color=colors.HexColor(colors_dict[feature_type]),
            label=True,
            label_size=6,  # Small labels to limit overlap
            label_angle=0,
            label_position="start",  # Position label at the start of the feature
            # Read the strand from the location rather than the feature's
            # own ``strand`` shortcut; anything not on the forward strand is
            # labelled on the reverse side.
            label_strand=1 if feature.location.strand == 1 else -1,
            sigil="ARROW",  # Use arrows to represent features
            arrowshaft_height=1.0,
            arrowhead_length=1.0,
        )

    page_size = (diagram_size[0] * cm, diagram_size[1] * cm)
    if diagram_type.lower() == "circular":
        gd_diagram.draw(
            format="circular",
            circular=True,
            pagesize=page_size,
            start=0,
            end=len(record),
            circle_core=0.7,
        )
    else:
        gd_diagram.draw(
            format="linear",
            pagesize=page_size,
            fragments=1,
            start=0,
            end=len(record),
            tracklines=False,  # Remove track lines to reduce clutter
        )
    gd_diagram.write(output_file_path, "SVG")
# Function to search for a motif or pattern within the DNA sequence
def search_motif(sequence, motif):
    """Find every non-overlapping occurrence of ``motif`` in ``sequence``.

    Args:
        sequence: Text to search.
        motif: Regular-expression pattern (a plain motif such as ``"ATG"``
            is a valid pattern). Matching is case-sensitive.

    Returns:
        A list of ``(start, end)`` index pairs, ``end`` exclusive, in order
        of position. An empty ``motif`` yields ``[]``: an empty pattern
        matches at every position and would otherwise report
        ``len(sequence) + 1`` zero-width hits.

    Raises:
        re.error: If ``motif`` is not a valid regular expression.
    """
    if not motif:
        return []
    pattern = re.compile(motif)
    return [(match.start(), match.end()) for match in pattern.finditer(sequence)]
# Function to calculate codon usage frequencies
def calculate_codon_usage(sequence):
    """Return the percentage frequency of each codon read from position 0.

    The sequence is cut into consecutive triplets starting at index 0;
    trailing characters that do not form a full triplet are ignored.

    Args:
        sequence: Sequence to split into codons.

    Returns:
        A dict mapping each codon to its share of all codons as a
        percentage, with keys in order of first appearance. It is empty when
        the sequence has fewer than three characters.
    """
    tally = Counter()
    for start in range(0, len(sequence) - 2, 3):
        triplet = sequence[start:start + 3]
        if len(triplet) == 3:  # Ensure codons are of length 3
            tally[triplet] += 1

    total = sum(tally.values())
    usage = {}
    for codon, count in tally.items():
        usage[codon] = (count / total) * 100
    return usage
# Streamlit UI setup
# NOTE(review): the nesting of widgets under the sidebar and the two columns
# is reconstructed from the comments and statement order — confirm the layout.
st.set_page_config(page_title="Genomic Data Dashboard", page_icon="🧬", layout="wide")
uploaded_file = st.file_uploader("Upload a GenBank file", type=['gb', 'gbk'])

if uploaded_file is not None:
    organism, gene_info, cds_info, gc_content, sequence_length, feature_types, sequence = parse_genbank(uploaded_file)
    cds_info = add_protein_features(cds_info)
    gene_df = pd.DataFrame(gene_info)
    cds_df = pd.DataFrame(cds_info)

    # Sidebar: analysis parameters and diagram settings
    with st.sidebar:
        st.title('Genomic Data Dashboard')
        st.write(f'**Organism:** {organism}')
        window_size = st.number_input('GC content sliding window size', min_value=100, max_value=10000, value=1000)
        k = st.number_input('k-mer size', min_value=1, max_value=10, value=6)
        motif = st.text_input("Enter motif or pattern to search")

        st.markdown("### Genome Diagram Settings")
        diagram_type = st.radio("Select Diagram Type:", ["Linear", "Circular"], index=0)
        diagram_width = st.number_input("Diagram Width (cm):", min_value=1, max_value=50, value=30)
        diagram_height = st.number_input("Diagram Height (cm):", min_value=1, max_value=50, value=10)

        # Sidebar options for diagram customization
        st.markdown("### Feature Colors")
        color_gene = st.color_picker("Pick a color for genes", '#ff9999')
        color_cds = st.color_picker("Pick a color for CDS", '#66b3ff')
        color_trna = st.color_picker("Pick a color for tRNA", '#99ff99')
        color_rrna = st.color_picker("Pick a color for rRNA", '#ffcc99')

    # Main content
    col1, col2 = st.columns(2)
    with col1:
        st.markdown('### General Information')
        st.write(f'**Organism:** {organism}')
        st.write(f'**Sequence Length:** {sequence_length} bp')
        st.write(f'**GC Content:** {gc_content:.2f}%')
        st.write(f'**Number of Genes:** {len(gene_df)}')
        st.write(f'**Number of Coding Sequences (CDS):** {len(cds_df)}')

        st.markdown('### Feature Counts')
        for feature_type, count in feature_types.items():
            st.write(f"**{feature_type}:** {count}")

    with col2:
        st.markdown('### GC Content Over Genome')
        gc_content_over_genome = calculate_gc_content(sequence, window_size)
        gc_chart = alt.Chart(pd.DataFrame({
            'GC Content': gc_content_over_genome,
            # Each value is plotted at the start position of its window.
            'Position': np.arange(len(gc_content_over_genome)) * window_size
        })).mark_line().encode(
            x='Position:Q',
            y='GC Content:Q'
        ).properties(height=200)
        st.altair_chart(gc_chart, use_container_width=True)

        st.markdown('### K-mer Analysis')
        kmers = calculate_kmers(sequence, k)
        # Show only the 20 most frequent k-mers.
        kmer_df = pd.DataFrame.from_dict(kmers, orient='index', columns=['Frequency']).sort_values('Frequency', ascending=False).head(20)
        st.bar_chart(kmer_df)

    # Construct the colors dictionary
    feature_colors = {
        'gene': color_gene,
        'CDS': color_cds,
        'tRNA': color_trna,
        'rRNA': color_rrna
    }

    # Generate and display genome diagram with user-selected feature colors
    output_file_path_svg = os.path.join(temp_dir, "genome_diagram.svg")
    create_genome_diagram(
        uploaded_file.getvalue().decode("utf-8"),
        output_file_path_svg,
        feature_colors,
        diagram_type=diagram_type.lower(),
        diagram_size=(diagram_width, diagram_height)
    )
    st.image(output_file_path_svg, caption='Genome Diagram')

    # Motif Search
    if motif:
        try:
            matches = search_motif(sequence, motif)
        except re.error as exc:
            # The motif is free text treated as a regular expression; report
            # a malformed pattern instead of crashing the page.
            st.error(f"Invalid motif pattern: {exc}")
        else:
            st.markdown(f"### Motif Search Results for '{motif}'")
            if matches:
                st.write("Matches found at positions:")
                matches_df = pd.DataFrame(matches, columns=["Start", "End"])
                st.dataframe(matches_df)
            else:
                st.write("No matches found.")

    # Codon Usage Analysis
    st.markdown("### Codon Usage Analysis")
    codon_usage_freq = calculate_codon_usage(sequence)
    codon_usage_df = pd.DataFrame.from_dict(codon_usage_freq, orient='index', columns=['Frequency (%)'])
    codon_usage_df.index.name = 'Codon'
    codon_usage_df.reset_index(inplace=True)
    st.dataframe(codon_usage_df)

    # Interactive Visualization
    st.markdown("### Interactive Visualization")
    fig = px.bar(
        codon_usage_df,
        x='Codon',
        y='Frequency (%)',
        labels={'Codon': 'Codon', 'Frequency (%)': 'Frequency (%)'},
        title="Codon Usage Frequency"
    )
    st.plotly_chart(fig)

    # Additional Information
    with st.expander("View All Genes"):
        st.dataframe(gene_df)
    with st.expander("View All Coding Sequences"):
        # reindex (rather than cds_df[[...]]) so a record without any CDS
        # features shows an empty table instead of raising KeyError on the
        # column-less DataFrame.
        st.dataframe(cds_df.reindex(columns=['Gene', 'Length', 'Molecular Weight', 'Isoelectric Point']))
else:
    st.warning("Please upload a GenBank file.")

# Add copyright information section at the end of the main page
st.markdown("""
---
**Copyright Notice**: © 2024 Dr. Yash Munnalal Gupta. All rights reserved.
For inquiries or permissions, contact: [yash.610@live.com](mailto:yash.610@live.com)
""", unsafe_allow_html=True)