Spaces:
Running
Running
Halper-Stromberg
Add window-based variant placement strategy for Direct Probe Coordinates mode
2a68195 | import streamlit as st | |
| import os | |
| import tempfile | |
| import subprocess | |
| from pathlib import Path | |
| import pipeline as pl | |
| import json | |
| import shutil | |
def _read_navigator_variants(bed_path):
    """Parse the variant navigator BED into the entries shown in the IGV side panel.

    Comment lines, blank lines, lines with fewer than four tab-separated
    fields, and lines whose start/end are not integers are skipped.

    Returns a list of dicts with the keys ``locus`` (the variant padded by
    50 bp on each side, start clamped to 1), ``name`` (the text after the
    last underscore of the BED label), ``label`` and ``pos_label``.
    """
    variants = []
    if not bed_path.exists():
        return variants
    with open(bed_path, "r") as handle:
        for raw_line in handle:
            if raw_line.startswith("#") or not raw_line.strip():
                continue
            fields = raw_line.strip().split("\t")
            if len(fields) < 4:
                continue
            chrom, label = fields[0], fields[3]
            try:
                start, end = int(fields[1]), int(fields[2])
            except ValueError:
                # A malformed coordinate should not take down the whole results page.
                continue
            variants.append({
                "locus": f"{chrom}:{max(1, start - 50)}-{end + 50}",
                "name": label.split("_")[-1],
                "label": label,
                "pos_label": f"{chrom}:{start + 1}",
            })
    return variants


def _build_igv_tracks(res, work_dir_name):
    """Assemble the igv.js track configuration for one pipeline run.

    The MANE and Probes annotation tracks are only added when the matching
    path in ``res`` is set and exists on disk, so the length and ordering of
    the returned list vary between runs.
    """
    static_base = f"/app/static/{work_dir_name}"

    def _recorded_file_exists(key):
        recorded = res.get(key)
        return recorded is not None and Path(recorded).exists()

    tracks = [
        {
            "name": "Reference",
            "type": "sequence",
            "order": 1,
        }
    ]
    if _recorded_file_exists("mane_transcripts_bed_path"):
        tracks.append({
            "name": "MANE Transcripts",
            "type": "annotation",
            "format": "bed",
            "url": f"{static_base}/mane_transcripts.bed",
            "indexed": False,
            "order": 1.5,
            "color": "green",
            "displayMode": "EXPANDED",
        })
    if _recorded_file_exists("fully_covered_bed_path"):
        tracks.append({
            "name": "Probes BED",
            "type": "annotation",
            "format": "bed",
            "url": f"{static_base}/fully_covered_exons.bed",
            "indexed": False,
            "order": 2,
            "color": "blue",
        })
    tracks.extend([
        {
            "name": "Variant Navigator BED",
            "type": "annotation",
            "format": "bed",
            "url": f"{static_base}/igv_variant_navigator.bed",
            "indexed": False,
            "order": 3,
            "color": "red",
        },
        {
            "name": "Synthetic VCF",
            "type": "variant",
            "format": "vcf",
            "url": f"{static_base}/synthetic.vcf",
            "indexed": False,
            "order": 4,
        },
        {
            "name": "Synthetic BAM",
            "type": "alignment",
            "format": "bam",
            "url": f"{static_base}/synthetic.sorted.bam",
            "indexURL": f"{static_base}/synthetic.sorted.bam.bai",
            "order": 5,
            "height": 300,
        },
    ])
    return tracks


def render_igv(res):
    """Render an embedded igv.js browser with a clickable variant navigator.

    Args:
        res: The results dict stored by the pipeline run. Reads
            ``work_dir_name``, ``genome_version`` (default ``"hg38"``),
            ``igv_bed_path``, ``fully_covered_bed_path`` and
            ``mane_transcripts_bed_path``.

    Shows a Streamlit warning and returns early when ``work_dir_name`` is
    missing; otherwise emits one HTML component containing the side panel and
    the IGV browser.
    """
    work_dir_name = res.get("work_dir_name")
    genome_version = res.get("genome_version", "hg38")
    if not work_dir_name:
        st.warning("No static files path found for IGV.js. Restart the pipeline to generate.")
        return

    variants = _read_navigator_variants(Path(res["igv_bed_path"]))
    tracks = _build_igv_tracks(res, work_dir_name)

    def _as_script_literal(value):
        # "</" inside an inline <script> could terminate the element early;
        # "<\/" is an equivalent JSON/JS escape that cannot.
        return json.dumps(value).replace("</", "<\\/")

    variants_json = _as_script_literal(variants)
    tracks_json = _as_script_literal(tracks)
    genome_json = _as_script_literal(genome_version)

    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
    <style>
    body {{
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
    margin: 0;
    padding: 0;
    display: flex;
    height: 600px;
    background-color: #ffffff;
    }}
    #sidebar {{
    width: 250px;
    border-right: 1px solid #e0e0e0;
    display: flex;
    flex-direction: column;
    height: 100%;
    background-color: #f8f9fa;
    }}
    #sidebar-header {{
    padding: 10px;
    background-color: #0e1117;
    color: white;
    font-weight: bold;
    font-size: 14px;
    }}
    #variant-list {{
    flex-grow: 1;
    overflow-y: auto;
    padding: 5px;
    }}
    .variant-item {{
    padding: 8px 10px;
    margin-bottom: 4px;
    border-radius: 4px;
    cursor: pointer;
    border: 1px solid #e9ecef;
    background-color: white;
    font-size: 12px;
    transition: background-color 0.2s;
    }}
    .variant-item:hover {{
    background-color: #e9ecef;
    }}
    .variant-name {{
    font-weight: bold;
    color: #ff4b4b;
    }}
    .variant-pos {{
    color: #6c757d;
    margin-top: 2px;
    }}
    #igv-container {{
    flex-grow: 1;
    height: 100%;
    overflow: hidden;
    }}
    #igv-div {{
    height: 600px;
    width: 100%;
    }}
    </style>
    </head>
    <body>
    <div id="sidebar">
    <div id="sidebar-header"><i class="fas fa-list"></i> Variant Navigator ({len(variants)})</div>
    <div id="variant-list"></div>
    </div>
    <div id="igv-container">
    <div id="igv-div"></div>
    </div>
    <script src="https://cdn.jsdelivr.net/npm/igv@2.15.5/dist/igv.min.js"></script>
    <script>
    // Escape arbitrary text for insertion into markup.
    function escapeHtml(text) {{
        var holder = document.createElement("div");
        holder.textContent = String(text);
        return holder.innerHTML;
    }}
    var variants = {variants_json};
    var listContainer = document.getElementById("variant-list");
    variants.forEach(function(v) {{
        var item = document.createElement("div");
        item.className = "variant-item";
        // textContent keeps BED-derived labels from being interpreted as markup.
        var nameDiv = document.createElement("div");
        nameDiv.className = "variant-name";
        nameDiv.textContent = v.name.toUpperCase();
        var posDiv = document.createElement("div");
        posDiv.className = "variant-pos";
        posDiv.textContent = v.pos_label;
        item.appendChild(nameDiv);
        item.appendChild(posDiv);
        item.onclick = function() {{
            if (window.igvBrowser) {{
                window.igvBrowser.search(v.locus);
            }}
        }};
        listContainer.appendChild(item);
    }});
    var options = {{
        genome: {genome_json},
        locus: variants.length > 0 ? variants[0].locus : "chr1:1787315-1787437",
        tracks: {tracks_json}
    }};
    var igvDiv = document.getElementById("igv-div");
    igv.createBrowser(igvDiv, options)
    .then(function (browser) {{
        window.igvBrowser = browser;
        console.log("IGV browser created successfully.");
    }})
    .catch(function(err) {{
        console.error("Error creating IGV browser:", err);
        // The optional MANE/Probes tracks shift positions in options.tracks,
        // so list every track that has a URL instead of indexing by position.
        var attempted = options.tracks
            .filter(function(t) {{ return t.url; }})
            .map(function(t) {{
                var kind = (t.format || t.type || "").toUpperCase();
                return "<li>" + escapeHtml(kind) + ": <code>" + escapeHtml(t.url) + "</code></li>";
            }})
            .join("");
        document.getElementById("igv-div").innerHTML =
            "<div style='color:#721c24; background-color:#f8d7da; border:1px solid #f5c6cb; padding:20px; border-radius:4px; font-family:sans-serif; margin:20px;'>" +
            "<h3>β Error Loading IGV Browser</h3>" +
            "<p><b>Message:</b> " + escapeHtml(err) + "</p>" +
            "<p>This usually indicates static file serving is not enabled or files are not accessible.</p>" +
            "<p><b>Paths attempted:</b></p>" +
            "<ul>" + attempted + "</ul>" +
            "<p>Please verify that <code>enableStaticServing = true</code> is active and the Hugging Face Space has fully rebuilt.</p>" +
            "</div>";
    }});
    </script>
    </body>
    </html>
    """
    st.components.v1.html(html_content, height=620, scrolling=False)
# Page-level Streamlit settings, followed by the title and a short
# description of what the app produces.
page_settings = {
    "page_title": "In Silico Controls Generator",
    "page_icon": "π§¬",
    "layout": "wide",
}
st.set_page_config(**page_settings)

st.title("𧬠In Silico Controls Generator")

intro_text = "".join([
    "Generate synthetic BAM + VCF files with realistic variants ",
    "derived from your probe panel and MANE exon annotations.",
])
st.caption(intro_text)
# -- Sidebar: parameters -------------------------------------------------------
# Read the uploader's value from session state (same key as the file uploader
# created later in the script) so the debug panel can report on it.
uploaded_bed = st.session_state.get("uploaded_bed_file")

with st.sidebar:
    st.header("Pipeline Parameters")

    st.subheader("Sequencing Parameters")
    depth = st.number_input(
        "Target read depth per variant",
        min_value=1, max_value=10000, value=100, step=10,
    )
    vaf = st.slider(
        "Variant allele frequency (VAF)",
        min_value=0.01, max_value=1.0, value=0.20, step=0.01, format="%.2f",
    )
    read_length = st.number_input(
        "Read length (bp)",
        min_value=50, max_value=300, value=150, step=10,
    )

    st.subheader("Sequencing Technology")
    hybrid_capture_label = "Hybrid Capture (Staggered reads)"
    seq_mode = st.radio(
        "Sequencing assay style",
        options=[hybrid_capture_label, "PCR Amplicon (Identical start/ends)"],
        index=0,
        help="Hybrid Capture simulates sheared fragments with staggered read start/end coordinates. PCR Amplicon simulates amplicon sequencing where all reads start and end exactly at the probe/target coordinate boundaries."
    )
    if seq_mode != hybrid_capture_label:
        # Amplicon mode exposes no insert-size controls; fixed values are used.
        insert_size = 250
        insert_std = 0
    else:
        st.subheader("Fragment Insert Size")
        insert_size = st.number_input(
            "Mean insert size (bp)",
            min_value=100, max_value=1000, value=379, step=10,
        )
        insert_std = st.number_input(
            "Insert size std dev (bp)",
            min_value=0, max_value=200, value=20, step=5,
        )

    st.subheader("Indel Parameters")
    indel_interval = st.number_input(
        "Indel interval (0 = SNVs only)",
        min_value=0, max_value=100, value=10, step=1,
        help="Make every Nth variant an indel. Set to 0 to generate only SNVs.",
    )

    st.divider()
    st.subheader("Reference Genome")
    genome_version = st.selectbox(
        "Genome assembly",
        options=["hg38", "hg19"],
        index=0,
        help="Choose the reference genome version (hg38 or hg19)."
    )
    ref_mode = st.radio(
        "FASTA source",
        options=["Use cached / download", "Custom path"],
        help="Downloads and caches the selected assembly, or lets you point to a custom local path.",
    )
    custom_ref_path = ""
    if ref_mode == "Custom path":
        custom_ref_path = st.text_input(
            f"Path to {genome_version}.fa",
            placeholder=f"/data/references/{genome_version}.fa",
            help="Must be an indexed FASTA (.fa + .fa.fai).",
        )

    st.divider()
    st.subheader("Targeting Mode")
    mane_mode_label = "MANE Transcript Exons/Introns"
    direct_mode_label = "Direct Probe Coordinates"
    if genome_version != "hg19":
        target_mode = st.radio(
            "Variant targeting logic",
            options=[mane_mode_label, direct_mode_label],
            help="MANE Transcripts Mode places variants in coding exons and flanking introns of protein-coding genes. Direct Probe Mode places a single variant inside each probe coordinate itself, completely ignoring gene annotations."
        )
    else:
        # hg19 offers no choice: the mode is forced and the user is told why.
        st.info("βΉοΈ MANE transcript annotations are hg38-only. Targeting mode is set to Direct Probe Coordinates for hg19.")
        target_mode = direct_mode_label

    if target_mode != mane_mode_label:
        include_cds = include_intron = include_offtarget = False
        st.subheader("Direct Probe Settings")
        window_strategy_label = "One variant per N bp window"
        direct_variant_strategy = st.radio(
            "Variant placement strategy",
            options=[window_strategy_label, "Single random variant per probe"],
            index=0,
            help="Choose whether to generate one variant per N bp window across the probe coordinates or a single random variant per probe."
        )
        # A window size of 0 is what the single-variant strategy passes on.
        direct_window_size = 0
        if direct_variant_strategy == window_strategy_label:
            direct_window_size = st.number_input(
                "Window size (bp)",
                min_value=1, max_value=1000, value=10, step=1,
                help="Place one variant randomly inside each non-overlapping window of this size."
            )
    else:
        st.subheader("Variant Locations")
        include_cds = st.checkbox(
            "Generate CDS variants",
            value=True,
            help="Place variants in the coding sequence (CDS) of MANE exons.",
        )
        include_intron = st.checkbox(
            "Generate flanking intronic variants",
            value=True,
            help="Place variants in the flanking introns of MANE exons.",
        )
        include_offtarget = st.checkbox(
            "Generate off-target (unused probe) variants",
            value=True,
            help="Place variants in the midpoint of probes with no MANE exon coverage.",
        )
        direct_window_size = 0

    st.subheader("Read Group")
    rg_id = st.text_input("Read Group ID", value="CPDV2510843-SEQ-251103")
    rg_sm = st.text_input("Sample Name", value="CPDV2510843-SEQ-251103")

    st.divider()
    st.subheader("π οΈ Debug Info")
    st.caption("Helpful diagnostics for troubleshooting deployment status.")
    st.write("Streamlit Version:", st.__version__)
    st.write("File Uploaded:", uploaded_bed is not None)
    if uploaded_bed:
        st.write("Filename:", uploaded_bed.name)
    probes_in_state = "probes_df" in st.session_state
    st.write("probes_df in state:", probes_in_state)
    if probes_in_state:
        st.write("probes_df count:", len(st.session_state["probes_df"]))
    import os
    st.write("CWD:", os.getcwd())
    st.write("Script Path:", __file__)
    static_candidates = ("static", "src/static")
    # Report existence of both candidates first, then list the ones that exist.
    for candidate in static_candidates:
        st.write(f"{candidate}/ exists:", os.path.exists(candidate))
    for candidate in static_candidates:
        if os.path.exists(candidate):
            st.write(f"{candidate}/ folders:", os.listdir(candidate)[:5])
# -- Main area -----------------------------------------------------------------
# Left column: the probes BED uploader. Right column: which cached resources
# are already present on disk.
col_upload, col_info = st.columns([2, 1])

with col_upload:
    st.header("1 Β· Upload Probes BED")
    uploaded_bed = st.file_uploader(
        "Upload your probes BED file",
        type=["bed"],
        key="uploaded_bed_file",
        help="Standard BED3+ format (chrom, start, end, ...)",
    )

with col_info:
    st.header("Cache Status")
    mane_cached = pl.MANE_BED12.exists()
    # Only the FASTA/index pair of the selected assembly is checked.
    if genome_version == "hg38":
        ref_cached = pl.HG38_FA.exists() and pl.HG38_FAI.exists()
    else:
        ref_cached = pl.HG19_FA.exists() and pl.HG19_FAI.exists()
    bigbed_cached = pl.BIGBEDTOBED_PATH.exists()

    cache_rows = (
        (bigbed_cached, "bigBedToBed"),
        (mane_cached, "MANE annotation"),
        (ref_cached, f"{genome_version} reference"),
    )
    for is_cached, resource_name in cache_rows:
        st.markdown(f"{'β ' if is_cached else 'β¬'} {resource_name}")

    if ref_mode == "Use cached / download" and not ref_cached:
        st.warning(f"{genome_version} not cached. First run will download and index the assembly, which may take 5β10 minutes.")
# -- Step 1.5: Customize Probes -------------------------------------------------
# Parse a newly uploaded BED into a DataFrame held in session state, then let
# the user choose which probes are passed on to the pipeline.
if uploaded_bed:
    needs_parse = (
        "probes_df" not in st.session_state
        or st.session_state.get("uploaded_file_name") != uploaded_bed.name
    )
    if needs_parse:
        import pandas as pd
        import io

        # Drop the table of any previously uploaded file first. Otherwise a
        # new upload that is empty or fails to parse would leave the old
        # probes in place, labelled with the new file's name.
        st.session_state.pop("probes_df", None)
        st.session_state.pop("uploaded_file_name", None)
        try:
            uploaded_bed.seek(0)
            content = uploaded_bed.read().decode("utf-8", errors="ignore")
            # Strip blank lines, comments and BED header lines
            # ("track ..." / "browser ...").
            lines = [
                line
                for line in content.splitlines()
                if line.strip() and not line.startswith(("#", "track", "browser"))
            ]
            if lines:
                df = pd.read_csv(io.StringIO("\n".join(lines)), sep="\t", header=None)
                cols = ["chrom", "start", "end"]
                if len(df.columns) > 3:
                    cols += [f"col_{i}" for i in range(3, len(df.columns))]
                df.columns = cols[:len(df.columns)]
                df.insert(0, "Select", True)
                st.session_state["probes_df"] = df
                st.session_state["uploaded_file_name"] = uploaded_bed.name
                # A new file invalidates any previous random-sampling state.
                st.session_state.pop("sample_seed", None)
                st.session_state.pop("prev_frac", None)
            else:
                st.error("Uploaded BED file appears to be empty or contains only comments.")
        except Exception as e:
            st.error(f"Error parsing BED file: {e}")

    if "probes_df" in st.session_state:
        df = st.session_state["probes_df"]
        st.header("1.5 Β· Customize Probes")
        st.caption(f"Loaded {len(df):,} probes from {uploaded_bed.name}. Customize which regions will be processed below.")

        col_mode, col_rand = st.columns([1, 1])
        with col_mode:
            subset_mode = st.radio(
                "Selection mode",
                options=["All Probes", "Manual Selection (below)", "Random Sampling"],
                index=0,
                help="Choose whether to run all probes, manually check/uncheck probes in the list, or select a random fraction of the probes."
            )
        with col_rand:
            if subset_mode == "Random Sampling":
                sample_frac = st.slider("Fraction of probes to keep", min_value=0.01, max_value=1.00, value=0.10, step=0.01)
                resample_btn = st.button("π² Resample")
                # The seed is kept in session state so the same sample is
                # drawn on every rerun until the user resamples or changes
                # the fraction.
                if "sample_seed" not in st.session_state or resample_btn or st.session_state.get("prev_frac") != sample_frac:
                    import random
                    st.session_state["sample_seed"] = random.randint(0, 100000)
                    st.session_state["prev_frac"] = sample_frac
                sampled_df = df.sample(frac=sample_frac, random_state=st.session_state["sample_seed"])
                df["Select"] = df.index.isin(sampled_df.index)
            elif subset_mode == "All Probes":
                df["Select"] = True

        # Render table editor; only the "Select" column is editable.
        st.markdown("#### π Probes List")
        st.caption("Double-click a cell to search, or check/uncheck boxes to filter targets.")
        edited_df = st.data_editor(
            df,
            use_container_width=True,
            hide_index=True,
            disabled=[col for col in df.columns if col != "Select"],
            column_config={
                "Select": st.column_config.CheckboxColumn(
                    "Select",
                    help="Uncheck to exclude this region from variant generation",
                    default=True
                )
            }
        )
        st.session_state["probes_df"] = edited_df
        total_selected = int(edited_df["Select"].eq(True).sum())
        st.info(f"Selected {total_selected:,} of {len(df):,} probes ({total_selected/len(df)*100:.1f}%) for variant generation.")
        st.divider()
# -- Step 2: Run pipeline ------------------------------------------------------
st.header("2 Β· Run Pipeline")

# Without an uploaded BED there is nothing to run; halt the script here.
if not uploaded_bed:
    st.info("Upload a probes BED file to enable the pipeline.")
    st.stop()

run_btn = st.button("βΆ Run Pipeline", type="primary", use_container_width=True)

# A new run discards the stored results and log of the previous one.
if run_btn:
    for stale_key in ("results", "log_lines"):
        st.session_state.pop(stale_key, None)
# -- Execute pipeline ------------------------------------------------------------
# Runs when the button was pressed in this script run: validate the reference,
# write the selected probes to a temporary working directory, run the pipeline
# steps, copy the outputs into the static directories for igv.js, and record
# the output paths in session state.
if run_btn:
    if ref_mode == "Use cached / download":
        fasta_path = pl.HG38_FA if genome_version == "hg38" else pl.HG19_FA
    else:
        fasta_path = Path(custom_ref_path)

    if ref_mode == "Custom path":
        if not custom_ref_path:
            # Name the assembly that is actually selected, not always hg38.
            st.error(f"Please provide a path to your {genome_version}.fa file.")
            st.stop()
        if not fasta_path.exists():
            st.error(f"FASTA file not found: {fasta_path}")
            st.stop()
        fai = Path(str(fasta_path) + ".fai")
        if not fai.exists():
            st.warning("No .fai index found. Attempting to index with samtools faidx...")
            # Argument list instead of a shell string: the path is typed by
            # the user and may contain spaces or shell metacharacters.
            try:
                faidx = subprocess.run(
                    ["samtools", "faidx", str(fasta_path)],
                    capture_output=True,
                    text=True,
                )
            except OSError as exc:
                # Still best-effort: report the problem and carry on.
                st.warning(f"Could not run samtools faidx: {exc}")
            else:
                if faidx.returncode != 0:
                    st.warning(
                        f"samtools faidx failed (exit code {faidx.returncode}): "
                        f"{faidx.stderr.strip()}"
                    )

    # Filter for selected probes
    if "probes_df" in st.session_state:
        df = st.session_state["probes_df"]
        selected_df = df[df["Select"] == True]
    else:
        st.error("No probe data found in session state.")
        st.stop()
    if len(selected_df) == 0:
        st.error("No probes selected! Please select at least one probe in Step 1.5.")
        st.stop()

    # Convert back to BED format (tab-separated, without the 'Select' column)
    bed_cols = [col for col in selected_df.columns if col != "Select"]
    bed_text = selected_df[bed_cols].to_csv(sep="\t", header=False, index=False)
    work_dir = Path(tempfile.mkdtemp(prefix="insilicocontrols_"))
    probes_bed = work_dir / "probes.bed"
    probes_bed.write_text(bed_text)

    log_expander = st.expander("Pipeline log", expanded=True)
    log_area = log_expander.empty()
    log_lines = []

    def append_log(msg):
        """Record a log message and redraw the log panel with the last 80 lines."""
        log_lines.append(str(msg))
        log_area.code("\n".join(log_lines[-80:]), language=None)

    progress_bar = st.progress(0.0, text="Starting...")

    def update_progress(fraction, label=""):
        """Move the progress bar to ``fraction`` (capped at 1.0) with ``label``."""
        progress_bar.progress(min(fraction, 1.0), text=label)

    try:
        # Pre-ensure reference genome
        if ref_mode == "Use cached / download":
            update_progress(0.08, f"Ensuring {genome_version} reference...")
            append_log(f"\n=== {genome_version} Reference ===")
            pl.ensure_reference(genome_version=genome_version, log_func=append_log)

        if target_mode == "MANE Transcript Exons/Introns":
            update_progress(0.02, "Setting up tools...")
            append_log("=== Setting up tools ===")
            pl.ensure_bigbedtobed(append_log)

            update_progress(0.05, "Ensuring MANE annotation...")
            append_log("\n=== MANE Annotation ===")
            pl.ensure_mane(append_log)

            update_progress(0.15, "Parsing MANE exons...")
            append_log("\n=== Parsing MANE Exons ===")
            exons_bed = pl.parse_mane_exons(work_dir, append_log)

            update_progress(0.25, "Analyzing probe coverage...")
            append_log("\n=== Coverage Analysis ===")
            stats, fully_bed, partial_bed, unused_bed = pl.analyze_coverage(
                work_dir, probes_bed, exons_bed, append_log
            )
            append_log("\n============================================")
            append_log(" COVERAGE SUMMARY ")
            append_log("============================================")
            append_log(f"Exons with >95% coverage (USED): {stats['fully_covered']}")
            append_log(f"Exons with partial coverage (USED): {stats['partially_covered']}")
            append_log(f"Probes with no exon coverage: {stats['probes_no_exons']}")
            append_log(f"Unused contiguous probes (ADDED): {stats['unused_probes']}")
            append_log("============================================")

            # Subset MANE transcripts intersecting with target probes
            merged_probes = work_dir / "merged_probes.bed"
            mane_transcripts_bed = work_dir / "mane_transcripts.bed"
            update_progress(0.30, "Subsetting MANE transcripts...")
            append_log("\n=== Subsetting MANE Transcripts ===")
            pl.run_cmd(f"bedtools intersect -a {pl.MANE_BED12} -b {merged_probes} -wa -u > {mane_transcripts_bed}", append_log)

            update_progress(0.35, "Generating target SNVs...")
            append_log("\n=== Generating Target SNVs ===")
            snvs_bed, total_snvs = pl.generate_target_snvs(
                work_dir=work_dir,
                fully_bed=fully_bed,
                partial_bed=partial_bed,
                unused_bed=unused_bed,
                include_cds=include_cds,
                include_intron=include_intron,
                include_offtarget=include_offtarget,
                mode="mane",
                log_func=append_log
            )
        else:  # Direct Probe Coordinates mode
            # No coverage analysis in this mode: zeroed stats, no exon BEDs.
            stats = {
                "fully_covered": 0,
                "partially_covered": 0,
                "probes_no_exons": 0,
                "unused_probes": 0,
            }
            fully_bed = None
            partial_bed = None
            unused_bed = None
            mane_transcripts_bed = None

            update_progress(0.35, "Generating target SNVs...")
            append_log("\n=== Generating Target SNVs (Direct BED Mode) ===")
            snvs_bed, total_snvs = pl.generate_target_snvs(
                work_dir=work_dir,
                fully_bed=None,
                partial_bed=None,
                unused_bed=None,
                mode="direct_bed",
                probes_bed=probes_bed,
                direct_window_size=direct_window_size,
                log_func=append_log
            )

        append_log("\n============================================")
        append_log(" VARIANT SUMMARY ")
        append_log("============================================")
        append_log(f"Total SNVs generated for BAM: {total_snvs}")
        append_log("============================================")

        update_progress(0.40, "Generating synthetic BAM...")
        append_log("\n=== Generating Synthetic BAM ===")

        def bam_progress(fraction, label):
            """Map BAM-generation progress (0..1) onto the 40%-95% span of the bar."""
            update_progress(0.40 + fraction * 0.55, label)

        sorted_bam, output_vcf = pl.generate_synthetic_bam(
            work_dir=work_dir,
            snvs_bed=snvs_bed,
            fasta_path=fasta_path,
            depth=depth,
            vaf=vaf,
            rg_id=rg_id,
            rg_sm=rg_sm,
            insert_size=insert_size,
            insert_std=insert_std,
            indel_interval=indel_interval,
            read_length=read_length,
            sequencing_mode="pcr_amplicon" if seq_mode.startswith("PCR Amplicon") else "hybrid_capture",
            log_func=append_log,
            progress_func=bam_progress,
        )
        update_progress(1.0, "Done!")
        append_log("\nβ Pipeline complete.")

        bai_path = Path(str(sorted_bam) + ".bai")
        vcf_path = Path(output_vcf)
        igv_bed_path = Path(snvs_bed)
        fully_bed_path = Path(fully_bed) if fully_bed else None

        # Copy to static directories for IGV.js visualization (both root and
        # script-relative). When both resolve to the same directory, copy once.
        work_dir_name = work_dir.name
        static_roots = [Path("static"), Path(__file__).parent / "static"]
        static_dests = list(dict.fromkeys(
            (root / work_dir_name).resolve() for root in static_roots
        ))
        for dest in static_dests:
            dest.mkdir(parents=True, exist_ok=True)
            shutil.copy(sorted_bam, dest / "synthetic.sorted.bam")
            if bai_path.exists():
                shutil.copy(bai_path, dest / "synthetic.sorted.bam.bai")
            shutil.copy(vcf_path, dest / "synthetic.vcf")
            shutil.copy(igv_bed_path, dest / "igv_variant_navigator.bed")
            if fully_bed_path and fully_bed_path.exists():
                shutil.copy(fully_bed_path, dest / "fully_covered_exons.bed")
            if mane_transcripts_bed and mane_transcripts_bed.exists():
                shutil.copy(mane_transcripts_bed, dest / "mane_transcripts.bed")

        # Store paths only -- never load large files into session_state memory
        st.session_state["results"] = {
            "stats": stats,
            "total_snvs": total_snvs,
            "bam_path": str(sorted_bam),
            "bai_path": str(bai_path) if bai_path.exists() else None,
            "vcf_path": str(vcf_path),
            "igv_bed_path": str(igv_bed_path),
            "fully_covered_bed_path": str(fully_bed_path) if fully_bed_path else None,
            "mane_transcripts_bed_path": str(mane_transcripts_bed) if mane_transcripts_bed else None,
            "work_dir_name": work_dir_name,
            "genome_version": genome_version,
        }
        st.session_state["log_lines"] = log_lines[:]
    except Exception as e:
        # Surface the failure in the UI and the log, then re-raise so the
        # traceback is not lost.
        st.error(f"Pipeline failed: {e}")
        append_log(f"\nβ ERROR: {e}")
        raise
# -- Results section (persists across reruns via session_state) -------------------
def _render_download_cell(column, heading, path, button_label, file_name, mime):
    """Draw one download cell: a bold heading, plus a button when the file exists.

    ``path`` may be None (output not produced for this run); then only the
    heading is shown.
    """
    with column:
        st.markdown(heading)
        if path and path.exists():
            with open(path, "rb") as handle:
                st.download_button(
                    button_label,
                    data=handle,
                    file_name=file_name,
                    mime=mime,
                    use_container_width=True,
                )


if "results" in st.session_state:
    res = st.session_state["results"]
    stats = res["stats"]
    total_snvs = res["total_snvs"]

    st.success("Pipeline completed successfully!")

    # Show log if available and pipeline didn't just run
    if "log_lines" in st.session_state and not run_btn:
        with st.expander("Pipeline log", expanded=False):
            recent_log = st.session_state["log_lines"][-80:]
            st.code("\n".join(recent_log), language=None)

    st.header("3 Β· Results")
    metric_cols = st.columns(4)
    metric_cols[0].metric("Fully Covered Exons", f"{stats['fully_covered']:,}")
    metric_cols[1].metric("Partially Covered Exons", f"{stats['partially_covered']:,}")
    metric_cols[2].metric("Off-target Probes", f"{stats['probes_no_exons']:,}")
    metric_cols[3].metric("Total SNVs Generated", f"{total_snvs:,}")

    st.header("π Interactive Variant Browser")
    st.caption("Inspect the generated synthetic alignments and mutations directly in the browser. Click on a variant in the navigator panel to jump to its locus.")
    render_igv(res)

    st.header("4 Β· Download Outputs")
    dl1, dl2, dl3 = st.columns(3)

    # Optional outputs are stored as None in the results dict when absent.
    bam_path = Path(res["bam_path"])
    bai_path = Path(res["bai_path"]) if res["bai_path"] else None
    vcf_path = Path(res["vcf_path"])
    igv_bed_path = Path(res["igv_bed_path"])
    fully_bed_path = Path(res["fully_covered_bed_path"]) if res.get("fully_covered_bed_path") else None

    first_row = (
        (dl1, "**Synthetic BAM**", bam_path,
         "β¬ Download BAM", "synthetic.sorted.bam", "application/octet-stream"),
        (dl2, "**BAM Index (.bai)**", bai_path,
         "β¬ Download BAI", "synthetic.sorted.bam.bai", "application/octet-stream"),
        (dl3, "**Synthetic VCF**", vcf_path,
         "β¬ Download VCF", "synthetic.vcf", "text/plain"),
    )
    for cell in first_row:
        _render_download_cell(*cell)

    dl4, dl5, dl6 = st.columns(3)
    mane_transcripts_bed_path = Path(res["mane_transcripts_bed_path"]) if res.get("mane_transcripts_bed_path") else None

    second_row = (
        (dl4, "**IGV Variant Navigator BED**", igv_bed_path,
         "β¬ Download IGV BED", "igv_variant_navigator.bed", "text/plain"),
        (dl5, "**Fully Covered Exons BED**", fully_bed_path,
         "β¬ Download Fully Covered Exons", "fully_covered_exons.bed", "text/plain"),
        (dl6, "**MANE Transcripts BED12**", mane_transcripts_bed_path,
         "β¬ Download MANE Transcripts", "mane_transcripts.bed", "text/plain"),
    )
    for cell in second_row:
        _render_download_cell(*cell)
# -- Footer ------------------------------------------------------------------------
st.divider()

# One-paragraph summary of the pipeline, shown beneath everything else.
how_it_works = "".join([
    "**How it works:** Your probe BED is intersected with MANE CDS exons. ",
    "For each covered exon, synthetic SNVs are placed in the CDS and flanking ",
    "intronic positions. For unused probes, a variant is placed at the midpoint. ",
    "Paired-end reads are generated at the target depth and VAF, then written to ",
    "a sorted, indexed BAM alongside a matching VCF.",
])
st.caption(how_it_works)