# Streamlit front end for the In Silico Controls Generator.
#
# The script collects pipeline parameters in the sidebar, lets the user upload
# and subset a probes BED file, runs the `pipeline` module to produce a
# synthetic BAM + VCF, and then shows the results (metrics, an IGV.js browser
# component and download buttons). Results are kept in st.session_state so
# they survive Streamlit reruns.

import io
import json
import os
import random
import shutil
import subprocess
import tempfile
from pathlib import Path

import pandas as pd
import streamlit as st

import pipeline as pl


def _read_igv_variants(bed_path):
    """Parse the variant navigator BED into a list of IGV.js locus records.

    Comment lines, blank lines, lines with fewer than four tab-separated
    columns and lines whose start/end are not integers are skipped. A missing
    file yields an empty list.
    """
    variants = []
    if not bed_path.exists():
        return variants
    with open(bed_path, "r") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) < 4:
                continue
            chrom, label = parts[0], parts[3]
            try:
                start = int(parts[1])
                end = int(parts[2])
            except ValueError:
                # A malformed coordinate must not take down the results page.
                continue
            variants.append({
                # 50 bp of padding on each side, clamped so the locus never
                # starts below position 1.
                "locus": f"{chrom}:{max(1, start - 50)}-{end + 50}",
                "name": label.split("_")[-1],
                "label": label,
                "pos_label": f"{chrom}:{start + 1}",
            })
    return variants


def _parse_probes_bed(content):
    """Turn BED text into a probes DataFrame with a leading ``Select`` column.

    Blank lines and lines starting with ``#`` or ``track`` are dropped. The
    first three columns are named chrom/start/end and any further columns
    ``col_<index>``. Returns ``None`` when no data lines remain.
    """
    lines = [
        line
        for line in content.splitlines()
        if line.strip() and not line.startswith(("#", "track"))
    ]
    if not lines:
        return None
    df = pd.read_csv(io.StringIO("\n".join(lines)), sep="\t", header=None)
    cols = ["chrom", "start", "end"]
    cols += [f"col_{i}" for i in range(3, len(df.columns))]
    # Slice so files with fewer than three columns still get valid names.
    df.columns = cols[:len(df.columns)]
    df.insert(0, "Select", True)
    return df


def _index_fasta(fasta_path):
    """Run ``samtools faidx`` on *fasta_path* without going through a shell.

    Returns ``None`` on success, otherwise a short description of the failure
    (samtools missing, non-zero exit status, ...).
    """
    try:
        # The path is typed in by the user, so pass it as a single argv entry
        # instead of interpolating it into a shell command line.
        proc = subprocess.run(
            ["samtools", "faidx", str(fasta_path)],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        return str(exc)
    if proc.returncode != 0:
        return proc.stderr.strip() or f"samtools exited with status {proc.returncode}"
    return None


def _download_column(column, title, path, button_label, file_name, mime):
    """Render a titled download button in *column* if *path* exists on disk."""
    with column:
        st.markdown(title)
        if path is not None and path.exists():
            with open(path, "rb") as f:
                st.download_button(
                    button_label,
                    data=f,
                    file_name=file_name,
                    mime=mime,
                    use_container_width=True,
                )


def render_igv(res):
    """Render the embedded variant browser component for a finished run.

    *res* is the results dict stored in ``st.session_state["results"]``. It
    must contain ``igv_bed_path`` and may contain ``work_dir_name``,
    ``genome_version``, ``fully_covered_bed_path`` and
    ``mane_transcripts_bed_path``. Without ``work_dir_name`` a warning is
    shown and nothing is rendered.
    """
    work_dir_name = res.get("work_dir_name")
    genome_version = res.get("genome_version", "hg38")
    if not work_dir_name:
        st.warning("No static files path found for IGV.js. Restart the pipeline to generate.")
        return

    variants = _read_igv_variants(Path(res["igv_bed_path"]))
    variants_json = json.dumps(variants)

    # URLs of the per-run copies placed under the static/ directory.
    bam_url = f"/app/static/{work_dir_name}/synthetic.sorted.bam"
    bai_url = f"/app/static/{work_dir_name}/synthetic.sorted.bam.bai"
    vcf_url = f"/app/static/{work_dir_name}/synthetic.vcf"
    navigator_url = f"/app/static/{work_dir_name}/igv_variant_navigator.bed"
    probes_url = f"/app/static/{work_dir_name}/fully_covered_exons.bed"
    mane_url = f"/app/static/{work_dir_name}/mane_transcripts.bed"

    probes_bed_exists = (
        res.get("fully_covered_bed_path") is not None
        and Path(res["fully_covered_bed_path"]).exists()
    )
    mane_transcripts_exists = (
        res.get("mane_transcripts_bed_path") is not None
        and Path(res["mane_transcripts_bed_path"]).exists()
    )

    tracks = [
        {"name": "Reference", "type": "sequence", "order": 1},
    ]
    # The two optional annotation tracks are only offered when their source
    # files were produced by the run.
    if mane_transcripts_exists:
        tracks.append({
            "name": "MANE Transcripts",
            "type": "annotation",
            "format": "bed",
            "url": mane_url,
            "indexed": False,
            "order": 1.5,
            "color": "green",
            "displayMode": "EXPANDED",
        })
    if probes_bed_exists:
        tracks.append({
            "name": "Probes BED",
            "type": "annotation",
            "format": "bed",
            "url": probes_url,
            "indexed": False,
            "order": 2,
            "color": "blue",
        })
    tracks.extend([
        {
            "name": "Variant Navigator BED",
            "type": "annotation",
            "format": "bed",
            "url": navigator_url,
            "indexed": False,
            "order": 3,
            "color": "red",
        },
        {
            "name": "Synthetic VCF",
            "type": "variant",
            "format": "vcf",
            "url": vcf_url,
            "indexed": False,
            "order": 4,
        },
        {
            "name": "Synthetic BAM",
            "type": "alignment",
            "format": "bam",
            "url": bam_url,
            "indexURL": bai_url,
            "order": 5,
            "height": 300,
        },
    ])
    tracks_json = json.dumps(tracks)

    # NOTE(review): the template body below is empty in this copy of the file,
    # so genome_version, variants_json and tracks_json are computed but never
    # referenced. Presumably the IGV.js markup that interpolates them belongs
    # inside this literal - confirm against the deployed version.
    html_content = f"""

"""
    st.components.v1.html(html_content, height=620, scrolling=False)


st.set_page_config(
    page_title="In Silico Controls Generator",
    page_icon="🧬",
    layout="wide",
)

st.title("🧬 In Silico Controls Generator")
st.caption(
    "Generate synthetic BAM + VCF files with realistic variants "
    "derived from your probe panel and MANE exon annotations."
)

# ── Sidebar: parameters ──────────────────────────────────────────────────────
# The uploader widget is created further down; read its previous value from
# session state so the sidebar debug panel can report on it.
uploaded_bed = st.session_state.get("uploaded_bed_file")

with st.sidebar:
    st.header("Pipeline Parameters")

    st.subheader("Sequencing Parameters")
    depth = st.number_input("Target read depth per variant", min_value=1, max_value=10000, value=100, step=10)
    vaf = st.slider("Variant allele frequency (VAF)", min_value=0.01, max_value=1.0, value=0.20, step=0.01, format="%.2f")
    read_length = st.number_input("Read length (bp)", min_value=50, max_value=300, value=150, step=10)

    st.subheader("Sequencing Technology")
    seq_mode = st.radio(
        "Sequencing assay style",
        options=["Hybrid Capture (Staggered reads)", "PCR Amplicon (Identical start/ends)"],
        index=0,
        help="Hybrid Capture simulates sheared fragments with staggered read start/end coordinates. PCR Amplicon simulates amplicon sequencing where all reads start and end exactly at the probe/target coordinate boundaries."
    )

    if seq_mode == "Hybrid Capture (Staggered reads)":
        st.subheader("Fragment Insert Size")
        insert_size = st.number_input("Mean insert size (bp)", min_value=100, max_value=1000, value=379, step=10)
        insert_std = st.number_input("Insert size std dev (bp)", min_value=0, max_value=200, value=20, step=5)
    else:
        # Amplicon mode exposes no insert-size controls; fixed values are used.
        insert_size = 250
        insert_std = 0

    st.subheader("Indel Parameters")
    indel_interval = st.number_input(
        "Indel interval (0 = SNVs only)",
        min_value=0,
        max_value=100,
        value=10,
        step=1,
        help="Make every Nth variant an indel. Set to 0 to generate only SNVs.",
    )

    st.divider()
    st.subheader("Reference Genome")
    genome_version = st.selectbox(
        "Genome assembly",
        options=["hg38", "hg19"],
        index=0,
        help="Choose the reference genome version (hg38 or hg19)."
    )
    ref_mode = st.radio(
        "FASTA source",
        options=["Use cached / download", "Custom path"],
        help="Downloads and caches the selected assembly, or lets you point to a custom local path.",
    )
    custom_ref_path = ""
    if ref_mode == "Custom path":
        custom_ref_path = st.text_input(
            f"Path to {genome_version}.fa",
            placeholder=f"/data/references/{genome_version}.fa",
            help="Must be an indexed FASTA (.fa + .fa.fai).",
        )

    st.divider()
    st.subheader("Targeting Mode")
    if genome_version == "hg19":
        st.info("ℹ️ MANE transcript annotations are hg38-only. Targeting mode is set to Direct Probe Coordinates for hg19.")
        target_mode = "Direct Probe Coordinates"
    else:
        target_mode = st.radio(
            "Variant targeting logic",
            options=["MANE Transcript Exons/Introns", "Direct Probe Coordinates"],
            help="MANE Transcripts Mode places variants in coding exons and flanking introns of protein-coding genes. Direct Probe Mode places a single variant inside each probe coordinate itself, completely ignoring gene annotations."
        )

    if target_mode == "MANE Transcript Exons/Introns":
        st.subheader("Variant Locations")
        include_cds = st.checkbox("Generate CDS variants", value=True, help="Place variants in the coding sequence (CDS) of MANE exons.")
        include_intron = st.checkbox("Generate flanking intronic variants", value=True, help="Place variants in the flanking introns of MANE exons.")
        include_offtarget = st.checkbox("Generate off-target (unused probe) variants", value=True, help="Place variants in the midpoint of probes with no MANE exon coverage.")
        direct_window_size = 0
    else:
        include_cds = False
        include_intron = False
        include_offtarget = False
        st.subheader("Direct Probe Settings")
        direct_variant_strategy = st.radio(
            "Variant placement strategy",
            options=["One variant per N bp window", "Single random variant per probe"],
            index=0,
            help="Choose whether to generate one variant per N bp window across the probe coordinates or a single random variant per probe."
        )
        if direct_variant_strategy == "One variant per N bp window":
            direct_window_size = st.number_input(
                "Window size (bp)",
                min_value=1,
                max_value=1000,
                value=10,
                step=1,
                help="Place one variant randomly inside each non-overlapping window of this size."
            )
        else:
            # A window size of 0 is what the single-variant strategy passes on.
            direct_window_size = 0

    st.subheader("Read Group")
    rg_id = st.text_input("Read Group ID", value="CPDV2510843-SEQ-251103")
    rg_sm = st.text_input("Sample Name", value="CPDV2510843-SEQ-251103")

    st.divider()
    st.subheader("🛠️ Debug Info")
    st.caption("Helpful diagnostics for troubleshooting deployment status.")
    st.write("Streamlit Version:", st.__version__)
    st.write("File Uploaded:", uploaded_bed is not None)
    if uploaded_bed:
        st.write("Filename:", uploaded_bed.name)
    st.write("probes_df in state:", "probes_df" in st.session_state)
    if "probes_df" in st.session_state:
        st.write("probes_df count:", len(st.session_state["probes_df"]))
    st.write("CWD:", os.getcwd())
    st.write("Script Path:", __file__)
    st.write("static/ exists:", os.path.exists("static"))
    st.write("src/static/ exists:", os.path.exists("src/static"))
    if os.path.exists("static"):
        st.write("static/ folders:", os.listdir("static")[:5])
    if os.path.exists("src/static"):
        st.write("src/static/ folders:", os.listdir("src/static")[:5])

# ── Main area ────────────────────────────────────────────────────────────────
col_upload, col_info = st.columns([2, 1])

with col_upload:
    st.header("1 · Upload Probes BED")
    uploaded_bed = st.file_uploader(
        "Upload your probes BED file",
        type=["bed"],
        key="uploaded_bed_file",
        help="Standard BED3+ format (chrom, start, end, ...)",
    )

with col_info:
    st.header("Cache Status")
    mane_cached = pl.MANE_BED12.exists()
    if genome_version == "hg38":
        ref_cached = pl.HG38_FA.exists() and pl.HG38_FAI.exists()
    else:
        ref_cached = pl.HG19_FA.exists() and pl.HG19_FAI.exists()
    bigbed_cached = pl.BIGBEDTOBED_PATH.exists()
    st.markdown(f"{'✅' if bigbed_cached else '⬜'} bigBedToBed")
    st.markdown(f"{'✅' if mane_cached else '⬜'} MANE annotation")
    st.markdown(f"{'✅' if ref_cached else '⬜'} {genome_version} reference")
    if not ref_cached and ref_mode == "Use cached / download":
        st.warning(f"{genome_version} not cached. First run will download and index the assembly, which may take 5–10 minutes.")

# ── Step 1.5: Customize Probes ────────────────────────────────────────────────
if uploaded_bed:
    # (Re)parse only when nothing is cached yet or a different file arrived.
    needs_parse = (
        "probes_df" not in st.session_state
        or st.session_state.get("uploaded_file_name") != uploaded_bed.name
    )
    if needs_parse:
        parsed_probes = None
        try:
            uploaded_bed.seek(0)
            content = uploaded_bed.read().decode("utf-8", errors="ignore")
            parsed_probes = _parse_probes_bed(content)
            if parsed_probes is None:
                st.error("Uploaded BED file appears to be empty or contains only comments.")
        except Exception as e:
            st.error(f"Error parsing BED file: {e}")

        if parsed_probes is None:
            # Drop probes left over from a previous upload so they are not
            # displayed (and run) under the name of the file that just failed.
            st.session_state.pop("probes_df", None)
            st.session_state.pop("uploaded_file_name", None)
        else:
            st.session_state["probes_df"] = parsed_probes
            st.session_state["uploaded_file_name"] = uploaded_bed.name
            # A new file invalidates any earlier random-sampling state.
            st.session_state.pop("sample_seed", None)
            st.session_state.pop("prev_frac", None)

    if "probes_df" in st.session_state:
        df = st.session_state["probes_df"]
        st.header("1.5 · Customize Probes")
        st.caption(f"Loaded {len(df):,} probes from {uploaded_bed.name}. Customize which regions will be processed below.")

        col_mode, col_rand = st.columns([1, 1])
        with col_mode:
            subset_mode = st.radio(
                "Selection mode",
                options=["All Probes", "Manual Selection (below)", "Random Sampling"],
                index=0,
                help="Choose whether to run all probes, manually check/uncheck probes in the list, or select a random fraction of the probes."
            )
        with col_rand:
            if subset_mode == "Random Sampling":
                sample_frac = st.slider("Fraction of probes to keep", min_value=0.01, max_value=1.00, value=0.10, step=0.01)
                resample_btn = st.button("🎲 Resample")
                # Keep the seed in session state so the same sample is shown
                # across reruns until the user resamples or moves the slider.
                if (
                    "sample_seed" not in st.session_state
                    or resample_btn
                    or st.session_state.get("prev_frac") != sample_frac
                ):
                    st.session_state["sample_seed"] = random.randint(0, 100000)
                    st.session_state["prev_frac"] = sample_frac
                sampled_df = df.sample(frac=sample_frac, random_state=st.session_state["sample_seed"])
                df["Select"] = df.index.isin(sampled_df.index)
            elif subset_mode == "All Probes":
                df["Select"] = True

        # Render table editor
        st.markdown("#### 📋 Probes List")
        st.caption("Double-click a cell to search, or check/uncheck boxes to filter targets.")
        edited_df = st.data_editor(
            df,
            use_container_width=True,
            hide_index=True,
            disabled=[col for col in df.columns if col != "Select"],
            column_config={
                "Select": st.column_config.CheckboxColumn(
                    "Select",
                    help="Uncheck to exclude this region from variant generation",
                    default=True
                )
            }
        )
        st.session_state["probes_df"] = edited_df

        total_selected = int(edited_df["Select"].eq(True).sum())
        st.info(f"Selected {total_selected:,} of {len(df):,} probes ({total_selected/len(df)*100:.1f}%) for variant generation.")
        st.divider()

# ── Step 2: Run pipeline ──────────────────────────────────────────────────────
st.header("2 · Run Pipeline")

if not uploaded_bed:
    st.info("Upload a probes BED file to enable the pipeline.")
    st.stop()

run_btn = st.button("▶ Run Pipeline", type="primary", use_container_width=True)

# Clear results when a new run is requested
if run_btn:
    st.session_state.pop("results", None)
    st.session_state.pop("log_lines", None)

# ── Execute pipeline ──────────────────────────────────────────────────────────
if run_btn:
    if ref_mode == "Use cached / download":
        fasta_path = pl.HG38_FA if genome_version == "hg38" else pl.HG19_FA
    else:
        fasta_path = Path(custom_ref_path)

    if ref_mode == "Custom path":
        if not custom_ref_path:
            st.error(f"Please provide a path to your {genome_version}.fa file.")
            st.stop()
        if not fasta_path.exists():
            st.error(f"FASTA file not found: {fasta_path}")
            st.stop()
        fai = Path(str(fasta_path) + ".fai")
        if not fai.exists():
            st.warning("No .fai index found. Attempting to index with samtools faidx...")
            index_error = _index_fasta(fasta_path)
            if index_error is not None:
                # Indexing stays best-effort: report the failure and carry on.
                st.warning(f"samtools faidx failed: {index_error}")

    # Filter for selected probes
    if "probes_df" in st.session_state:
        df = st.session_state["probes_df"]
        selected_df = df[df["Select"].eq(True)]
    else:
        st.error("No probe data found in session state.")
        st.stop()

    if len(selected_df) == 0:
        st.error("No probes selected! Please select at least one probe in Step 1.5.")
        st.stop()

    # Convert back to BED format (tab-separated, without the 'Select' column)
    bed_cols = [col for col in selected_df.columns if col != "Select"]
    bed_text = selected_df[bed_cols].to_csv(sep="\t", header=False, index=False)

    work_dir = Path(tempfile.mkdtemp(prefix="insilicocontrols_"))
    probes_bed = work_dir / "probes.bed"
    probes_bed.write_text(bed_text)

    log_expander = st.expander("Pipeline log", expanded=True)
    log_area = log_expander.empty()
    log_lines = []

    def append_log(msg):
        """Record *msg* and redraw the log pane with the last 80 entries."""
        log_lines.append(str(msg))
        log_area.code("\n".join(log_lines[-80:]), language=None)

    progress_bar = st.progress(0.0, text="Starting...")

    def update_progress(fraction, label=""):
        """Move the progress bar to *fraction* (capped at 1.0) with *label*."""
        progress_bar.progress(min(fraction, 1.0), text=label)

    try:
        # Pre-ensure reference genome
        if ref_mode == "Use cached / download":
            update_progress(0.08, f"Ensuring {genome_version} reference...")
            append_log(f"\n=== {genome_version} Reference ===")
            pl.ensure_reference(genome_version=genome_version, log_func=append_log)

        if target_mode == "MANE Transcript Exons/Introns":
            update_progress(0.02, "Setting up tools...")
            append_log("=== Setting up tools ===")
            pl.ensure_bigbedtobed(append_log)

            update_progress(0.05, "Ensuring MANE annotation...")
            append_log("\n=== MANE Annotation ===")
            pl.ensure_mane(append_log)

            update_progress(0.15, "Parsing MANE exons...")
            append_log("\n=== Parsing MANE Exons ===")
            exons_bed = pl.parse_mane_exons(work_dir, append_log)

            update_progress(0.25, "Analyzing probe coverage...")
            append_log("\n=== Coverage Analysis ===")
            stats, fully_bed, partial_bed, unused_bed = pl.analyze_coverage(
                work_dir, probes_bed, exons_bed, append_log
            )

            append_log("\n============================================")
            append_log(" COVERAGE SUMMARY ")
            append_log("============================================")
            append_log(f"Exons with >95% coverage (USED): {stats['fully_covered']}")
            append_log(f"Exons with partial coverage (USED): {stats['partially_covered']}")
            append_log(f"Probes with no exon coverage: {stats['probes_no_exons']}")
            append_log(f"Unused contiguous probes (ADDED): {stats['unused_probes']}")
            append_log("============================================")

            # Subset MANE transcripts intersecting with target probes
            # NOTE(review): merged_probes.bed is not written in this file;
            # presumably pl.analyze_coverage creates it in work_dir - confirm.
            merged_probes = work_dir / "merged_probes.bed"
            mane_transcripts_bed = work_dir / "mane_transcripts.bed"
            update_progress(0.30, "Subsetting MANE transcripts...")
            append_log("\n=== Subsetting MANE Transcripts ===")
            pl.run_cmd(f"bedtools intersect -a {pl.MANE_BED12} -b {merged_probes} -wa -u > {mane_transcripts_bed}", append_log)

            update_progress(0.35, "Generating target SNVs...")
            append_log("\n=== Generating Target SNVs ===")
            snvs_bed, total_snvs = pl.generate_target_snvs(
                work_dir=work_dir,
                fully_bed=fully_bed,
                partial_bed=partial_bed,
                unused_bed=unused_bed,
                include_cds=include_cds,
                include_intron=include_intron,
                include_offtarget=include_offtarget,
                mode="mane",
                log_func=append_log
            )
        else:
            # Direct Probe Coordinates mode
            stats = {
                "fully_covered": 0,
                "partially_covered": 0,
                "probes_no_exons": 0,
                "unused_probes": 0,
            }
            fully_bed = None
            partial_bed = None
            unused_bed = None
            mane_transcripts_bed = None

            update_progress(0.35, "Generating target SNVs...")
            append_log("\n=== Generating Target SNVs (Direct BED Mode) ===")
            snvs_bed, total_snvs = pl.generate_target_snvs(
                work_dir=work_dir,
                fully_bed=None,
                partial_bed=None,
                unused_bed=None,
                mode="direct_bed",
                probes_bed=probes_bed,
                direct_window_size=direct_window_size,
                log_func=append_log
            )

        append_log("\n============================================")
        append_log(" VARIANT SUMMARY ")
        append_log("============================================")
        append_log(f"Total SNVs generated for BAM: {total_snvs}")
        append_log("============================================")

        update_progress(0.40, "Generating synthetic BAM...")
        append_log("\n=== Generating Synthetic BAM ===")

        def bam_progress(fraction, label):
            """Map BAM-generation progress onto the 40%-95% span of the bar."""
            update_progress(0.40 + fraction * 0.55, label)

        sorted_bam, output_vcf = pl.generate_synthetic_bam(
            work_dir=work_dir,
            snvs_bed=snvs_bed,
            fasta_path=fasta_path,
            depth=depth,
            vaf=vaf,
            rg_id=rg_id,
            rg_sm=rg_sm,
            insert_size=insert_size,
            insert_std=insert_std,
            indel_interval=indel_interval,
            read_length=read_length,
            sequencing_mode="pcr_amplicon" if seq_mode.startswith("PCR Amplicon") else "hybrid_capture",
            log_func=append_log,
            progress_func=bam_progress,
        )

        update_progress(1.0, "Done!")
        append_log("\n✅ Pipeline complete.")

        # Normalise whatever the pipeline returned (str or Path) to Path.
        bai_path = Path(str(sorted_bam) + ".bai")
        vcf_path = Path(output_vcf)
        igv_bed_path = Path(snvs_bed)
        fully_bed_path = Path(fully_bed) if fully_bed else None

        # Copy to static directories for IGV.js visualization (both root and script-relative)
        work_dir_name = work_dir.name
        static_dest_cwd = Path("static") / work_dir_name
        static_dest_script = Path(__file__).parent / "static" / work_dir_name
        for dest in [static_dest_cwd, static_dest_script]:
            dest.mkdir(parents=True, exist_ok=True)
            shutil.copy(sorted_bam, dest / "synthetic.sorted.bam")
            if bai_path.exists():
                shutil.copy(bai_path, dest / "synthetic.sorted.bam.bai")
            shutil.copy(vcf_path, dest / "synthetic.vcf")
            shutil.copy(igv_bed_path, dest / "igv_variant_navigator.bed")
            if fully_bed_path and fully_bed_path.exists():
                shutil.copy(fully_bed_path, dest / "fully_covered_exons.bed")
            if mane_transcripts_bed and mane_transcripts_bed.exists():
                shutil.copy(mane_transcripts_bed, dest / "mane_transcripts.bed")

        # Store paths only — never load large files into session_state memory
        st.session_state["results"] = {
            "stats": stats,
            "total_snvs": total_snvs,
            "bam_path": str(sorted_bam),
            "bai_path": str(bai_path) if bai_path.exists() else None,
            "vcf_path": str(vcf_path),
            "igv_bed_path": str(igv_bed_path),
            "fully_covered_bed_path": str(fully_bed_path) if fully_bed_path else None,
            "mane_transcripts_bed_path": str(mane_transcripts_bed) if mane_transcripts_bed else None,
            "work_dir_name": work_dir_name,
            "genome_version": genome_version,
        }
        st.session_state["log_lines"] = log_lines[:]

    except Exception as e:
        # Show the failure in the page and the log, then re-raise so the full
        # traceback is still reported.
        st.error(f"Pipeline failed: {e}")
        append_log(f"\n❌ ERROR: {e}")
        raise

# ── Results section (persists across reruns via session_state) ────────────────
if "results" in st.session_state:
    res = st.session_state["results"]
    stats = res["stats"]
    total_snvs = res["total_snvs"]

    st.success("Pipeline completed successfully!")

    # Show log if available and pipeline didn't just run
    if not run_btn and "log_lines" in st.session_state:
        with st.expander("Pipeline log", expanded=False):
            st.code("\n".join(st.session_state["log_lines"][-80:]), language=None)

    st.header("3 · Results")
    m1, m2, m3, m4 = st.columns(4)
    m1.metric("Fully Covered Exons", f"{stats['fully_covered']:,}")
    m2.metric("Partially Covered Exons", f"{stats['partially_covered']:,}")
    m3.metric("Off-target Probes", f"{stats['probes_no_exons']:,}")
    m4.metric("Total SNVs Generated", f"{total_snvs:,}")

    st.header("🔍 Interactive Variant Browser")
    st.caption("Inspect the generated synthetic alignments and mutations directly in the browser. Click on a variant in the navigator panel to jump to its locus.")
    render_igv(res)

    st.header("4 · Download Outputs")
    dl1, dl2, dl3 = st.columns(3)

    bam_path = Path(res["bam_path"])
    bai_path = Path(res["bai_path"]) if res["bai_path"] else None
    vcf_path = Path(res["vcf_path"])
    igv_bed_path = Path(res["igv_bed_path"])
    fully_bed_path = Path(res["fully_covered_bed_path"]) if res.get("fully_covered_bed_path") else None
    mane_transcripts_bed_path = Path(res["mane_transcripts_bed_path"]) if res.get("mane_transcripts_bed_path") else None

    _download_column(
        dl1, "**Synthetic BAM**", bam_path,
        "⬇ Download BAM", "synthetic.sorted.bam", "application/octet-stream",
    )
    _download_column(
        dl2, "**BAM Index (.bai)**", bai_path,
        "⬇ Download BAI", "synthetic.sorted.bam.bai", "application/octet-stream",
    )
    _download_column(
        dl3, "**Synthetic VCF**", vcf_path,
        "⬇ Download VCF", "synthetic.vcf", "text/plain",
    )

    dl4, dl5, dl6 = st.columns(3)
    _download_column(
        dl4, "**IGV Variant Navigator BED**", igv_bed_path,
        "⬇ Download IGV BED", "igv_variant_navigator.bed", "text/plain",
    )
    _download_column(
        dl5, "**Fully Covered Exons BED**", fully_bed_path,
        "⬇ Download Fully Covered Exons", "fully_covered_exons.bed", "text/plain",
    )
    _download_column(
        dl6, "**MANE Transcripts BED12**", mane_transcripts_bed_path,
        "⬇ Download MANE Transcripts", "mane_transcripts.bed", "text/plain",
    )

# ── Footer ────────────────────────────────────────────────────────────────────
st.divider()
st.caption(
    "**How it works:** Your probe BED is intersected with MANE CDS exons. "
    "For each covered exon, synthetic SNVs are placed in the CDS and flanking "
    "intronic positions. For unused probes, a variant is placed at the midpoint. "
    "Paired-end reads are generated at the target depth and VAF, then written to "
    "a sorted, indexed BAM alongside a matching VCF."
)