Spaces:

Rishabh12j
/

DepthLens

Sleeping

DepthLens / src /evaluation /bertscore_ablation.py

Rishabh Jain

Initial upload — depth-aware scene description system

5412d82 about 2 months ago

21.6 kB

	"""
	Deterministic ablation: Stage 1 (VLM only) vs Stage 2 (VLM + depth context)
	vs Stage 3 (VLM + depth context + YOLOv8n).

	All metrics are reference-free — no human-written descriptions are required.
	Ground truth is derived from the depth context preamble itself (which contains
	the exact spatial facts injected into the VLM) and from a fixed spatial
	vocabulary.

	Metrics computed
	----------------
	Spatial Term Density (STD)
	Count of directional/distance terms per 100 words. Stage 2 and Stage 3
	descriptions should contain substantially more spatial language than Stage 1.

	Preamble BERTScore (P/R/F1)
	BERTScore [Zhang et al., ICLR 2020] computed between each stage's
	description and the depth context preamble that was prepended to that
	stage's prompt. Stage 3 is scored against its OWN preamble (which
	includes per-object YOLO measurements absent from Stage 2's preamble).
	Stage 1 (no preamble) descriptions are scored against the Stage 2 preamble
	as a null baseline.

	Typical usage (library)::

	from src.evaluation.bertscore_ablation import run_ablation
	rows, summary = run_ablation(image_paths)

	CLI usage::

	python -m src.evaluation.bertscore_ablation \\
	--images data/test_images/ \\
	--output outputs/results/bertscore_ablation.csv
	"""

	import argparse
	import csv
	import re
	import sys
	import traceback
	from pathlib import Path
	from typing import Sequence

	import numpy as np
	import torch
	from bert_score import score as compute_bertscore
	from PIL import Image

	from ..config import RESULTS_DIR
	from ..pipeline import Pipeline

	# ---------------------------------------------------------------------------
	# Spatial vocabulary for Spatial Term Density (STD)
	# ---------------------------------------------------------------------------

	# Words that express a direction or relative position.
	_DIRECTION_TERMS: frozenset[str] = frozenset({
	    "left", "right", "centre", "center", "ahead", "behind",
	    "front", "back", "beside", "between",
	})
	# Words that express a distance, a distance unit, or an approximation.
	_DISTANCE_TERMS: frozenset[str] = frozenset({
	    "cm", "metre", "metres", "meter", "meters", "m",
	    "near", "nearby", "close", "far", "away", "approximately", "about",
	})
	# Union of both vocabularies; this is the set the density metric counts.
	_SPATIAL_TERMS: frozenset[str] = _DIRECTION_TERMS | _DISTANCE_TERMS

	# A word is a run of ASCII letters that may continue across an apostrophe
	# (straight or typographic). Keeping contractions whole matters because the
	# vocabulary contains the bare unit "m": splitting "I'm" into "i" + "m" would
	# register a spurious distance term and inflate the word count.
	_WORD_RE = re.compile(r"[a-z]+(?:['’][a-z]+)*")


	def _spatial_term_density(text: str) -> float:
	    """Count spatial vocabulary terms per 100 words.

	    The text is lower-cased and split into alphabetic words (digits and
	    punctuation act as separators, so "2m" contributes the word "m").
	    Contractions and possessives are kept as a single word.

	    Args:
	        text: Raw description string.

	    Returns:
	        Spatial term density (float, ≥ 0); 0.0 when the text contains no
	        alphabetic words.
	    """
	    words = _WORD_RE.findall(text.lower())
	    if not words:
	        return 0.0
	    hits = sum(word in _SPATIAL_TERMS for word in words)
	    return hits / len(words) * 100.0


	# ---------------------------------------------------------------------------
	# CSV schema
	# ---------------------------------------------------------------------------

	# Header row of the output CSV. Column order is significant: it is the
	# order csv.DictWriter emits. Columns are grouped by the stage / metric
	# family they belong to.
	_FIELDNAMES = [
	    "image",
	    # Stage 1: timing and spatial density
	    *("s1_total_s", "s1_spatial_density"),
	    # Stage 2: timing, spatial density, and uplift (s2_density - s1_density)
	    *("s2_depth_s", "s2_vlm_s", "s2_total_s"),
	    *("s2_spatial_density", "spatial_uplift"),
	    # Preamble BERTScore (Stage 2 description vs injected preamble)
	    *("preamble_P", "preamble_R", "preamble_F1"),
	    # Null baseline BERTScore (Stage 1 description vs same preamble)
	    *("baseline_P", "baseline_R", "baseline_F1"),
	    # Delta F1 (preamble faithfulness gain from adding depth context)
	    "delta_faith_F1",
	    # Stage 3 (VLM + depth + YOLO): timing, density, detection count
	    *("s3_total_s", "s3_depth_s", "s3_detect_s", "s3_vlm_s"),
	    *("s3_spatial_density", "s3_num_objects"),
	    # Preamble BERTScore Stage 3 (vs its own YOLO-enriched preamble)
	    "preamble_F1_s3",
	    # Faithfulness deltas for Stage 3: S3 F1 − S1 F1, then S3 F1 − S2 F1
	    # (the latter is the headline S2→S3 increment)
	    *("delta_faith_F1_s3_vs_s1", "delta_faith_F1_s3_vs_s2"),
	    # Raw text
	    *("s1_description", "s2_description", "s3_description", "depth_context"),
	]

	# All-zero timing records, one per stage, substituted when that stage
	# errors out on an image. Callers take a .copy() so these templates are
	# never mutated.
	_TIMING1_ZERO: dict[str, float] = dict.fromkeys(
	    ("vlm_s", "total_s", "vram_mb"), 0.0
	)
	_TIMING2_ZERO: dict[str, float] = dict.fromkeys(
	    ("depth_s", "vlm_s", "total_s", "vram_mb"), 0.0
	)
	_TIMING3_ZERO: dict[str, float] = dict.fromkeys(
	    ("depth_s", "yolo_s", "vlm_s", "total_s", "vram_mb", "n_detections"), 0.0
	)


	# ---------------------------------------------------------------------------
	# Main function
	# ---------------------------------------------------------------------------

	def run_ablation(
	image_paths: Sequence[str \| Path],
	output_csv: Path \| None = None,
	bert_device: str = "cpu",
	skip_stage3: bool = False,
	) -> tuple[list[dict], dict]:
	"""Run deterministic Stage 1 / Stage 2 / Stage 3 ablation over a set of images.

	No human-written references are needed. Ground truth is derived from:
	- The depth context preamble injected into each stage (Preamble BERTScore)
	- A fixed spatial vocabulary (Spatial Term Density)

	Descriptions are collected from all images first, then BERTScore is
	computed in a single batched call (more efficient than N individual calls).
	Failed images are included in the CSV with zero scores so they don't
	silently skew averages.

	Args:
	image_paths: Ordered list of image file paths.
	output_csv: Destination CSV file. Defaults to
	``RESULTS_DIR/bertscore_ablation.csv``.
	bert_device: Device for BERTScore model (``"cpu"`` keeps it off the
	GPU that the pipeline models occupy).
	skip_stage3: If True, Stage 3 columns are filled with zeros and
	``run_stage3`` is never called. Use when YOLO is
	unavailable.

	Returns:
	rows: List of per-image result dicts (same columns as CSV).
	summary: Dict with mean/std for each numeric column.
	"""
	if output_csv is None:
	output_csv = RESULTS_DIR / "bertscore_ablation.csv"
	output_csv = Path(output_csv)
	output_csv.parent.mkdir(parents=True, exist_ok=True)

	n = len(image_paths)
	pipeline = Pipeline()

	# ── Step 1: run all stages on every image ─────────────────────────────────
	s1_descriptions: list[str] = []
	s2_descriptions: list[str] = []
	s3_descriptions: list[str] = []
	s2_preambles: list[str] = []
	s3_preambles: list[str] = []
	s1_timings: list[dict[str, float]] = []
	s2_timings: list[dict[str, float]] = []
	s3_timings: list[dict[str, float]] = []
	errors: list[str] = [""] * n

	for i, img_path in enumerate(image_paths):
	img_path = Path(img_path)
	print(f"[{i + 1}/{n}] {img_path.name}", flush=True)

	try:
	frame_rgb = np.array(Image.open(img_path).convert("RGB"))
	except Exception:
	msg = f"LOAD_ERROR: {traceback.format_exc(limit=1).strip()}"
	print(f" WARNING: {msg}")
	errors[i] = msg
	s1_descriptions.append("")
	s2_descriptions.append("")
	s3_descriptions.append("")
	s2_preambles.append("")
	s3_preambles.append("")
	s1_timings.append(_TIMING1_ZERO.copy())
	s2_timings.append(_TIMING2_ZERO.copy())
	s3_timings.append(_TIMING3_ZERO.copy())
	continue

	# Stage 1
	try:
	desc1, t1 = pipeline.run_stage1(frame_rgb)
	print(f" S1 {t1['total_s']:.2f}s \| {desc1[:80]}...")
	except Exception:
	msg = f"STAGE1_ERROR: {traceback.format_exc(limit=1).strip()}"
	print(f" WARNING: {msg}")
	errors[i] = msg
	desc1, t1 = "", _TIMING1_ZERO.copy()

	# Stage 2 — also captures the preamble (depth context)
	try:
	desc2, ctx2, t2 = pipeline.run_stage2(frame_rgb)
	print(f" S2 {t2['total_s']:.2f}s \| {desc2[:80]}...")
	except Exception:
	msg2 = f"STAGE2_ERROR: {traceback.format_exc(limit=1).strip()}"
	print(f" WARNING: {msg2}")
	if not errors[i]:
	errors[i] = msg2
	desc2, ctx2, t2 = "", "", _TIMING2_ZERO.copy()

	# Stage 3 — own preamble includes per-object YOLO measurements
	if skip_stage3:
	desc3, ctx3, t3 = "", "", _TIMING3_ZERO.copy()
	else:
	try:
	desc3, ctx3, t3 = pipeline.run_stage3(frame_rgb)
	n_det = int(t3.get("n_detections", 0))
	print(
	f" S3 {t3['total_s']:.2f}s \| "
	f"objects={n_det} \| {desc3[:80]}..."
	)
	except Exception:
	msg3 = f"STAGE3_ERROR: {traceback.format_exc(limit=1).strip()}"
	print(f" WARNING: {msg3}")
	if not errors[i]:
	errors[i] = msg3
	desc3, ctx3, t3 = "", "", _TIMING3_ZERO.copy()

	s1_descriptions.append(desc1)
	s2_descriptions.append(desc2)
	s3_descriptions.append(desc3)
	s2_preambles.append(ctx2)
	s3_preambles.append(ctx3)
	s1_timings.append(t1)
	s2_timings.append(t2)
	s3_timings.append(t3)

	# ── Step 2: Spatial Term Density (no model required) ─────────────────────
	s1_densities = [_spatial_term_density(d) for d in s1_descriptions]
	s2_densities = [_spatial_term_density(d) for d in s2_descriptions]
	s3_densities = [_spatial_term_density(d) for d in s3_descriptions]

	# Free GPU memory before loading BERTScore model so there is headroom
	# even when the pipeline models are still in scope.
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	# ── Step 3: Preamble BERTScore (one batched call per comparison) ──────────
	# Stage 1 and Stage 2 are both scored against the Stage 2 preamble.
	# Stage 3 is scored against its own (YOLO-enriched) preamble.
	preamble_refs_s2 = [p if p else " " for p in s2_preambles]
	preamble_refs_s3 = [p if p else " " for p in s3_preambles]
	cands1 = [d if d else " " for d in s1_descriptions]
	cands2 = [d if d else " " for d in s2_descriptions]
	cands3 = [d if d else " " for d in s3_descriptions]

	print("\nComputing Preamble BERTScore (Stage 1 baseline)...", flush=True)
	bP1_t, bR1_t, bF1_1_t = compute_bertscore(
	cands1, preamble_refs_s2, lang="en", device=bert_device, verbose=False
	)

	print("Computing Preamble BERTScore (Stage 2)...", flush=True)
	bP2_t, bR2_t, bF1_2_t = compute_bertscore(
	cands2, preamble_refs_s2, lang="en", device=bert_device, verbose=False
	)

	if not skip_stage3:
	print("Computing Preamble BERTScore (Stage 3 vs own preamble)...", flush=True)
	_, _, bF1_3_t = compute_bertscore(
	cands3, preamble_refs_s3, lang="en", device=bert_device, verbose=False
	)
	bF1_3 = bF1_3_t.tolist()
	else:
	bF1_3 = [0.0] * n

	bP1 = bP1_t.tolist(); bR1 = bR1_t.tolist(); bF1_1 = bF1_1_t.tolist()
	bP2 = bP2_t.tolist(); bR2 = bR2_t.tolist(); bF1_2 = bF1_2_t.tolist()

	# ── Step 4: assemble per-image rows ───────────────────────────────────────
	rows: list[dict] = []
	for i, img_path in enumerate(image_paths):
	t1, t2, t3 = s1_timings[i], s2_timings[i], s3_timings[i]
	row: dict = {
	"image": Path(img_path).name,
	# Stage 1
	"s1_total_s": round(t1["total_s"], 3),
	"s1_spatial_density": round(s1_densities[i], 2),
	# Stage 2
	"s2_depth_s": round(t2.get("depth_s", 0.0), 3),
	"s2_vlm_s": round(t2.get("vlm_s", 0.0), 3),
	"s2_total_s": round(t2["total_s"], 3),
	"s2_spatial_density": round(s2_densities[i], 2),
	"spatial_uplift": round(s2_densities[i] - s1_densities[i], 2),
	# Preamble BERTScore — Stage 2
	"preamble_P": round(bP2[i], 4),
	"preamble_R": round(bR2[i], 4),
	"preamble_F1": round(bF1_2[i], 4),
	# Preamble BERTScore — Stage 1 null baseline
	"baseline_P": round(bP1[i], 4),
	"baseline_R": round(bR1[i], 4),
	"baseline_F1": round(bF1_1[i], 4),
	# Faithfulness gain S1 → S2
	"delta_faith_F1": round(bF1_2[i] - bF1_1[i], 4),
	# Stage 3
	"s3_total_s": round(t3["total_s"], 3),
	"s3_depth_s": round(t3.get("depth_s", 0.0), 3),
	"s3_detect_s": round(t3.get("yolo_s", 0.0), 3),
	"s3_vlm_s": round(t3.get("vlm_s", 0.0), 3),
	"s3_spatial_density": round(s3_densities[i], 2),
	"s3_num_objects": int(t3.get("n_detections", 0)),
	# Preamble BERTScore — Stage 3 vs own preamble
	"preamble_F1_s3": round(bF1_3[i], 4),
	# Faithfulness deltas
	"delta_faith_F1_s3_vs_s1": round(bF1_3[i] - bF1_1[i], 4),
	"delta_faith_F1_s3_vs_s2": round(bF1_3[i] - bF1_2[i], 4),
	# Raw text
	"s1_description": s1_descriptions[i] or errors[i],
	"s2_description": s2_descriptions[i] or errors[i],
	"s3_description": s3_descriptions[i] or errors[i],
	"depth_context": s2_preambles[i],
	}
	rows.append(row)

	# ── Step 5: compute summary statistics ────────────────────────────────────
	numeric_cols = [
	"s1_total_s", "s1_spatial_density",
	"s2_depth_s", "s2_vlm_s", "s2_total_s",
	"s2_spatial_density", "spatial_uplift",
	"preamble_P", "preamble_R", "preamble_F1",
	"baseline_P", "baseline_R", "baseline_F1",
	"delta_faith_F1",
	"s3_total_s", "s3_depth_s", "s3_detect_s", "s3_vlm_s",
	"s3_spatial_density", "s3_num_objects",
	"preamble_F1_s3",
	"delta_faith_F1_s3_vs_s1", "delta_faith_F1_s3_vs_s2",
	]
	summary: dict = {}
	for col in numeric_cols:
	vals = np.array([r[col] for r in rows], dtype=np.float64)
	summary[f"{col}_mean"] = round(float(vals.mean()), 4)
	summary[f"{col}_std"] = round(float(vals.std()), 4)

	text_cols = {"s1_description": "", "s2_description": "",
	"s3_description": "", "depth_context": ""}
	mean_row: dict = {"image": "MEAN"} \| {
	col: summary[f"{col}_mean"] for col in numeric_cols
	} \| text_cols
	std_row: dict = {"image": "STD"} \| {
	col: summary[f"{col}_std"] for col in numeric_cols
	} \| text_cols

	# ── Step 6: write CSV ─────────────────────────────────────────────────────
	with open(output_csv, "w", newline="", encoding="utf-8") as fh:
	writer = csv.DictWriter(fh, fieldnames=_FIELDNAMES)
	writer.writeheader()
	writer.writerows(rows)
	writer.writerow({}) # blank separator before summary
	writer.writerow(mean_row)
	writer.writerow(std_row)

	# ── Step 7: print summary ─────────────────────────────────────────────────
	_print_summary(summary, n, output_csv, skip_stage3=skip_stage3)

	return rows, summary


	# ---------------------------------------------------------------------------
	# Pretty-print helper
	# ---------------------------------------------------------------------------

	def _print_summary(
	    summary: dict,
	    n: int,
	    output_csv: Path,
	    skip_stage3: bool = False,
	) -> None:
	    """Print a human-readable summary table to stdout.

	    Args:
	        summary: Mapping of ``<column>_mean`` / ``<column>_std`` keys to
	            floats, as produced by ``run_ablation``.
	        n: Number of images the statistics were computed over.
	        output_csv: Path of the CSV that was written (only echoed).
	        skip_stage3: When True the Stage 3 column is labelled ``Stage 3*``,
	            a footnote is printed, and the Stage 3 delta and latency lines
	            are omitted.
	    """
	    s3_label = "Stage 3*" if skip_stage3 else "Stage 3"
	    sep = "-" * 72
	    print(f"\n{sep}")
	    print(f" Depth-Aware Ablation ({n} images) — reference-free metrics")
	    if skip_stage3:
	        print(" * Stage 3 skipped (--skip-stage3)")
	    print(sep)
	    print(
	        f" {'Metric':<28} {'Stage 1':>10} {'Stage 2':>10} {s3_label:>10}"
	    )
	    print(sep)

	    # Spatial Term Density — these are per-stage MEANS of the metric.
	    s1_density = summary["s1_spatial_density_mean"]
	    s2_density = summary["s2_spatial_density_mean"]
	    s3_density = summary["s3_spatial_density_mean"]
	    # The "+" format flag prints the real sign, so a negative uplift reads
	    # "(S2 -1.50)" rather than "(S2 +-1.50)".
	    print(
	        f" {'Spatial Term Density':<28} "
	        f"{s1_density:>10.2f} {s2_density:>10.2f} {s3_density:>10.2f} "
	        f"(S2 {summary['spatial_uplift_mean']:+.2f})"
	    )
	    print(sep)

	    # Preamble BERTScore: P and R exist for Stage 1/2 only, F1 for all three.
	    print(" Preamble BERTScore (vs depth context preamble)")
	    for metric, b_key, p_key, p3_key in (
	        ("P", "baseline_P", "preamble_P", None),
	        ("R", "baseline_R", "preamble_R", None),
	        ("F1", "baseline_F1", "preamble_F1", "preamble_F1_s3"),
	    ):
	        bm = summary[f"{b_key}_mean"]
	        bsd = summary[f"{b_key}_std"]
	        pm = summary[f"{p_key}_mean"]
	        psd = summary[f"{p_key}_std"]
	        if p3_key:
	            p3m = summary[f"{p3_key}_mean"]
	            p3sd = summary[f"{p3_key}_std"]
	            stage3_cell = f"{p3m:.4f}±{p3sd:.4f}"
	        else:
	            stage3_cell = f"{'—':>14}"
	        print(
	            f" {metric:<28} "
	            f"{bm:.4f}±{bsd:.4f} "
	            f"{pm:.4f}±{psd:.4f} "
	            f"{stage3_cell}"
	        )
	    print(sep)

	    # Faithfulness deltas
	    df12 = summary["delta_faith_F1_mean"]
	    df12s = summary["delta_faith_F1_std"]
	    print(
	        f" {'Faith. gain S1→S2 (ΔF1)':<28} "
	        f"{'':>10} {df12:>+.4f}±{df12s:.4f}"
	    )
	    if not skip_stage3:
	        df31 = summary["delta_faith_F1_s3_vs_s1_mean"]
	        df31s = summary["delta_faith_F1_s3_vs_s1_std"]
	        df32 = summary["delta_faith_F1_s3_vs_s2_mean"]
	        df32s = summary["delta_faith_F1_s3_vs_s2_std"]
	        print(
	            f" {'Faith. gain S1→S3 (ΔF1)':<28} "
	            f"{'':>10} {'':>10} {df31:>+.4f}±{df31s:.4f}"
	        )
	        print(
	            f" {'Faith. gain S2→S3 (ΔF1)':<28} "
	            f"{'':>10} {'':>10} {df32:>+.4f}±{df32s:.4f} ← headline"
	        )
	    print(sep)

	    # Latency
	    print(f" Latency S1 (mean) {summary['s1_total_s_mean']:>10.2f}s")
	    print(f" Latency S2 (mean) {summary['s2_total_s_mean']:>10.2f}s")
	    print(f" of which depth {summary['s2_depth_s_mean']:>10.2f}s")
	    print(f" of which VLM {summary['s2_vlm_s_mean']:>10.2f}s")
	    if not skip_stage3:
	        print(f" Latency S3 (mean) {summary['s3_total_s_mean']:>10.2f}s")
	        print(f" of which depth {summary['s3_depth_s_mean']:>10.2f}s")
	        print(f" of which detect {summary['s3_detect_s_mean']:>10.2f}s")
	        print(f" of which VLM {summary['s3_vlm_s_mean']:>10.2f}s")
	        print(
	            f" Objects/image (mean){summary['s3_num_objects_mean']:>10.2f}"
	        )
	    print(sep)
	    print(f" Results written to: {output_csv}")
	    print(sep)


	# ---------------------------------------------------------------------------
	# CLI entry point
	# ---------------------------------------------------------------------------

	def _parse_args(argv: list[str] \| None = None) -> argparse.Namespace:
	p = argparse.ArgumentParser(
	description=(
	"Deterministic ablation: Stage 1 vs Stage 2 vs Stage 3. "
	"No human references required."
	)
	)
	p.add_argument(
	"--images",
	required=True,
	help=(
	"Directory of images (sorted alphabetically) OR a text file "
	"listing one image path per line."
	),
	)
	p.add_argument(
	"--output",
	default=None,
	help="Destination CSV (default: outputs/results/bertscore_ablation.csv).",
	)
	p.add_argument(
	"--bert-device",
	default="cpu",
	help="Device for BERTScore model: 'cpu' or 'cuda' (default: cpu).",
	)
	p.add_argument(
	"--skip-stage3",
	action="store_true",
	default=False,
	help=(
	"Skip Stage 3 (YOLOv8n + depth + VLM). "
	"Use when ultralytics/YOLO is not installed. "
	"Stage 3 columns will be filled with zeros."
	),
	)
	return p.parse_args(argv)


	def _load_image_paths(images_arg: str) -> list[Path]:
	    """Resolve the ``--images`` argument into a non-empty list of image paths.

	    Args:
	        images_arg: Either a directory or a UTF-8 text file (manifest)
	            listing one image path per line.

	    Returns:
	        For a directory: its regular files with a ``.jpg``, ``.jpeg``,
	        ``.png``, ``.bmp`` or ``.webp`` suffix (case-insensitive), sorted.
	        For a manifest: the listed paths in file order, with surrounding
	        whitespace stripped and blank lines ignored. Listed paths are not
	        checked for existence here.

	    Raises:
	        FileNotFoundError: If the directory contains no images, if the
	            manifest lists no paths, or if the manifest itself is missing.
	    """
	    source = Path(images_arg)
	    if source.is_dir():
	        exts = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
	        # is_file() keeps sub-directories that merely end in an image suffix
	        # (e.g. "frames.jpg/") out of the result.
	        paths = sorted(
	            entry for entry in source.iterdir()
	            if entry.is_file() and entry.suffix.lower() in exts
	        )
	        if not paths:
	            raise FileNotFoundError(f"No images found in directory: {source}")
	        return paths

	    # Treat as a manifest file
	    lines = source.read_text(encoding="utf-8").splitlines()
	    paths = [Path(entry) for line in lines if (entry := line.strip())]
	    if not paths:
	        # Same failure mode as an empty directory, instead of returning []
	        # and failing later with a less obvious error.
	        raise FileNotFoundError(f"No image paths listed in manifest: {source}")
	    return paths


	def main(argv: list[str] | None = None) -> None:
	    """CLI entry point: parse arguments, collect images, run the ablation.

	    Args:
	        argv: Argument list forwarded to ``_parse_args``; ``None`` means
	            the process command line.
	    """
	    args = _parse_args(argv)
	    image_paths = _load_image_paths(args.images)

	    stage_tag = ""
	    if args.skip_stage3:
	        stage_tag = " (Stage 3 skipped)"
	    print(
	        f"Running ablation on {len(image_paths)} images "
	        f"(reference-free){stage_tag}..."
	    )

	    # An absent/empty --output lets run_ablation choose its default location.
	    destination = None
	    if args.output:
	        destination = Path(args.output)

	    run_ablation(
	        image_paths=image_paths,
	        output_csv=destination,
	        bert_device=args.bert_device,
	        skip_stage3=args.skip_stage3,
	    )


	# Run the CLI only when executed as a script / via ``python -m``.
	if __name__ == "__main__":
	    main()