""" End-to-end test: pulls images and videos from HF datasets, indexes them with ragstudio, syncs the index to the team-0/ragstudio bucket, then runs a sample search over each modality. Both the downloaded datasets and the FAISS index live under tests/.cache/, so re-runs reuse them. Pass --force to rebuild the index from scratch. Usage: python tests/e2e.py [--force] """ import argparse import os import re import sys from pathlib import Path import cv2 from huggingface_hub import snapshot_download ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) IMAGE_DATASET = "team-0/pytorch-conference" VIDEO_DATASET = "team-0/test-videos" IMAGE_QUERY = "a person presenting on stage" CACHE_DIR = ROOT / "tests" / ".cache" DATA_DIR = CACHE_DIR / "data" FRAMES_DIR = CACHE_DIR / "frames" # Video meta entries are formatted by indexers/video.py as " @ s". VIDEO_META_RE = re.compile(r"^(.*) @ (\d+\.\d+)s$") def extract_video_frame(meta: str, out_dir: Path) -> Path: m = VIDEO_META_RE.match(meta) if not m: raise ValueError(f"unexpected video meta format: {meta!r}") vpath, t = m.group(1), float(m.group(2)) cap = cv2.VideoCapture(vpath) fps = cap.get(cv2.CAP_PROP_FPS) or 0 if fps > 0: cap.set(cv2.CAP_PROP_POS_FRAMES, int(round(t * fps))) else: cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000) ok, frame = cap.read() cap.release() if not ok: raise RuntimeError(f"failed to read frame at {t}s from {vpath}") out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / f"{Path(vpath).stem}_{t:.2f}s.png" cv2.imwrite(str(out_path), frame) return out_path def main() -> None: parser = argparse.ArgumentParser(description=__doc__.strip()) parser.add_argument("--force", action="store_true", help="rebuild the index even if cached") args = parser.parse_args() DATA_DIR.mkdir(parents=True, exist_ok=True) print(f"[1/4] Downloading datasets to {DATA_DIR}") snapshot_download( repo_id=IMAGE_DATASET, repo_type="dataset", local_dir=DATA_DIR / "images", ) videos_dir = DATA_DIR / "videos" snapshot_download( repo_id=VIDEO_DATASET, repo_type="dataset", local_dir=videos_dir, ) video_queries = [ line.strip() for line in (videos_dir / "queries.txt").read_text().splitlines() if line.strip() ] print(f" {len(video_queries)} video queries from dataset") # `index_data/` is resolved relative to cwd by index.py, sync.py, and # searchers/__init__.py — pin cwd to the repo root so the index persists # at /index_data/ instead of vanishing with a temp dir. os.chdir(ROOT) print(f"[2/4] Building index from {DATA_DIR}{' (force)' if args.force else ''}") from index import build_index build_index(DATA_DIR, force=args.force) print("[3/4] Syncing index to team-0/ragstudio bucket") from sync import sync sync() print("[4/4] Searching") from searchers import SEARCHERS image_hits = SEARCHERS["image"](IMAGE_QUERY, top_k=3) print(f"\n=== image: {IMAGE_QUERY!r} ===") for s, p in image_hits: print(f" {s:.3f} {p}") assert image_hits, "no image results" for q in video_queries: video_hits = SEARCHERS["video"](q, top_k=3) print(f"\n=== video: {q!r} ===") for s, meta in video_hits: frame_path = extract_video_frame(meta, FRAMES_DIR) print(f" {s:.3f} {meta} -> {frame_path}") assert video_hits, f"no video results for {q!r}" print("\nOK") if __name__ == "__main__": main()