Spaces:

team-0
/

team-0-hackathon

Sleeping

File size: 3,670 Bytes

"""
End-to-end test: pulls images and videos from HF datasets, indexes them
with ragstudio, syncs the index to the team-0/ragstudio bucket, then runs
a sample search over each modality.

Both the downloaded datasets and the FAISS index live under tests/.cache/,
so re-runs reuse them. Pass --force to rebuild the index from scratch.

Usage:
    python tests/e2e.py [--force]
"""

import argparse
import os
import re
import sys
from pathlib import Path

import cv2
from huggingface_hub import snapshot_download

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

IMAGE_DATASET = "team-0/pytorch-conference"
VIDEO_DATASET = "team-0/test-videos"

IMAGE_QUERY = "a person presenting on stage"

CACHE_DIR = ROOT / "tests" / ".cache"
DATA_DIR = CACHE_DIR / "data"
FRAMES_DIR = CACHE_DIR / "frames"

# Video meta entries are formatted by indexers/video.py as "<path> @ <t>s".
VIDEO_META_RE = re.compile(r"^(.*) @ (\d+\.\d+)s$")


def extract_video_frame(meta: str, out_dir: Path) -> Path:
    m = VIDEO_META_RE.match(meta)
    if not m:
        raise ValueError(f"unexpected video meta format: {meta!r}")
    vpath, t = m.group(1), float(m.group(2))

    cap = cv2.VideoCapture(vpath)
    fps = cap.get(cv2.CAP_PROP_FPS) or 0
    if fps > 0:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(round(t * fps)))
    else:
        cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
    ok, frame = cap.read()
    cap.release()
    if not ok:
        raise RuntimeError(f"failed to read frame at {t}s from {vpath}")

    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{Path(vpath).stem}_{t:.2f}s.png"
    cv2.imwrite(str(out_path), frame)
    return out_path


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__.strip())
    parser.add_argument("--force", action="store_true", help="rebuild the index even if cached")
    args = parser.parse_args()

    DATA_DIR.mkdir(parents=True, exist_ok=True)

    print(f"[1/4] Downloading datasets to {DATA_DIR}")
    snapshot_download(
        repo_id=IMAGE_DATASET,
        repo_type="dataset",
        local_dir=DATA_DIR / "images",
    )
    videos_dir = DATA_DIR / "videos"
    snapshot_download(
        repo_id=VIDEO_DATASET,
        repo_type="dataset",
        local_dir=videos_dir,
    )
    video_queries = [
        line.strip()
        for line in (videos_dir / "queries.txt").read_text().splitlines()
        if line.strip()
    ]
    print(f"  {len(video_queries)} video queries from dataset")

    # `index_data/` is resolved relative to cwd by index.py, sync.py, and
    # searchers/__init__.py — pin cwd to the repo root so the index persists
    # at <repo>/index_data/ instead of vanishing with a temp dir.
    os.chdir(ROOT)

    print(f"[2/4] Building index from {DATA_DIR}{' (force)' if args.force else ''}")
    from index import build_index

    build_index(DATA_DIR, force=args.force)

    print("[3/4] Syncing index to team-0/ragstudio bucket")
    from sync import sync
    sync()

    print("[4/4] Searching")
    from searchers import SEARCHERS


    image_hits = SEARCHERS["image"](IMAGE_QUERY, top_k=3)
    print(f"\n=== image: {IMAGE_QUERY!r} ===")
    for s, p in image_hits:
        print(f"  {s:.3f}  {p}")
    assert image_hits, "no image results"

    for q in video_queries:
        video_hits = SEARCHERS["video"](q, top_k=3)
        print(f"\n=== video: {q!r} ===")
        for s, meta in video_hits:
            frame_path = extract_video_frame(meta, FRAMES_DIR)
            print(f"  {s:.3f}  {meta}  ->  {frame_path}")
        assert video_hits, f"no video results for {q!r}"


    print("\nOK")


if __name__ == "__main__":
    main()