Spaces:
Sleeping
Sleeping
| import asyncio | |
| import json | |
| from pathlib import Path | |
| import click | |
| from mediastorm.config import CHROMADB_PATH | |
| def cli(): | |
| """MediaStorm RAG — Documentary archive search tool.""" | |
| pass | |
| def ingest(uid: str | None, cache_dir: str): | |
| """Fetch and parse MediaStorm data.""" | |
| from mediastorm.ingest.pipeline import ingest_all, ingest_story | |
| cache_path = Path(cache_dir) | |
| async def _run(): | |
| if uid: | |
| click.echo(f"Ingesting story {uid}...") | |
| story = await ingest_story(uid) | |
| cache_path.mkdir(parents=True, exist_ok=True) | |
| (cache_path / f"{uid}.json").write_text( | |
| json.dumps(story, ensure_ascii=False, indent=2) | |
| ) | |
| click.echo(f"Done. {len(story.get('videos', []))} videos found.") | |
| else: | |
| click.echo("Ingesting all stories...") | |
| stories = await ingest_all(cache_dir=cache_path) | |
| click.echo(f"Done. {len(stories)} stories ingested.") | |
| asyncio.run(_run()) | |
| def enrich(cache_dir: str, enrichment_dir: str): | |
| """Enrich ingested stories with LLM-extracted metadata.""" | |
| from mediastorm.enrich.enricher import enrich_story as _enrich | |
| cache_path = Path(cache_dir) | |
| enrichment_path = Path(enrichment_dir) | |
| enrichment_path.mkdir(parents=True, exist_ok=True) | |
| if not cache_path.exists(): | |
| click.echo("No ingested data found. Run 'ingest' first.") | |
| return | |
| files = list(cache_path.glob("*.json")) | |
| click.echo(f"Enriching {len(files)} stories...") | |
| async def _run(): | |
| for i, f in enumerate(files): | |
| story = json.loads(f.read_text()) | |
| uid = story["uid"] | |
| # Combine text for enrichment | |
| description = story.get("short_description", "") + " " + story.get("long_description", "") | |
| transcript = "" | |
| for video in story.get("videos", []): | |
| for turn in video.get("speaker_turns", []): | |
| speaker = turn.get("speaker", "") | |
| text = turn.get("text", "") | |
| transcript += f"{speaker}: {text}\n" if speaker else f"{text}\n" | |
| credits_str = ", ".join( | |
| f"{c.get('name', '')} ({c.get('role', '')})" | |
| for c in story.get("credits", []) | |
| if c.get("name") | |
| ) | |
| try: | |
| result = await _enrich( | |
| title=story.get("name", ""), | |
| description=description.strip(), | |
| transcript=transcript.strip(), | |
| cache_dir=enrichment_path, | |
| story_uid=uid, | |
| credits=credits_str, | |
| ) | |
| click.echo(f" [{i+1}/{len(files)}] {story.get('name', uid)} — {len(result.topics)} topics") | |
| except Exception as e: | |
| click.echo(f" [{i+1}/{len(files)}] FAIL {story.get('name', uid)}: {e}") | |
| asyncio.run(_run()) | |
| click.echo("Enrichment complete.") | |
| def vectorize(cache_dir: str, enrichment_dir: str, db_path: str | None): | |
| """Embed and index stories into ChromaDB + BM25.""" | |
| from mediastorm.enrich.schemas import StoryEnrichment | |
| from mediastorm.vectorize.chunker import chunk_story | |
| from mediastorm.vectorize.embedder import Embedder | |
| from mediastorm.vectorize.store import VectorStore | |
| from mediastorm.vectorize.bm25_store import BM25Store | |
| from mediastorm.config import BM25_INDEX_PATH | |
| cache_path = Path(cache_dir) | |
| enrichment_path = Path(enrichment_dir) | |
| chromadb_path = Path(db_path) if db_path else CHROMADB_PATH | |
| if not cache_path.exists(): | |
| click.echo("No ingested data. Run 'ingest' first.") | |
| return | |
| embedder = Embedder() | |
| store = VectorStore(path=chromadb_path) | |
| files = list(cache_path.glob("*.json")) | |
| click.echo(f"Vectorizing {len(files)} stories...") | |
| bm25_docs = [] | |
| for i, f in enumerate(files): | |
| story = json.loads(f.read_text()) | |
| uid = story["uid"] | |
| # Load enrichment (handle schema changes gracefully) | |
| enrichment_file = enrichment_path / f"{uid}.json" | |
| if enrichment_file.exists(): | |
| try: | |
| enrichment = StoryEnrichment.model_validate_json(enrichment_file.read_text()) | |
| except Exception: | |
| enrichment = StoryEnrichment(summary="") | |
| else: | |
| enrichment = StoryEnrichment(summary="") | |
| # Chunk story | |
| story_doc = chunk_story(story, enrichment) | |
| story_doc["embedding"] = embedder.embed_texts([story_doc["text"]])[0] | |
| store.upsert_stories([story_doc]) | |
| # Collect for BM25 | |
| bm25_docs.append({"id": story_doc["id"], "text": story_doc["text"]}) | |
| click.echo(f" [{i+1}/{len(files)}] {story.get('name', uid)}") | |
| # Build and save BM25 index | |
| bm25 = BM25Store(path=BM25_INDEX_PATH) | |
| bm25.build(bm25_docs) | |
| bm25.save() | |
| click.echo(f"Done. Stories: {store.stories_count()}, BM25 docs: {len(bm25_docs)}") | |
| def audit(subset: int, verbose: bool): | |
| """Run pipeline audit/benchmark.""" | |
| from audit import run_audit | |
| asyncio.run(run_audit(subset=subset, verbose=verbose)) | |
| def eval_cmd(verbose: bool, history: bool, pipeline: bool): | |
| """Run retrieval evaluation and compare to previous run.""" | |
| from mediastorm.eval.runner import ( | |
| _build_run_data, save_run, load_previous_run, load_all_runs, | |
| ) | |
| from mediastorm.eval.display import ( | |
| print_scores, print_verbose, print_diff, print_history, | |
| ) | |
| if history: | |
| runs = load_all_runs() | |
| print_history(runs) | |
| return | |
| # Load previous run BEFORE running eval (so the new run isn't compared to itself) | |
| previous = load_previous_run() | |
| # Run evaluation | |
| from eval_retrieval import run_eval | |
| mode = "pipeline (retriever + Gemini)" if pipeline else "retriever only" | |
| click.echo(f"Running retrieval evaluation ({mode})...") | |
| eval_result = asyncio.run(run_eval(verbose=False, quiet=True, pipeline=pipeline)) | |
| # Build and save run data | |
| run_data = _build_run_data(eval_result) | |
| path = save_run(run_data) | |
| click.echo(f"Results saved to {path}") | |
| # Display | |
| if verbose: | |
| print_verbose(run_data) | |
| print_scores(run_data) | |
| if previous: | |
| print_diff(run_data, previous) | |
| else: | |
| click.echo("\nFirst run — no comparison available.") | |
| if __name__ == "__main__": | |
| cli() | |