""" Package an existing local index folder (e.g. rebuild_1217/storage) into a clean index folder. This is the fastest path if you already built the index locally and want to publish it to Hugging Face without rebuilding embeddings. Input (example): /path/to/storage/ chroma_db/ doc_store.db images/ # optional (ignored) Output: ./index_out/ chroma_db/ doc_store.db manifest.json """ from __future__ import annotations import argparse import json import sys import shutil import sqlite3 import time from pathlib import Path from typing import Dict # Allow running as `python scripts/*.py` without installing the package. sys.path.append(str(Path(__file__).resolve().parents[1])) def _count_by_source_type(doc_store_db: Path) -> Dict[str, int]: counts: Dict[str, int] = {} conn = sqlite3.connect(str(doc_store_db)) try: cur = conn.cursor() cur.execute("SELECT source_type, COUNT(*) FROM documents GROUP BY source_type") for source_type, count in cur.fetchall(): counts[str(source_type)] = int(count) finally: conn.close() return counts def main() -> int: parser = argparse.ArgumentParser(description="Package existing index storage into index_out (no images)") parser.add_argument("--storage", type=str, required=True, help="Existing storage dir containing chroma_db/ + doc_store.db") parser.add_argument("--output-dir", type=str, default="./index_out", help="Output folder") parser.add_argument("--config", type=str, default="config/default_config.yaml", help="Config YAML (for embedding metadata)") parser.add_argument("--overwrite", action="store_true", help="Overwrite output dir if exists") args = parser.parse_args() from radiology_rag.config import Config storage = Path(args.storage) src_chroma = storage / "chroma_db" src_doc = storage / "doc_store.db" if not src_chroma.exists() or not src_doc.exists(): raise SystemExit(f"Storage missing required files: {src_chroma} / {src_doc}") out_dir = Path(args.output_dir) out_chroma = out_dir / "chroma_db" out_doc = out_dir / "doc_store.db" out_manifest = out_dir / "manifest.json" if out_dir.exists() and args.overwrite: shutil.rmtree(out_dir) out_dir.mkdir(parents=True, exist_ok=True) if out_chroma.exists() or out_doc.exists(): if not args.overwrite: raise SystemExit(f"Output already exists. Use --overwrite. ({out_dir})") # Copy artifacts (exclude images/) if out_chroma.exists(): shutil.rmtree(out_chroma, ignore_errors=True) shutil.copytree(src_chroma, out_chroma, dirs_exist_ok=False) shutil.copy2(src_doc, out_doc) cfg = Config(args.config) counts = _count_by_source_type(out_doc) manifest = { "packaged_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()), "source_storage": str(storage), "embedding": {"model_name": cfg.get_str("embedding.model_name"), "type": cfg.get_str("embedding.type", "api")}, "processing": { "chunk_size": cfg.get_int("processing.chunk_size", 1024), "chunk_overlap": cfg.get_int("processing.chunk_overlap", 200), }, "counts_by_source_type": counts, "artifacts": {"chroma_dir": "chroma_db", "doc_store": "doc_store.db"}, "images_included": False, } with open(out_manifest, "w", encoding="utf-8") as f: json.dump(manifest, f, ensure_ascii=False, indent=2) print(f"✓ Packaged index to: {out_dir}") print(f" - chroma_db: {out_chroma}") print(f" - doc_store: {out_doc}") print(f" - manifest: {out_manifest}") return 0 if __name__ == "__main__": raise SystemExit(main())