Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Package an existing local index folder (e.g. rebuild_1217/storage) into a clean index folder. | |
| This is the fastest path if you already built the index locally and want to publish it | |
| to Hugging Face without rebuilding embeddings. | |
| Input (example): | |
| /path/to/storage/ | |
| chroma_db/ | |
| doc_store.db | |
| images/ # optional (ignored) | |
| Output: | |
| ./index_out/ | |
| chroma_db/ | |
| doc_store.db | |
| manifest.json | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import sys | |
| import shutil | |
| import sqlite3 | |
| import time | |
| from pathlib import Path | |
| from typing import Dict | |
| # Allow running as `python scripts/*.py` without installing the package. | |
| sys.path.append(str(Path(__file__).resolve().parents[1])) | |
def _count_by_source_type(doc_store_db: Path) -> Dict[str, int]:
    """Count the rows of the ``documents`` table per ``source_type``.

    Args:
        doc_store_db: Path to the SQLite document store.

    Returns:
        A mapping from each ``source_type`` value (coerced to ``str``) to the
        number of rows carrying it (coerced to ``int``).
    """
    connection = sqlite3.connect(str(doc_store_db))
    try:
        grouped_rows = connection.execute(
            "SELECT source_type, COUNT(*) FROM documents GROUP BY source_type"
        ).fetchall()
    finally:
        # Always release the database handle, even when the query fails.
        connection.close()
    return {str(kind): int(total) for kind, total in grouped_rows}
def main() -> int:
    """Package an existing storage folder into a clean index folder.

    Parses the command line, copies ``chroma_db/`` and ``doc_store.db`` from
    ``--storage`` into ``--output-dir`` (``images/`` is never copied), and
    writes a ``manifest.json`` describing the packaged index.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the storage folder lacks ``chroma_db`` or
            ``doc_store.db``; if the output folder already holds an index and
            ``--overwrite`` was not given; or if ``--overwrite`` would delete
            the source artifacts because the output folder is, or contains,
            one of them.
    """
    parser = argparse.ArgumentParser(
        description="Package existing index storage into index_out (no images)"
    )
    parser.add_argument(
        "--storage",
        type=str,
        required=True,
        help="Existing storage dir containing chroma_db/ + doc_store.db",
    )
    parser.add_argument("--output-dir", type=str, default="./index_out", help="Output folder")
    parser.add_argument(
        "--config",
        type=str,
        default="config/default_config.yaml",
        help="Config YAML (for embedding metadata)",
    )
    parser.add_argument("--overwrite", action="store_true", help="Overwrite output dir if exists")
    args = parser.parse_args()

    storage = Path(args.storage)
    src_chroma = storage / "chroma_db"
    src_doc = storage / "doc_store.db"
    if not src_chroma.exists() or not src_doc.exists():
        raise SystemExit(f"Storage missing required files: {src_chroma} / {src_doc}")

    out_dir = Path(args.output_dir)
    out_chroma = out_dir / "chroma_db"
    out_doc = out_dir / "doc_store.db"
    out_manifest = out_dir / "manifest.json"

    if args.overwrite:
        # --overwrite wipes out_dir recursively, so it must never be one of
        # the source artifacts or any folder that contains them.
        target = out_dir.resolve()
        for source in (src_chroma.resolve(), src_doc.resolve()):
            if target == source or target in source.parents:
                raise SystemExit(
                    f"Refusing to overwrite: output dir {out_dir} is or contains the source {source}"
                )
    elif out_chroma.exists() or out_doc.exists():
        raise SystemExit(f"Output already exists. Use --overwrite. ({out_dir})")

    # Imported lazily so argument parsing and path validation work without the
    # package, and loaded before touching the filesystem so a bad --config
    # cannot leave a half-written output folder behind.
    from radiology_rag.config import Config

    cfg = Config(args.config)

    if args.overwrite and out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Copy artifacts (exclude images/)
    shutil.copytree(src_chroma, out_chroma, dirs_exist_ok=False)
    shutil.copy2(src_doc, out_doc)

    counts = _count_by_source_type(out_doc)
    manifest = {
        # Timestamp is UTC (time.gmtime) and carries no zone suffix.
        "packaged_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
        "source_storage": str(storage),
        "embedding": {
            "model_name": cfg.get_str("embedding.model_name"),
            "type": cfg.get_str("embedding.type", "api"),
        },
        "processing": {
            "chunk_size": cfg.get_int("processing.chunk_size", 1024),
            "chunk_overlap": cfg.get_int("processing.chunk_overlap", 200),
        },
        "counts_by_source_type": counts,
        "artifacts": {"chroma_dir": "chroma_db", "doc_store": "doc_store.db"},
        "images_included": False,
    }
    with open(out_manifest, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"✅ Packaged index to: {out_dir}")
    print(f" - chroma_db: {out_chroma}")
    print(f" - doc_store: {out_doc}")
    print(f" - manifest: {out_manifest}")
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())