# Thoracic-Radiology-RAG-System / scripts / package_existing_storage.py
# Repository page header: ZhangNy - "Add Space app files" - 75db650
"""
Package an existing local index folder (e.g. rebuild_1217/storage) into a clean index folder.
This is the fastest path if you already built the index locally and want to publish it
to Hugging Face without rebuilding embeddings.
Input (example):
    /path/to/storage/
        chroma_db/
        doc_store.db
        images/          # optional (ignored)

Output:
    ./index_out/
        chroma_db/
        doc_store.db
        manifest.json
"""
from __future__ import annotations
import argparse
import json
import sys
import shutil
import sqlite3
import time
from pathlib import Path
from typing import Dict
# Allow running as `python scripts/*.py` without installing the package:
# add the directory two levels above this file (the parent of the folder that
# holds this script) to the import search path, so the `radiology_rag` import
# inside main() can resolve from a plain checkout.
# NOTE(review): the entry is appended, not prepended, so any copy of the
# package found earlier on sys.path (e.g. an installed one) takes precedence.
sys.path.append(str(Path(__file__).resolve().parents[1]))
def _count_by_source_type(doc_store_db: Path) -> Dict[str, int]:
    """Return the number of rows in ``documents`` for each ``source_type``.

    Args:
        doc_store_db: Path to the SQLite document store file.

    Returns:
        Mapping of ``str(source_type)`` to its row count. A NULL
        ``source_type`` is therefore reported under the key ``"None"``.

    Raises:
        sqlite3.Error: If the database cannot be queried (for example the
            ``documents`` table does not exist).
    """
    connection = sqlite3.connect(str(doc_store_db))
    try:
        # Materialise the rows before closing; the mapping is built afterwards.
        grouped_rows = connection.execute(
            "SELECT source_type, COUNT(*) FROM documents GROUP BY source_type"
        ).fetchall()
    finally:
        # Always release the file handle, even if the query fails.
        connection.close()
    return {str(kind): int(total) for kind, total in grouped_rows}
def main() -> int:
    """Package an existing index folder into a clean output folder.

    Copies ``chroma_db/`` and ``doc_store.db`` from ``--storage`` into
    ``--output-dir`` (anything else in the storage folder, such as
    ``images/``, is not copied) and writes a ``manifest.json`` describing
    the embedding/processing settings and per-source-type document counts.

    All validation and config loading happens before anything on disk is
    deleted or copied, so a bad argument or config leaves both the source
    and any existing output untouched.

    Returns:
        0 on success.

    Raises:
        SystemExit: If the storage folder is incomplete, if the output
            already exists and ``--overwrite`` was not given, or if the
            output location overlaps the source in a way that would destroy
            or recursively copy it.
    """
    parser = argparse.ArgumentParser(description="Package existing index storage into index_out (no images)")
    parser.add_argument("--storage", type=str, required=True, help="Existing storage dir containing chroma_db/ + doc_store.db")
    parser.add_argument("--output-dir", type=str, default="./index_out", help="Output folder")
    parser.add_argument("--config", type=str, default="config/default_config.yaml", help="Config YAML (for embedding metadata)")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite output dir if exists")
    args = parser.parse_args()

    storage = Path(args.storage)
    src_chroma = storage / "chroma_db"
    src_doc = storage / "doc_store.db"
    if not src_chroma.exists() or not src_doc.exists():
        raise SystemExit(f"Storage missing required files: {src_chroma} / {src_doc}")

    out_dir = Path(args.output_dir)
    out_chroma = out_dir / "chroma_db"
    out_doc = out_dir / "doc_store.db"
    out_manifest = out_dir / "manifest.json"

    if not args.overwrite and (out_chroma.exists() or out_doc.exists()):
        raise SystemExit(f"Output already exists. Use --overwrite. ({out_dir})")

    # Compare fully resolved paths so symlinks / relative spellings of the same
    # location are still detected.
    storage_real = storage.resolve()
    chroma_real = src_chroma.resolve()
    out_real = out_dir.resolve()

    # --overwrite removes the whole output dir; refuse when that would also
    # remove the source index we are about to copy from.
    if args.overwrite and (out_real == storage_real or out_real in storage_real.parents):
        raise SystemExit(
            f"Refusing to overwrite: output dir {out_dir} is or contains the source storage {storage}"
        )
    # Writing at/inside the source chroma_db would copy the tree into itself.
    if out_real == chroma_real or chroma_real in out_real.parents:
        raise SystemExit(
            f"Refusing to package: output dir {out_dir} is inside the source chroma_db {src_chroma}"
        )

    # Load and read the config before touching the disk so config problems do
    # not leave a half-built output folder behind.
    from radiology_rag.config import Config

    cfg = Config(args.config)
    embedding_meta = {
        "model_name": cfg.get_str("embedding.model_name"),
        "type": cfg.get_str("embedding.type", "api"),
    }
    processing_meta = {
        "chunk_size": cfg.get_int("processing.chunk_size", 1024),
        "chunk_overlap": cfg.get_int("processing.chunk_overlap", 200),
    }

    if args.overwrite and out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Copy artifacts (exclude images/)
    shutil.copytree(src_chroma, out_chroma)
    shutil.copy2(src_doc, out_doc)

    counts = _count_by_source_type(out_doc)
    manifest = {
        # Timestamp is UTC (time.gmtime), formatted without a zone suffix.
        "packaged_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
        "source_storage": str(storage),
        "embedding": embedding_meta,
        "processing": processing_meta,
        "counts_by_source_type": counts,
        "artifacts": {"chroma_dir": "chroma_db", "doc_store": "doc_store.db"},
        "images_included": False,
    }
    with open(out_manifest, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"✓ Packaged index to: {out_dir}")
    print(f" - chroma_db: {out_chroma}")
    print(f" - doc_store: {out_doc}")
    print(f" - manifest: {out_manifest}")
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())