|
|
""" |
|
|
Package an existing local index folder (e.g. rebuild_1217/storage) into a clean index folder. |
|
|
|
|
|
This is the fastest path if you already built the index locally and want to publish it |
|
|
to Hugging Face without rebuilding embeddings. |
|
|
|
|
|
Input (example): |
|
|
/path/to/storage/ |
|
|
chroma_db/ |
|
|
doc_store.db |
|
|
images/ # optional (ignored) |
|
|
|
|
|
Output: |
|
|
./index_out/ |
|
|
chroma_db/ |
|
|
doc_store.db |
|
|
manifest.json |
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
import sys |
|
|
import shutil |
|
|
import sqlite3 |
|
|
import time |
|
|
from pathlib import Path |
|
|
from typing import Dict |
|
|
|
|
|
|
|
|
# Append the directory one level above this script's folder to the import path.
# NOTE(review): presumably this is the repository root and is what lets the
# `radiology_rag` import inside main() resolve when the script is run directly
# — confirm against the project layout.
sys.path.append(str(Path(__file__).resolve().parents[1]))
|
|
|
|
|
|
|
|
def _count_by_source_type(doc_store_db: Path) -> Dict[str, int]:
    """Tally the rows of the ``documents`` table per ``source_type``.

    Args:
        doc_store_db: Path to the SQLite document store to inspect.

    Returns:
        A mapping from each ``source_type`` value (converted with ``str``)
        to the number of rows carrying it, in the order SQLite returns the
        groups.
    """
    query = "SELECT source_type, COUNT(*) FROM documents GROUP BY source_type"
    connection = sqlite3.connect(str(doc_store_db))
    try:
        rows = connection.execute(query).fetchall()
    finally:
        # Always release the database handle, even if the query fails.
        connection.close()
    return {str(kind): int(total) for kind, total in rows}
|
|
|
|
|
|
|
|
def main() -> int:
    """Package an existing index storage folder into a clean output folder.

    Copies ``chroma_db/`` and ``doc_store.db`` from ``--storage`` into
    ``--output-dir`` and writes a ``manifest.json`` next to them describing
    the embedding/processing settings (read from ``--config``) and the
    per-source-type document counts. Images are never copied.

    All validation and the config load happen before anything on disk is
    removed or written, so a bad argument or config cannot destroy an
    existing output folder or leave a half-packaged one behind.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the storage folder lacks ``chroma_db/`` or
            ``doc_store.db``, if the output already exists and
            ``--overwrite`` was not given, or if the output location
            overlaps the source in a way that would delete or recursively
            copy the source data.
    """
    parser = argparse.ArgumentParser(description="Package existing index storage into index_out (no images)")
    parser.add_argument("--storage", type=str, required=True, help="Existing storage dir containing chroma_db/ + doc_store.db")
    parser.add_argument("--output-dir", type=str, default="./index_out", help="Output folder")
    parser.add_argument("--config", type=str, default="config/default_config.yaml", help="Config YAML (for embedding metadata)")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite output dir if exists")
    args = parser.parse_args()

    # Imported after argument parsing, so `--help` and usage errors do not
    # require the project package to be importable.
    from radiology_rag.config import Config

    storage = Path(args.storage)
    src_chroma = storage / "chroma_db"
    src_doc = storage / "doc_store.db"
    # Check the kind of each entry, not just existence: copytree needs a
    # directory and the doc store must be a regular file.
    if not src_chroma.is_dir() or not src_doc.is_file():
        raise SystemExit(f"Storage missing required files: {src_chroma} / {src_doc}")

    out_dir = Path(args.output_dir)
    out_chroma = out_dir / "chroma_db"
    out_doc = out_dir / "doc_store.db"
    out_manifest = out_dir / "manifest.json"

    # Compare resolved paths so relative paths and symlinks cannot hide an overlap.
    storage_real = storage.resolve()
    src_chroma_real = src_chroma.resolve()
    out_real = out_dir.resolve()

    # Copying chroma_db into a folder that lives inside chroma_db would make
    # copytree copy its own output.
    if out_real == src_chroma_real or src_chroma_real in out_real.parents:
        raise SystemExit(f"Output dir must not be inside the source chroma_db: {out_dir}")

    # --overwrite removes the whole output dir; if that dir is the storage
    # folder (or one of its ancestors) the source index would be deleted
    # before it could be copied.
    if args.overwrite and (out_real == storage_real or out_real in storage_real.parents):
        raise SystemExit(f"Refusing to overwrite: output dir contains the source storage ({out_dir})")

    if not args.overwrite and (out_chroma.exists() or out_doc.exists()):
        raise SystemExit(f"Output already exists. Use --overwrite. ({out_dir})")

    # Read the config before touching the output so a missing or invalid
    # config fails fast instead of after the old output has been removed.
    cfg = Config(args.config)
    embedding_meta = {
        "model_name": cfg.get_str("embedding.model_name"),
        "type": cfg.get_str("embedding.type", "api"),
    }
    processing_meta = {
        "chunk_size": cfg.get_int("processing.chunk_size", 1024),
        "chunk_overlap": cfg.get_int("processing.chunk_overlap", 200),
    }

    if args.overwrite and out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # At this point out_chroma cannot exist: either the whole output dir was
    # just recreated, or its absence was verified above.
    shutil.copytree(src_chroma, out_chroma, dirs_exist_ok=False)
    shutil.copy2(src_doc, out_doc)

    counts = _count_by_source_type(out_doc)
    manifest = {
        # UTC timestamp (time.gmtime), formatted without a zone suffix.
        "packaged_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
        "source_storage": str(storage),
        "embedding": embedding_meta,
        "processing": processing_meta,
        "counts_by_source_type": counts,
        "artifacts": {"chroma_dir": "chroma_db", "doc_store": "doc_store.db"},
        "images_included": False,
    }
    with open(out_manifest, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"β Packaged index to: {out_dir}")
    print(f" - chroma_db: {out_chroma}")
    print(f" - doc_store: {out_doc}")
    print(f" - manifest: {out_manifest}")
    return 0
|
|
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
|
|
|
|
|
|
|