# File size: 3,753 Bytes (75db650)
"""
Package an existing local index folder (e.g. rebuild_1217/storage) into a clean index folder.

This is the fastest path if you already built the index locally and want to publish it
to Hugging Face without rebuilding embeddings.

Input (example):
  /path/to/storage/
    chroma_db/
    doc_store.db
    images/        # optional (ignored)

Output:
  ./index_out/
    chroma_db/
    doc_store.db
    manifest.json
"""

from __future__ import annotations

import argparse
import json
import sys
import shutil
import sqlite3
import time
from pathlib import Path
from typing import Dict

# Allow running as `python scripts/*.py` without installing the package.
# `parents[1]` is the directory one level above the folder that contains this
# script. The path is appended (not inserted at index 0), so entries already on
# `sys.path` are searched first.
sys.path.append(str(Path(__file__).resolve().parents[1]))


def _count_by_source_type(doc_store_db: Path) -> Dict[str, int]:
    """Return how many rows of the ``documents`` table fall under each ``source_type``.

    Args:
        doc_store_db: Path to the SQLite document store.

    Returns:
        Mapping of ``str(source_type)`` to its row count. A NULL ``source_type``
        therefore appears under the key ``"None"``.

    Raises:
        sqlite3.Error: propagated from the query (for example when the
            ``documents`` table does not exist).
    """
    connection = sqlite3.connect(str(doc_store_db))
    try:
        grouped_rows = connection.execute(
            "SELECT source_type, COUNT(*) FROM documents GROUP BY source_type"
        ).fetchall()
    finally:
        # Always release the database handle, even if the query failed.
        connection.close()
    return {str(kind): int(total) for kind, total in grouped_rows}


def main() -> int:
    """Copy an existing index (``chroma_db/`` + ``doc_store.db``) into a clean output folder.

    Parses the command line, validates the source storage directory, copies
    only ``chroma_db/`` and ``doc_store.db`` into ``--output-dir`` (so anything
    else in the storage folder, such as ``images/``, is left behind), and writes
    a ``manifest.json`` describing the package.

    Returns:
        0 on success.

    Raises:
        SystemExit: if the storage directory lacks ``chroma_db/`` or
            ``doc_store.db``; if the output location overlaps the source in a
            way that would delete it or copy it into itself; or if the output
            already holds an index and ``--overwrite`` was not given.
    """
    parser = argparse.ArgumentParser(description="Package existing index storage into index_out (no images)")
    parser.add_argument("--storage", type=str, required=True, help="Existing storage dir containing chroma_db/ + doc_store.db")
    parser.add_argument("--output-dir", type=str, default="./index_out", help="Output folder")
    parser.add_argument("--config", type=str, default="config/default_config.yaml", help="Config YAML (for embedding metadata)")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite output dir if exists")
    args = parser.parse_args()

    # Imported after argument parsing, so `--help` and usage errors exit
    # before the project package has to be importable.
    from radiology_rag.config import Config

    storage = Path(args.storage)
    src_chroma = storage / "chroma_db"
    src_doc = storage / "doc_store.db"
    # Check the entry *types* up front: a wrongly-typed source would otherwise
    # only fail inside copytree/copy2, after the old output was already removed.
    if not src_chroma.is_dir() or not src_doc.is_file():
        raise SystemExit(f"Storage missing required files: {src_chroma} / {src_doc}")

    out_dir = Path(args.output_dir)
    out_chroma = out_dir / "chroma_db"
    out_doc = out_dir / "doc_store.db"
    out_manifest = out_dir / "manifest.json"

    # Compare fully resolved paths so relative spellings and symlinks cannot
    # hide an overlap between the source and the destination.
    out_real = out_dir.resolve()
    chroma_real = src_chroma.resolve()
    if out_real == chroma_real or chroma_real in out_real.parents:
        # Copying chroma_db/ into a folder inside itself would recurse.
        raise SystemExit(f"Output dir must not be inside the source chroma_db: {out_dir}")
    if args.overwrite:
        # --overwrite removes the whole output dir; refuse when that directory
        # is, or contains, any of the source artifacts.
        protected = [p.resolve() for p in (storage, src_chroma, src_doc)]
        if any(out_real == p or out_real in p.parents for p in protected):
            raise SystemExit(
                f"Refusing to overwrite {out_dir}: it contains the source storage ({storage})"
            )

    # Load the config before touching the output so that a failure here does
    # not leave a package without a manifest behind.
    cfg = Config(args.config)

    if out_dir.exists():
        if args.overwrite:
            shutil.rmtree(out_dir)
        elif out_chroma.exists() or out_doc.exists():
            raise SystemExit(f"Output already exists. Use --overwrite. ({out_dir})")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Copy artifacts (exclude images/)
    shutil.copytree(src_chroma, out_chroma, dirs_exist_ok=False)
    shutil.copy2(src_doc, out_doc)

    counts = _count_by_source_type(out_doc)
    manifest = {
        # UTC timestamp (gmtime); the format carries no zone suffix.
        "packaged_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
        "source_storage": str(storage),
        "embedding": {"model_name": cfg.get_str("embedding.model_name"), "type": cfg.get_str("embedding.type", "api")},
        "processing": {
            "chunk_size": cfg.get_int("processing.chunk_size", 1024),
            "chunk_overlap": cfg.get_int("processing.chunk_overlap", 200),
        },
        "counts_by_source_type": counts,
        "artifacts": {"chroma_dir": "chroma_db", "doc_store": "doc_store.db"},
        "images_included": False,
    }
    with open(out_manifest, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"✓ Packaged index to: {out_dir}")
    print(f"  - chroma_db: {out_chroma}")
    print(f"  - doc_store: {out_doc}")
    print(f"  - manifest:  {out_manifest}")
    return 0


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    sys.exit(main())