polyglot-alpha / scripts /ingest_reference_translations.py
licaomeng
deploy: main@8970ffb β†’ HF Spaces (2026-05-27T05:19Z)
88d2f2a
#!/usr/bin/env python3
"""One-shot: ingest ``outputs/ground_truth/*.json`` -> ``reference_translations``.
This is a thin wrapper around
``polyglot_alpha.corpus.reference_loader.load_references`` so the
ingestion path can be invoked from CI / ops without remembering the
module name.
Idempotent (the underlying ``_sync_ingest_reference_translations``
upserts on ``sample_id``).
Usage::
.venv/bin/python scripts/ingest_reference_translations.py
.venv/bin/python scripts/ingest_reference_translations.py --path /path/to/dir
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import sys
from pathlib import Path
from polyglot_alpha.corpus.db_ingestion import DEFAULT_REFERENCES_DIR
from polyglot_alpha.corpus.reference_loader import load_references
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--path",
type=Path,
default=DEFAULT_REFERENCES_DIR,
help=(
"Directory of *_ground_truth.json files or a .jsonl file. "
"Default: %(default)s"
),
)
return parser
def main(argv: list[str] | None = None) -> int:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-7s %(name)s %(message)s",
)
log = logging.getLogger("ingest_reference_translations")
args = _build_parser().parse_args(argv)
if not args.path.exists():
log.error(
"reference path not found: %s β€” create the directory or pass --path",
args.path,
)
return 1
stats = asyncio.run(load_references(args.path))
log.info(
"done: inserted=%d updated=%d skipped=%d (total=%d)",
stats.inserted,
stats.updated,
stats.skipped,
stats.total,
)
return 0
if __name__ == "__main__":
sys.exit(main())