Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """One-shot: ingest ``outputs/ground_truth/*.json`` -> ``reference_translations``. | |
| This is a thin wrapper around | |
| ``polyglot_alpha.corpus.reference_loader.load_references`` so the | |
| ingestion path can be invoked from CI / ops without remembering the | |
| module name. | |
| Idempotent (the underlying ``_sync_ingest_reference_translations`` | |
| upserts on ``sample_id``). | |
| Usage:: | |
| .venv/bin/python scripts/ingest_reference_translations.py | |
| .venv/bin/python scripts/ingest_reference_translations.py --path /path/to/dir | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import asyncio | |
| import logging | |
| import sys | |
| from pathlib import Path | |
| from polyglot_alpha.corpus.db_ingestion import DEFAULT_REFERENCES_DIR | |
| from polyglot_alpha.corpus.reference_loader import load_references | |
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    The module docstring is used as the ``--help`` description, and a
    single optional ``--path`` argument (parsed into a ``Path``) selects
    what to ingest; it defaults to ``DEFAULT_REFERENCES_DIR``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The default formatter re-flows the description, which would merge
        # the docstring's usage examples into one paragraph; the raw
        # formatter prints the description with its line breaks intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--path",
        type=Path,
        default=DEFAULT_REFERENCES_DIR,
        help=(
            "Directory of *_ground_truth.json files or a .jsonl file. "
            "Default: %(default)s"
        ),
    )
    return parser
def main(argv: list[str] | None = None) -> int:
    """Run the ingestion CLI and return a process exit status.

    ``argv`` is the list of command-line arguments to parse; ``None``
    lets ``argparse`` fall back to ``sys.argv[1:]``.

    Returns ``1`` (after logging an error) when the requested path does
    not exist, and ``0`` once ``load_references`` has finished and its
    counters have been logged. Exceptions raised by ``load_references``
    are not caught here.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-7s %(name)s %(message)s",
    )
    log = logging.getLogger("ingest_reference_translations")
    args = _build_parser().parse_args(argv)

    # Checked up front so a missing path yields an actionable message and a
    # non-zero exit status before any ingestion work is started.
    if not args.path.exists():
        log.error(
            "reference path not found: %s — create the directory or pass --path",
            args.path,
        )
        return 1

    # ``load_references`` is a coroutine function; drive it to completion on
    # a fresh event loop.
    stats = asyncio.run(load_references(args.path))
    log.info(
        "done: inserted=%d updated=%d skipped=%d (total=%d)",
        stats.inserted,
        stats.updated,
        stats.skipped,
        stats.total,
    )
    return 0
if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    raise SystemExit(main())