| |
| """Run Arabic diacritization locally from the command line.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import logging |
| import sys |
| from pathlib import Path |
|
|
| |
# parents[2] is the directory two levels above the folder that contains this
# file. It is pushed to the front of sys.path (once) before the ``backend.*``
# imports below run -- presumably so they resolve when this file is executed
# directly as a script rather than as part of an installed package.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))
|
|
| from backend.app.config import get_settings |
| from backend.app.services.model_service import model_service |
|
|
# Root logging configuration for command-line use: INFO and above, rendered as
# "LEVEL: message". basicConfig is a no-op if the root logger already has
# handlers, so importing this module does not override an existing setup.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)
# Module-scoped logger used for every status and error message in this script.
logger = logging.getLogger(__name__)
|
|
|
|
| def parse_args() -> argparse.Namespace: |
| parser = argparse.ArgumentParser( |
| description="Diacritize Arabic text locally using a Hugging Face pretrained model.", |
| ) |
| parser.add_argument( |
| "--text", |
| type=str, |
| help="Single Arabic sentence without diacritics", |
| ) |
| parser.add_argument( |
| "--file", |
| type=str, |
| help="Path to a text file with one sentence per line", |
| ) |
| parser.add_argument( |
| "--output", |
| type=str, |
| help="Output file path (for --file mode). Default: outputs/local_diacritized.txt", |
| ) |
| parser.add_argument( |
| "--model", |
| type=str, |
| default=None, |
| help="Hugging Face model ID (overrides MODEL_NAME env var)", |
| ) |
| parser.add_argument( |
| "--constants", |
| type=str, |
| default=None, |
| help="Path to constants/ folder (overrides DIAC_CONSTANTS_PATH)", |
| ) |
| parser.add_argument( |
| "--json", |
| action="store_true", |
| help="Print results as JSON", |
| ) |
| return parser.parse_args() |
|
|
|
|
def _run_text_mode(text: str, as_json: bool) -> int:
    """Diacritize one sentence, print the result, and return the exit code."""
    try:
        diacritized = model_service.diacritize(text)
    except Exception as exc:
        logger.error("Diacritization failed: %s", exc)
        return 1

    if as_json:
        print(json.dumps({"input": text, "diacritized": diacritized}, ensure_ascii=False))
    else:
        print(diacritized)
    return 0


def _run_file_mode(file_arg: str, output_arg: str | None, as_json: bool) -> int:
    """Diacritize every non-blank line of a file, save and print the results.

    Returns the process exit code: 0 on success, 1 on any reported failure.
    """
    input_path = Path(file_arg)
    # is_file() instead of exists(): a directory "exists" but reading it as
    # text would raise instead of producing the friendly error below.
    if not input_path.is_file():
        logger.error("Input file not found: %s", input_path)
        return 1

    try:
        raw_text = input_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # Unreadable or non-UTF-8 input is a user error, not a crash.
        logger.error("Failed to read input file %s: %s", input_path, exc)
        return 1

    # Keep only non-blank lines, with surrounding whitespace removed.
    lines = [stripped for line in raw_text.splitlines() if (stripped := line.strip())]
    if not lines:
        logger.error("Input file is empty")
        return 1

    try:
        # list() so the result can be measured, written and iterated again
        # regardless of which iterable type the service hands back.
        outputs = list(model_service.diacritize_batch(lines))
    except Exception as exc:
        logger.error("Batch diacritization failed: %s", exc)
        return 1

    # Check the pairing up front so a mismatch is reported before anything is
    # written, rather than surfacing as a ValueError from zip(strict=True)
    # after the output file already exists.
    if len(outputs) != len(lines):
        logger.error(
            "Model returned %d outputs for %d input lines", len(outputs), len(lines)
        )
        return 1

    output_path = Path(output_arg or "outputs/local_diacritized.txt")
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text("\n".join(outputs) + "\n", encoding="utf-8")
    except OSError as exc:
        logger.error("Failed to write output file %s: %s", output_path, exc)
        return 1

    if as_json:
        results = [{"input": i, "diacritized": o} for i, o in zip(lines, outputs, strict=True)]
        print(json.dumps({"results": results, "output_file": str(output_path)}, ensure_ascii=False, indent=2))
    else:
        logger.info("Wrote %d lines to %s", len(outputs), output_path)
        for inp, out in zip(lines, outputs, strict=True):
            print(f"IN: {inp}")
            print(f"OUT: {out}")
            print()

    return 0


def main() -> int:
    """Run the command-line diacritizer and return a process exit code.

    Parses the arguments, loads the model through ``model_service.load`` using
    the ``--model`` / ``--constants`` overrides or the values from
    ``get_settings()``, then handles either ``--text`` (single sentence) or
    ``--file`` (one sentence per line). When both are given, ``--text`` takes
    precedence and ``--file`` is ignored.

    Returns:
        0 on success; 1 when no input was supplied, the model failed to load,
        diacritization failed, or the input/output file could not be used.
    """
    args = parse_args()

    # Validate the input selection first so a usage mistake is reported
    # without depending on settings or model loading.
    if not args.text and not args.file:
        logger.error("Provide --text or --file")
        return 1

    settings = get_settings()
    model_name = args.model or settings.model_name
    constants_path = args.constants or settings.constants_path

    try:
        model_service.load(model_name, constants_path)
    except Exception as exc:
        # Top-level CLI boundary: report any load failure as a clean error.
        logger.error("Failed to load model: %s", exc)
        return 1

    if args.text:
        return _run_text_mode(args.text, args.json)
    return _run_file_mode(args.file, args.output, args.json)
|
|
|
|
# Script entry point: run main() and hand its return value to the interpreter
# as the process exit status. Nothing runs when the module is merely imported.
if __name__ == "__main__":
    sys.exit(main())
|
|