#!/usr/bin/env python3 """Run Arabic diacritization locally from the command line.""" from __future__ import annotations import argparse import json import logging import sys from pathlib import Path # Ensure project root is on sys.path when run as a script PROJECT_ROOT = Path(__file__).resolve().parents[2] if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) from backend.app.config import get_settings from backend.app.services.model_service import model_service logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") logger = logging.getLogger(__name__) def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Diacritize Arabic text locally using a Hugging Face pretrained model.", ) parser.add_argument( "--text", type=str, help="Single Arabic sentence without diacritics", ) parser.add_argument( "--file", type=str, help="Path to a text file with one sentence per line", ) parser.add_argument( "--output", type=str, help="Output file path (for --file mode). Default: outputs/local_diacritized.txt", ) parser.add_argument( "--model", type=str, default=None, help="Hugging Face model ID (overrides MODEL_NAME env var)", ) parser.add_argument( "--constants", type=str, default=None, help="Path to constants/ folder (overrides DIAC_CONSTANTS_PATH)", ) parser.add_argument( "--json", action="store_true", help="Print results as JSON", ) return parser.parse_args() def main() -> int: args = parse_args() settings = get_settings() model_name = args.model or settings.model_name constants_path = args.constants or settings.constants_path if not args.text and not args.file: logger.error("Provide --text or --file") return 1 try: model_service.load(model_name, constants_path) except Exception as exc: logger.error("Failed to load model: %s", exc) return 1 if args.text: try: diacritized = model_service.diacritize(args.text) except Exception as exc: logger.error("Diacritization failed: %s", exc) return 1 if args.json: print(json.dumps({"input": args.text, "diacritized": diacritized}, ensure_ascii=False)) else: print(diacritized) return 0 input_path = Path(args.file) if not input_path.exists(): logger.error("Input file not found: %s", input_path) return 1 lines = [line.strip() for line in input_path.read_text(encoding="utf-8").splitlines() if line.strip()] if not lines: logger.error("Input file is empty") return 1 try: outputs = model_service.diacritize_batch(lines) except Exception as exc: logger.error("Batch diacritization failed: %s", exc) return 1 output_path = Path(args.output or "outputs/local_diacritized.txt") output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text("\n".join(outputs) + "\n", encoding="utf-8") if args.json: results = [{"input": i, "diacritized": o} for i, o in zip(lines, outputs, strict=True)] print(json.dumps({"results": results, "output_file": str(output_path)}, ensure_ascii=False, indent=2)) else: logger.info("Wrote %d lines to %s", len(outputs), output_path) for inp, out in zip(lines, outputs, strict=True): print(f"IN: {inp}") print(f"OUT: {out}") print() return 0 if __name__ == "__main__": raise SystemExit(main())