from __future__ import annotations import argparse import hashlib import json import shutil import uuid from pathlib import Path from typing import Iterable ROOT_DIR = Path(__file__).resolve().parent.parent DEFAULT_OUTPUT = ROOT_DIR / "outputs" / "huggingface-space" FILES = [ "requirements.txt", "requirements-silma.txt", "requirements-supertonic.txt", "requirements-paddleocr.txt", "requirements-paddleocr-vl.txt", "requirements-qari-ocr.txt", "requirements-tawkeed-ocr.txt", "requirements-katib-ocr.txt", "requirements-arabic-qwen-ocr.txt", "requirements-arabic-glm-ocr.txt", "requirements-baseer-ocr.txt", ] DIRECTORIES = [ "app", "api", "docs", "static", "scripts", ] EXCLUDE_NAMES = { "__pycache__", ".pytest_cache", ".ruff_cache", } EXCLUDE_SUFFIXES = { ".pyc", ".pyo", ".pyd", } MANIFEST_NAME = ".export-manifest.json" def should_copy(path: Path) -> bool: if path.name in EXCLUDE_NAMES: return False if path.suffix in EXCLUDE_SUFFIXES: return False return True def copy_tree(source: Path, destination: Path) -> None: if destination.exists(): shutil.rmtree(destination) shutil.copytree(source, destination, ignore=lambda _dir, names: [name for name in names if not should_copy(Path(name))]) def iter_manifest_source_files(root: Path | None = None) -> Iterable[Path]: root = root or ROOT_DIR for relative in FILES: path = root / relative if path.exists(): yield path dockerfile = root / "Dockerfile.worker" if dockerfile.exists(): yield dockerfile for relative in DIRECTORIES: base = root / relative if not base.exists(): continue for path in sorted(base.rglob("*")): if path.is_file() and should_copy(path): yield path def file_sha256(path: Path) -> str: digest = hashlib.sha256() with path.open("rb") as handle: for chunk in iter(lambda: handle.read(1024 * 1024), b""): digest.update(chunk) return digest.hexdigest() def build_export_manifest(root: Path | None = None) -> dict[str, object]: root = root or ROOT_DIR files: dict[str, str] = {} for path in iter_manifest_source_files(root): relative = path.relative_to(root).as_posix() if relative == "Dockerfile.worker": relative = "Dockerfile" files[relative] = file_sha256(path) return { "version": 1, "source": "ArabicTranslator", "files": files, } def build_hf_space_bundle(output_dir: Path) -> list[str]: output_dir.mkdir(parents=True) copied: list[str] = [] for relative in FILES: source = ROOT_DIR / relative destination = output_dir / relative destination.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(source, destination) copied.append(relative) shutil.copy2(ROOT_DIR / "Dockerfile.worker", output_dir / "Dockerfile") copied.append("Dockerfile") for relative in DIRECTORIES: source = ROOT_DIR / relative destination = output_dir / relative copy_tree(source, destination) copied.append(relative) (output_dir / ".dockerignore").write_text( "\n".join( [ ".git", ".env", ".venv", ".venv-*", "__pycache__", ".pytest_cache", "outputs", "uploads", "data", "test_pdfs", "tests", "*.pyc", "*.pyo", "*.pyd", "*.log", "", ] ), encoding="utf-8", ) copied.append(".dockerignore") write_space_readme(output_dir / "README.md") copied.append("README.md") (output_dir / MANIFEST_NAME).write_text( json.dumps(build_export_manifest(), indent=2, sort_keys=True) + "\n", encoding="utf-8", ) copied.append(MANIFEST_NAME) (output_dir / ".export-complete").write_text("ready\n", encoding="utf-8") copied.append(".export-complete") return copied def export_hf_space(output_dir: Path = DEFAULT_OUTPUT, force: bool = False) -> dict[str, object]: output_dir = output_dir.resolve() if output_dir.exists() and not force: raise FileExistsError(f"{output_dir} already exists. Use --force to replace it.") parent = output_dir.parent parent.mkdir(parents=True, exist_ok=True) staging_dir = parent / f".{output_dir.name}.staging-{uuid.uuid4().hex}" backup_dir = parent / f".{output_dir.name}.previous-{uuid.uuid4().hex}" copied: list[str] = [] try: copied = build_hf_space_bundle(staging_dir) issues = validate_export(staging_dir) if issues: raise ValueError(f"Staged Hugging Face Space bundle is invalid: {', '.join(issues)}") if output_dir.exists(): output_dir.rename(backup_dir) staging_dir.rename(output_dir) except Exception: shutil.rmtree(staging_dir, ignore_errors=True) if backup_dir.exists() and not output_dir.exists(): backup_dir.rename(output_dir) raise finally: shutil.rmtree(backup_dir, ignore_errors=True) return {"outputDir": str(output_dir), "copied": copied} def write_space_readme(path: Path) -> None: path.write_text( """--- title: Arabic Audio Reader Worker colorFrom: green colorTo: green sdk: docker app_port: 7860 --- # Arabic Audio Reader Worker This is the Docker worker bundle for the Arabic PDF Reader. ## Hugging Face Space Settings - SDK: Docker - Hardware: free CPU is acceptable for demos, but cold starts and long books can be slow - Free CPU Basic currently provides 2 vCPU, 16 GB RAM, and 50 GB non-persistent disk by default; treat generated audio as short-lived unless you add persistent/object storage - Port: 7860 - Default build: installs SILMA, PaddleOCR Arabic, Tesseract Arabic, and eSpeak NG - Optional fast CPU voice: set Docker build arg `INSTALL_SUPERTONIC=1` to add Supertonic 3 Arabic-capable local TTS - Stronger OCR build: set Docker build arg `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` for Arabic-trained models, or `INSTALL_QARI_OCR=1` for the heavier Arabic-book model Set these Space secrets: ```text ACCESS_CODE=1234 SECRET_KEY= CORS_ORIGINS=https://your-vercel-app.vercel.app COOKIE_SAMESITE=none COOKIE_SECURE=1 OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4 DEFAULT_VOICE_ID=silma-local OUTPUT_RETENTION_DAYS=7 OUTPUT_MAX_FILES=25 AUDIO_FORMAT=mp3 MP3_BITRATE=96k ``` Generate the deployment handoff from the main repo to get the exact `SECRET_KEY`, worker secrets, Vercel environment variables, and final proof command: ```powershell python scripts\\deployment_handoff.py https://your-space.hf.space --origin https://your-vercel-app.vercel.app --code 1234 ``` Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets. The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio. Optional stronger-worker build args: ```text INSTALL_QARI_OCR=1 INSTALL_TAWKEED_OCR=1 INSTALL_KATIB_OCR=1 INSTALL_ARABIC_QWEN_OCR=1 INSTALL_ARABIC_GLM_OCR=1 INSTALL_BASEER_OCR=1 INSTALL_PADDLEOCR_VL=1 INSTALL_SUPERTONIC=1 ``` Use `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` first when you want an Arabic-trained OCR model. Use `INSTALL_QARI_OCR=1` when you want the strongest Arabic-book OCR and the worker has enough memory/GPU. Leave heavy options at `0` on free CPU Spaces unless a short benchmark proves the stronger model is worth the cold start, build time, memory, and runtime. After the Space builds, verify it from your main repo: ```powershell python scripts\\verify_worker.py https://your-space.hf.space --code 1234 --origin https://your-vercel-app.vercel.app --require-cors --smoke-upload --smoke-scanned --smoke-ocr-engine arabic ``` """, encoding="utf-8", ) def validate_export(output_dir: Path) -> list[str]: required = [ "Dockerfile", "README.md", ".dockerignore", MANIFEST_NAME, "requirements.txt", "requirements-silma.txt", "requirements-supertonic.txt", "requirements-paddleocr.txt", "requirements-paddleocr-vl.txt", "requirements-qari-ocr.txt", "requirements-tawkeed-ocr.txt", "requirements-katib-ocr.txt", "requirements-arabic-qwen-ocr.txt", "requirements-arabic-glm-ocr.txt", "requirements-baseer-ocr.txt", ".export-complete", "app/main.py", "api/index.py", "static/index.html", "scripts/setup_silma.sh", "scripts/setup_supertonic.sh", "scripts/setup_paddleocr.sh", "scripts/setup_paddleocr_vl.sh", "scripts/setup_qari_ocr.sh", "scripts/setup_tawkeed_ocr.sh", "scripts/setup_katib_ocr.sh", "scripts/setup_arabic_qwen_ocr.sh", "scripts/setup_arabic_glm_ocr.sh", "scripts/setup_baseer_ocr.sh", "scripts/qari_ocr_extract.py", "scripts/tawkeed_ocr_extract.py", "scripts/katib_ocr_extract.py", "scripts/arabic_qwen_ocr_extract.py", "scripts/arabic_glm_ocr_extract.py", "scripts/baseer_ocr_extract.py", "scripts/configure_vercel_worker.py", "scripts/deploy_hf_space.py", "scripts/finish_live_deployment.py", "scripts/prepare_live_deployment.py", "scripts/validate_deployment_env.py", "scripts/refresh_research_evidence.py", "scripts/score_voice_listening.py", "scripts/score_tts_preprocessor.py", "docs/recommended-free-stack.md", "docs/recommended-decision-card.md", "docs/recommended-decision-card.json", ] missing = [relative for relative in required if not (output_dir / relative).exists()] forbidden = [".env", "uploads", "outputs", "data", "test_pdfs", ".venv", ".venv-silma", ".venv-ocr"] present_forbidden = [relative for relative in forbidden if (output_dir / relative).exists()] return [f"missing:{item}" for item in missing] + [f"forbidden:{item}" for item in present_forbidden] def main() -> None: parser = argparse.ArgumentParser(description="Export a clean Hugging Face Spaces Docker worker bundle.") parser.add_argument("--out", type=Path, default=DEFAULT_OUTPUT, help="Destination folder for the Space bundle.") parser.add_argument("--force", action="store_true", help="Replace the destination folder if it already exists.") parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.") args = parser.parse_args() result = export_hf_space(args.out, force=args.force) issues = validate_export(args.out) result["ready"] = not issues result["issues"] = issues if args.json: print(json.dumps(result, indent=2)) else: print(f"Exported Hugging Face Space bundle to {result['outputDir']}") if issues: print("Issues:") for issue in issues: print(f"- {issue}") else: print("Bundle is ready to push to a Docker Space.") if issues: raise SystemExit(1) if __name__ == "__main__": main()