| from __future__ import annotations
|
|
|
| import argparse
|
| import hashlib
|
| import json
|
| import shutil
|
| import uuid
|
| from pathlib import Path
|
| from typing import Iterable
|
|
|
|
|
# Directory two levels above this file; every source path below is relative to it.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Destination used when the caller does not pass an explicit output folder.
DEFAULT_OUTPUT = ROOT_DIR.joinpath("outputs", "huggingface-space")

# Individual top-level files copied into the bundle under the same name.
FILES = [
    "requirements.txt",
    "requirements-silma.txt",
    "requirements-supertonic.txt",
    "requirements-paddleocr.txt",
    "requirements-paddleocr-vl.txt",
    "requirements-qari-ocr.txt",
    "requirements-tawkeed-ocr.txt",
    "requirements-katib-ocr.txt",
    "requirements-arabic-qwen-ocr.txt",
    "requirements-arabic-glm-ocr.txt",
    "requirements-baseer-ocr.txt",
]
# Top-level directories copied recursively into the bundle.
DIRECTORIES = ["app", "api", "docs", "static", "scripts"]
# Entry names that are never copied, wherever they appear in a tree.
EXCLUDE_NAMES = {"__pycache__", ".pytest_cache", ".ruff_cache"}
# File suffixes that are never copied.
EXCLUDE_SUFFIXES = {".pyc", ".pyo", ".pyd"}
# File inside the bundle that stores the SHA-256 digest of each exported source file.
MANIFEST_NAME = ".export-manifest.json"
|
|
|
|
|
def should_copy(path: Path) -> bool:
    """Return whether *path* may be included in the export.

    A path is rejected when its final component is listed in
    ``EXCLUDE_NAMES`` or when its suffix is listed in ``EXCLUDE_SUFFIXES``.
    Only the final component is inspected; parent directories are not.
    """
    rejected = path.name in EXCLUDE_NAMES or path.suffix in EXCLUDE_SUFFIXES
    return not rejected
|
|
|
|
|
def copy_tree(source: Path, destination: Path) -> None:
    """Replace *destination* with a filtered copy of the *source* tree.

    An existing *destination* is deleted first. While copying, every entry
    for which ``should_copy`` returns False is left out, at any depth.
    """

    def _rejected(_directory, names):
        # copytree asks for the names to skip, i.e. the inverse of should_copy.
        return [entry for entry in names if not should_copy(Path(entry))]

    if destination.exists():
        shutil.rmtree(destination)
    shutil.copytree(source, destination, ignore=_rejected)
|
|
|
|
|
def iter_manifest_source_files(root: Path | None = None) -> Iterable[Path]:
    """Yield the source files whose digests belong in the export manifest.

    The order is: each existing entry of ``FILES``, then ``Dockerfile.worker``
    if present, then every exportable file under each existing entry of
    ``DIRECTORIES`` (sorted per directory).

    Args:
        root: Directory the relative names are resolved against. Falls back
            to ``ROOT_DIR`` when omitted.

    Yields:
        Paths located under *root*.
    """
    root = root or ROOT_DIR
    for relative in FILES:
        path = root / relative
        if path.exists():
            yield path
    dockerfile = root / "Dockerfile.worker"
    if dockerfile.exists():
        yield dockerfile
    for relative in DIRECTORIES:
        base = root / relative
        if not base.exists():
            continue
        for path in sorted(base.rglob("*")):
            if not path.is_file():
                continue
            # copy_tree drops an excluded entry together with everything
            # beneath it, so every component below the exported directory has
            # to pass the filter -- not just the file name. Otherwise files
            # such as scripts/.pytest_cache/README.md would be listed in the
            # manifest although they never reach the bundle.
            components = path.relative_to(base).parts
            if all(should_copy(Path(component)) for component in components):
                yield path
|
|
|
|
|
def file_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces, so it is never held in memory whole.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks the end of the file.
                break
            hasher.update(block)
    return hasher.hexdigest()
|
|
|
|
|
def build_export_manifest(root: Path | None = None) -> dict[str, object]:
    """Build the manifest describing the exportable source files under *root*.

    Args:
        root: Directory to scan. Falls back to ``ROOT_DIR`` when omitted.

    Returns:
        A dict with ``"version"``, ``"source"`` and ``"files"``; ``"files"``
        maps each POSIX-style path relative to *root* to the file's SHA-256
        hex digest.
    """
    root = root or ROOT_DIR
    digests: dict[str, str] = {}
    for source in iter_manifest_source_files(root):
        key = source.relative_to(root).as_posix()
        # The worker Dockerfile is recorded under the name "Dockerfile".
        digests["Dockerfile" if key == "Dockerfile.worker" else key] = file_sha256(source)
    manifest: dict[str, object] = {
        "version": 1,
        "source": "ArabicTranslator",
        "files": digests,
    }
    return manifest
|
|
|
|
|
def build_hf_space_bundle(output_dir: Path) -> list[str]:
    """Assemble the Space bundle inside a newly created *output_dir*.

    Steps, in order: copy every entry of ``FILES``, copy ``Dockerfile.worker``
    as ``Dockerfile``, copy every entry of ``DIRECTORIES`` through
    ``copy_tree``, then write ``.dockerignore``, ``README.md``, the export
    manifest and the ``.export-complete`` marker.

    Args:
        output_dir: Directory to create and fill. ``mkdir`` is called without
            ``exist_ok``, so an existing directory raises ``FileExistsError``.

    Returns:
        The bundle-relative names of everything written, in creation order.
    """
    output_dir.mkdir(parents=True)
    written: list[str] = []

    for name in FILES:
        target = output_dir / name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(ROOT_DIR / name, target)
        written.append(name)

    # The worker Dockerfile is renamed to the plain "Dockerfile" in the bundle.
    shutil.copy2(ROOT_DIR / "Dockerfile.worker", output_dir / "Dockerfile")
    written.append("Dockerfile")

    for name in DIRECTORIES:
        copy_tree(ROOT_DIR / name, output_dir / name)
        written.append(name)

    ignore_patterns = (
        ".git",
        ".env",
        ".venv",
        ".venv-*",
        "__pycache__",
        ".pytest_cache",
        "outputs",
        "uploads",
        "data",
        "test_pdfs",
        "tests",
        "*.pyc",
        "*.pyo",
        "*.pyd",
        "*.log",
    )
    # One pattern per line, each terminated by a newline.
    dockerignore_text = "".join(f"{pattern}\n" for pattern in ignore_patterns)
    (output_dir / ".dockerignore").write_text(dockerignore_text, encoding="utf-8")
    written.append(".dockerignore")

    write_space_readme(output_dir / "README.md")
    written.append("README.md")

    manifest_text = json.dumps(build_export_manifest(), indent=2, sort_keys=True) + "\n"
    (output_dir / MANIFEST_NAME).write_text(manifest_text, encoding="utf-8")
    written.append(MANIFEST_NAME)

    # The marker is written last, after every other bundle file exists.
    (output_dir / ".export-complete").write_text("ready\n", encoding="utf-8")
    written.append(".export-complete")
    return written
|
|
|
|
|
def export_hf_space(output_dir: Path = DEFAULT_OUTPUT, force: bool = False) -> dict[str, object]:
    """Export the Space bundle to *output_dir* via a staged swap.

    The bundle is built and validated in a uniquely named staging directory
    next to *output_dir*. Only when it is valid is an existing *output_dir*
    moved aside and the staging directory renamed into place.

    Args:
        output_dir: Destination folder; resolved to an absolute path.
        force: Allow replacing an existing *output_dir*.

    Returns:
        ``{"outputDir": <absolute path as str>, "copied": <names written>}``.

    Raises:
        FileExistsError: *output_dir* exists and *force* is false.
        ValueError: The staged bundle failed ``validate_export``.
    """
    output_dir = output_dir.resolve()
    if output_dir.exists() and not force:
        raise FileExistsError(f"{output_dir} already exists. Use --force to replace it.")

    parent = output_dir.parent
    parent.mkdir(parents=True, exist_ok=True)
    staging_dir = parent / f".{output_dir.name}.staging-{uuid.uuid4().hex}"
    backup_dir = parent / f".{output_dir.name}.previous-{uuid.uuid4().hex}"
    try:
        copied = build_hf_space_bundle(staging_dir)
        issues = validate_export(staging_dir)
        if issues:
            raise ValueError(f"Staged Hugging Face Space bundle is invalid: {', '.join(issues)}")
        if output_dir.exists():
            output_dir.rename(backup_dir)
        staging_dir.rename(output_dir)
    except BaseException:
        # BaseException so that an interrupt (e.g. Ctrl-C) between the two
        # renames still triggers the rollback; the error is always re-raised.
        shutil.rmtree(staging_dir, ignore_errors=True)
        if backup_dir.exists() and not output_dir.exists():
            backup_dir.rename(output_dir)
        raise
    else:
        # Remove the previous bundle only after the new one is in place. If
        # the rollback above could not restore it, the backup stays on disk
        # for manual recovery instead of being deleted.
        shutil.rmtree(backup_dir, ignore_errors=True)
    return {"outputDir": str(output_dir), "copied": copied}
|
|
|
|
|
def write_space_readme(path: Path) -> None:
    """Write the Hugging Face Space ``README.md`` to *path* as UTF-8.

    The text starts with the YAML front matter block (title, colors,
    ``sdk: docker``, ``app_port: 7860``) followed by setup notes for the
    worker. Lines of the literal carry no trailing decoration, so the
    ``KEY=value`` examples can be copied as they are.
    """
    readme_text = """---
title: Arabic Audio Reader Worker
colorFrom: green
colorTo: green
sdk: docker
app_port: 7860
---

# Arabic Audio Reader Worker

This is the Docker worker bundle for the Arabic PDF Reader.

## Hugging Face Space Settings

- SDK: Docker
- Hardware: free CPU is acceptable for demos, but cold starts and long books can be slow
- Free CPU Basic currently provides 2 vCPU, 16 GB RAM, and 50 GB non-persistent disk by default; treat generated audio as short-lived unless you add persistent/object storage
- Port: 7860
- Default build: installs SILMA, PaddleOCR Arabic, Tesseract Arabic, and eSpeak NG
- Optional fast CPU voice: set Docker build arg `INSTALL_SUPERTONIC=1` to add Supertonic 3 Arabic-capable local TTS
- Stronger OCR build: set Docker build arg `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` for Arabic-trained models, or `INSTALL_QARI_OCR=1` for the heavier Arabic-book model

Set these Space secrets:

```text
ACCESS_CODE=1234
SECRET_KEY=<generated by outputs\\deployment-handoff.md>
CORS_ORIGINS=https://your-vercel-app.vercel.app
COOKIE_SAMESITE=none
COOKIE_SECURE=1
OCR_ENGINE=tesseract
OCR_RENDER_ZOOM=2
TESSERACT_PSM=4
DEFAULT_VOICE_ID=silma-local
OUTPUT_RETENTION_DAYS=7
OUTPUT_MAX_FILES=25
AUDIO_FORMAT=mp3
MP3_BITRATE=96k
```

Generate the deployment handoff from the main repo to get the exact `SECRET_KEY`, worker secrets, Vercel environment variables, and final proof command:

```powershell
python scripts\\deployment_handoff.py https://your-space.hf.space --origin https://your-vercel-app.vercel.app --code 1234
```

Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.

The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.

Optional stronger-worker build args:

```text
INSTALL_QARI_OCR=1
INSTALL_TAWKEED_OCR=1
INSTALL_KATIB_OCR=1
INSTALL_ARABIC_QWEN_OCR=1
INSTALL_ARABIC_GLM_OCR=1
INSTALL_BASEER_OCR=1
INSTALL_PADDLEOCR_VL=1
INSTALL_SUPERTONIC=1
```

Use `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` first when you want an Arabic-trained OCR model. Use `INSTALL_QARI_OCR=1` when you want the strongest Arabic-book OCR and the worker has enough memory/GPU. Leave heavy options at `0` on free CPU Spaces unless a short benchmark proves the stronger model is worth the cold start, build time, memory, and runtime.

After the Space builds, verify it from your main repo:

```powershell
python scripts\\verify_worker.py https://your-space.hf.space --code 1234 --origin https://your-vercel-app.vercel.app --require-cors --smoke-upload --smoke-scanned --smoke-ocr-engine arabic
```
"""
    path.write_text(readme_text, encoding="utf-8")
|
|
|
|
|
def validate_export(output_dir: Path) -> list[str]:
    """Check an exported bundle and describe every problem found.

    Args:
        output_dir: Root folder of the bundle to inspect.

    Returns:
        ``"missing:<path>"`` for each required entry that does not exist,
        followed by ``"forbidden:<path>"`` for each entry that exists but must
        not be shipped. The list is empty when the bundle passes.
    """
    required = [
        # Bundle metadata and top-level files.
        "Dockerfile",
        "README.md",
        ".dockerignore",
        MANIFEST_NAME,
        "requirements.txt",
        "requirements-silma.txt",
        "requirements-supertonic.txt",
        "requirements-paddleocr.txt",
        "requirements-paddleocr-vl.txt",
        "requirements-qari-ocr.txt",
        "requirements-tawkeed-ocr.txt",
        "requirements-katib-ocr.txt",
        "requirements-arabic-qwen-ocr.txt",
        "requirements-arabic-glm-ocr.txt",
        "requirements-baseer-ocr.txt",
        ".export-complete",
        # Application entry points.
        "app/main.py",
        "api/index.py",
        "static/index.html",
        # Setup scripts.
        "scripts/setup_silma.sh",
        "scripts/setup_supertonic.sh",
        "scripts/setup_paddleocr.sh",
        "scripts/setup_paddleocr_vl.sh",
        "scripts/setup_qari_ocr.sh",
        "scripts/setup_tawkeed_ocr.sh",
        "scripts/setup_katib_ocr.sh",
        "scripts/setup_arabic_qwen_ocr.sh",
        "scripts/setup_arabic_glm_ocr.sh",
        "scripts/setup_baseer_ocr.sh",
        # OCR extraction scripts.
        "scripts/qari_ocr_extract.py",
        "scripts/tawkeed_ocr_extract.py",
        "scripts/katib_ocr_extract.py",
        "scripts/arabic_qwen_ocr_extract.py",
        "scripts/arabic_glm_ocr_extract.py",
        "scripts/baseer_ocr_extract.py",
        # Deployment and scoring scripts.
        "scripts/configure_vercel_worker.py",
        "scripts/deploy_hf_space.py",
        "scripts/finish_live_deployment.py",
        "scripts/prepare_live_deployment.py",
        "scripts/validate_deployment_env.py",
        "scripts/refresh_research_evidence.py",
        "scripts/score_voice_listening.py",
        "scripts/score_tts_preprocessor.py",
        # Documentation referenced by the README.
        "docs/recommended-free-stack.md",
        "docs/recommended-decision-card.md",
        "docs/recommended-decision-card.json",
    ]
    forbidden = [".env", "uploads", "outputs", "data", "test_pdfs", ".venv", ".venv-silma", ".venv-ocr"]

    problems: list[str] = []
    for relative in required:
        if not (output_dir / relative).exists():
            problems.append(f"missing:{relative}")
    for relative in forbidden:
        if (output_dir / relative).exists():
            problems.append(f"forbidden:{relative}")
    return problems
|
|
|
|
|
def main() -> None:
    """Command-line entry point: export the bundle and report the outcome.

    Parses ``--out``, ``--force`` and ``--json``, runs the export, validates
    the exported folder again and prints either the JSON result or a short
    summary. Exits with status 1 when validation reports issues.
    """
    parser = argparse.ArgumentParser(description="Export a clean Hugging Face Spaces Docker worker bundle.")
    parser.add_argument("--out", type=Path, default=DEFAULT_OUTPUT, help="Destination folder for the Space bundle.")
    parser.add_argument("--force", action="store_true", help="Replace the destination folder if it already exists.")
    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    options = parser.parse_args()

    result = export_hf_space(options.out, force=options.force)
    issues = validate_export(options.out)
    result.update(ready=not issues, issues=issues)

    if options.json:
        print(json.dumps(result, indent=2))
    else:
        report = [f"Exported Hugging Face Space bundle to {result['outputDir']}"]
        if issues:
            report.append("Issues:")
            report.extend(f"- {issue}" for issue in issues)
        else:
            report.append("Bundle is ready to push to a Docker Space.")
        print("\n".join(report))

    if issues:
        raise SystemExit(1)
|
|
|
|
|
# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|