# arabic-audio-reader-worker / scripts / export_hf_space.py
# Syncre's picture
# Deploy Arabic Audio Reader worker
# 6d5a99d verified
from __future__ import annotations
import argparse
import hashlib
import json
import shutil
import uuid
from pathlib import Path
from typing import Iterable
# Directory two levels above this script (the parent of the folder that
# contains it); every source path below is resolved against it.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Destination used when the caller does not pass an explicit output folder.
DEFAULT_OUTPUT = ROOT_DIR / "outputs" / "huggingface-space"

# Individual files, relative to ROOT_DIR, that are copied into the bundle
# and hashed into the export manifest.
FILES = [
    "requirements.txt",
    "requirements-silma.txt",
    "requirements-supertonic.txt",
    "requirements-paddleocr.txt",
    "requirements-paddleocr-vl.txt",
    "requirements-qari-ocr.txt",
    "requirements-tawkeed-ocr.txt",
    "requirements-katib-ocr.txt",
    "requirements-arabic-qwen-ocr.txt",
    "requirements-arabic-glm-ocr.txt",
    "requirements-baseer-ocr.txt",
]

# Directories, relative to ROOT_DIR, that are copied recursively.
DIRECTORIES = [
    "app",
    "api",
    "docs",
    "static",
    "scripts",
]

# File or directory names that should_copy() rejects.
EXCLUDE_NAMES = {
    "__pycache__",
    ".pytest_cache",
    ".ruff_cache",
}

# File suffixes that should_copy() rejects.
EXCLUDE_SUFFIXES = {
    ".pyc",
    ".pyo",
    ".pyd",
}

# Name of the JSON manifest written into the exported bundle.
MANIFEST_NAME = ".export-manifest.json"
def should_copy(path: Path) -> bool:
    """Report whether ``path`` belongs in the exported bundle.

    A path is rejected when its final component is listed in
    ``EXCLUDE_NAMES`` or when its suffix is listed in ``EXCLUDE_SUFFIXES``;
    anything else is accepted.
    """
    is_excluded = path.name in EXCLUDE_NAMES or path.suffix in EXCLUDE_SUFFIXES
    return not is_excluded
def copy_tree(source: Path, destination: Path) -> None:
    """Replace ``destination`` with a filtered copy of the ``source`` tree.

    Any existing ``destination`` is removed first. Entries rejected by
    ``should_copy`` are left out of the copy at every directory level.
    """

    def skip_unwanted(_directory: str, entries: list[str]) -> list[str]:
        # copytree asks for the names to leave out, i.e. those should_copy rejects.
        return [entry for entry in entries if not should_copy(Path(entry))]

    if destination.exists():
        shutil.rmtree(destination)
    shutil.copytree(source, destination, ignore=skip_unwanted)
def iter_manifest_source_files(root: Path | None = None) -> Iterable[Path]:
    """Yield every source file that ends up in the exported bundle.

    The order is: the entries of ``FILES`` that exist, then
    ``Dockerfile.worker`` if it exists, then the files found under each
    existing entry of ``DIRECTORIES`` in sorted path order.

    Args:
        root: Directory the relative names are resolved against. Defaults to
            ``ROOT_DIR``.

    Yields:
        Paths of the files to include in the manifest.
    """
    root = root or ROOT_DIR

    for relative in FILES:
        path = root / relative
        if path.exists():
            yield path

    dockerfile = root / "Dockerfile.worker"
    if dockerfile.exists():
        yield dockerfile

    for relative in DIRECTORIES:
        base = root / relative
        if not base.exists():
            continue
        for path in sorted(base.rglob("*")):
            if not path.is_file():
                continue
            # copy_tree() filters names at every directory level, so a whole
            # excluded directory (e.g. ``.pytest_cache``) is pruned together
            # with its contents. Checking only the file's own name would list
            # files in the manifest that never reach the bundle, so every
            # component below ``base`` has to pass should_copy().
            components = path.relative_to(base).parts
            if all(should_copy(Path(component)) for component in components):
                yield path
def file_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces, so it is never loaded into memory as a
    whole.
    """
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                # An empty read marks the end of the file.
                break
            hasher.update(block)
    return hasher.hexdigest()
def build_export_manifest(root: Path | None = None) -> dict[str, object]:
    """Describe the exported sources as a mapping of path to SHA-256 digest.

    Args:
        root: Directory the sources are read from. Defaults to ``ROOT_DIR``.

    Returns:
        A dictionary with the keys ``version``, ``source`` and ``files``,
        where ``files`` maps each POSIX-style relative path to the hex digest
        of that file.
    """
    root = root or ROOT_DIR
    digests: dict[str, str] = {}
    for source_file in iter_manifest_source_files(root):
        key = source_file.relative_to(root).as_posix()
        # The worker Dockerfile is recorded under the plain name "Dockerfile".
        digests["Dockerfile" if key == "Dockerfile.worker" else key] = file_sha256(source_file)
    return dict(version=1, source="ArabicTranslator", files=digests)
def build_hf_space_bundle(output_dir: Path) -> list[str]:
    """Assemble the Space bundle inside a freshly created ``output_dir``.

    The bundle consists of the entries of ``FILES``, ``Dockerfile.worker``
    (stored as ``Dockerfile``), filtered copies of ``DIRECTORIES``, and four
    generated files: ``.dockerignore``, ``README.md``, the export manifest and
    the ``.export-complete`` marker.

    Args:
        output_dir: Directory to create and fill. It must not exist yet,
            because it is created without ``exist_ok``.

    Returns:
        The bundle-relative names that were written, in creation order.
    """
    output_dir.mkdir(parents=True)
    bundled: list[str] = []

    # Individual files keep their relative location.
    for name in FILES:
        target = output_dir / name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(ROOT_DIR / name, target)
        bundled.append(name)

    # The worker Dockerfile is renamed to "Dockerfile" in the bundle.
    shutil.copy2(ROOT_DIR / "Dockerfile.worker", output_dir / "Dockerfile")
    bundled.append("Dockerfile")

    for name in DIRECTORIES:
        copy_tree(ROOT_DIR / name, output_dir / name)
        bundled.append(name)

    dockerignore_entries = (
        ".git",
        ".env",
        ".venv",
        ".venv-*",
        "__pycache__",
        ".pytest_cache",
        "outputs",
        "uploads",
        "data",
        "test_pdfs",
        "tests",
        "*.pyc",
        "*.pyo",
        "*.pyd",
        "*.log",
    )
    # One pattern per line, including a newline after the last one.
    dockerignore_text = "".join(f"{entry}\n" for entry in dockerignore_entries)
    (output_dir / ".dockerignore").write_text(dockerignore_text, encoding="utf-8")
    bundled.append(".dockerignore")

    write_space_readme(output_dir / "README.md")
    bundled.append("README.md")

    manifest_json = json.dumps(build_export_manifest(), indent=2, sort_keys=True)
    (output_dir / MANIFEST_NAME).write_text(manifest_json + "\n", encoding="utf-8")
    bundled.append(MANIFEST_NAME)

    # Written last, after every other bundle file is in place.
    (output_dir / ".export-complete").write_text("ready\n", encoding="utf-8")
    bundled.append(".export-complete")
    return bundled
def export_hf_space(output_dir: Path = DEFAULT_OUTPUT, force: bool = False) -> dict[str, object]:
    """Export the Space bundle to ``output_dir`` via a staging directory.

    The bundle is built and validated in a uniquely named sibling directory.
    Only then is an existing ``output_dir`` moved aside and the staged bundle
    renamed into place, so a failed build never touches the previous export.

    Args:
        output_dir: Final location of the bundle.
        force: Replace ``output_dir`` when it already exists.

    Returns:
        A dictionary with ``outputDir`` (the resolved destination as a
        string) and ``copied`` (the bundle-relative names that were written).

    Raises:
        FileExistsError: ``output_dir`` exists and ``force`` is false.
        ValueError: The staged bundle failed validation.
    """
    output_dir = output_dir.resolve()
    if output_dir.exists() and not force:
        raise FileExistsError(f"{output_dir} already exists. Use --force to replace it.")

    parent = output_dir.parent
    parent.mkdir(parents=True, exist_ok=True)
    staging_dir = parent / f".{output_dir.name}.staging-{uuid.uuid4().hex}"
    backup_dir = parent / f".{output_dir.name}.previous-{uuid.uuid4().hex}"

    try:
        copied = build_hf_space_bundle(staging_dir)
        issues = validate_export(staging_dir)
        if issues:
            raise ValueError(f"Staged Hugging Face Space bundle is invalid: {', '.join(issues)}")
        if output_dir.exists():
            output_dir.rename(backup_dir)
        staging_dir.rename(output_dir)
    except BaseException:
        # BaseException so an interrupt between the two renames is rolled
        # back as well; the exception is always re-raised.
        shutil.rmtree(staging_dir, ignore_errors=True)
        if backup_dir.exists():
            if output_dir.exists():
                # The destination is present, so the backup is redundant.
                shutil.rmtree(backup_dir, ignore_errors=True)
            else:
                # If this restore itself fails, the backup stays on disk so
                # the previous export remains recoverable by hand.
                backup_dir.rename(output_dir)
        raise

    # The swap succeeded; only now is it safe to discard the previous export.
    shutil.rmtree(backup_dir, ignore_errors=True)
    return {"outputDir": str(output_dir), "copied": copied}
def write_space_readme(path: Path) -> None:
    """Write the Space README, including its YAML front matter, to ``path``.

    The text is fixed: Space metadata, the settings and secrets to configure,
    the optional build arguments, and the commands used to verify the worker.
    The file is written as UTF-8 and replaces any existing content.
    """
    readme_text = """---
title: Arabic Audio Reader Worker
colorFrom: green
colorTo: green
sdk: docker
app_port: 7860
---
# Arabic Audio Reader Worker
This is the Docker worker bundle for the Arabic PDF Reader.
## Hugging Face Space Settings
- SDK: Docker
- Hardware: free CPU is acceptable for demos, but cold starts and long books can be slow
- Free CPU Basic currently provides 2 vCPU, 16 GB RAM, and 50 GB non-persistent disk by default; treat generated audio as short-lived unless you add persistent/object storage
- Port: 7860
- Default build: installs SILMA, PaddleOCR Arabic, Tesseract Arabic, and eSpeak NG
- Optional fast CPU voice: set Docker build arg `INSTALL_SUPERTONIC=1` to add Supertonic 3 Arabic-capable local TTS
- Stronger OCR build: set Docker build arg `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` for Arabic-trained models, or `INSTALL_QARI_OCR=1` for the heavier Arabic-book model
Set these Space secrets:
```text
ACCESS_CODE=1234
SECRET_KEY=<generated by outputs\\deployment-handoff.md>
CORS_ORIGINS=https://your-vercel-app.vercel.app
COOKIE_SAMESITE=none
COOKIE_SECURE=1
OCR_ENGINE=tesseract
OCR_RENDER_ZOOM=2
TESSERACT_PSM=4
DEFAULT_VOICE_ID=silma-local
OUTPUT_RETENTION_DAYS=7
OUTPUT_MAX_FILES=25
AUDIO_FORMAT=mp3
MP3_BITRATE=96k
```
Generate the deployment handoff from the main repo to get the exact `SECRET_KEY`, worker secrets, Vercel environment variables, and final proof command:
```powershell
python scripts\\deployment_handoff.py https://your-space.hf.space --origin https://your-vercel-app.vercel.app --code 1234
```
Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.
The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.
Optional stronger-worker build args:
```text
INSTALL_QARI_OCR=1
INSTALL_TAWKEED_OCR=1
INSTALL_KATIB_OCR=1
INSTALL_ARABIC_QWEN_OCR=1
INSTALL_ARABIC_GLM_OCR=1
INSTALL_BASEER_OCR=1
INSTALL_PADDLEOCR_VL=1
INSTALL_SUPERTONIC=1
```
Use `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` first when you want an Arabic-trained OCR model. Use `INSTALL_QARI_OCR=1` when you want the strongest Arabic-book OCR and the worker has enough memory/GPU. Leave heavy options at `0` on free CPU Spaces unless a short benchmark proves the stronger model is worth the cold start, build time, memory, and runtime.
After the Space builds, verify it from your main repo:
```powershell
python scripts\\verify_worker.py https://your-space.hf.space --code 1234 --origin https://your-vercel-app.vercel.app --require-cors --smoke-upload --smoke-scanned --smoke-ocr-engine arabic
```
"""
    path.write_text(readme_text, encoding="utf-8")
def validate_export(output_dir: Path) -> list[str]:
    """Check an exported bundle for missing and forbidden entries.

    Args:
        output_dir: Root directory of the bundle to inspect.

    Returns:
        One ``missing:<path>`` item per required path that does not exist,
        followed by one ``forbidden:<path>`` item per unwanted path that does
        exist. An empty list means the bundle passed.
    """
    expected_paths = (
        "Dockerfile",
        "README.md",
        ".dockerignore",
        MANIFEST_NAME,
        "requirements.txt",
        "requirements-silma.txt",
        "requirements-supertonic.txt",
        "requirements-paddleocr.txt",
        "requirements-paddleocr-vl.txt",
        "requirements-qari-ocr.txt",
        "requirements-tawkeed-ocr.txt",
        "requirements-katib-ocr.txt",
        "requirements-arabic-qwen-ocr.txt",
        "requirements-arabic-glm-ocr.txt",
        "requirements-baseer-ocr.txt",
        ".export-complete",
        "app/main.py",
        "api/index.py",
        "static/index.html",
        "scripts/setup_silma.sh",
        "scripts/setup_supertonic.sh",
        "scripts/setup_paddleocr.sh",
        "scripts/setup_paddleocr_vl.sh",
        "scripts/setup_qari_ocr.sh",
        "scripts/setup_tawkeed_ocr.sh",
        "scripts/setup_katib_ocr.sh",
        "scripts/setup_arabic_qwen_ocr.sh",
        "scripts/setup_arabic_glm_ocr.sh",
        "scripts/setup_baseer_ocr.sh",
        "scripts/qari_ocr_extract.py",
        "scripts/tawkeed_ocr_extract.py",
        "scripts/katib_ocr_extract.py",
        "scripts/arabic_qwen_ocr_extract.py",
        "scripts/arabic_glm_ocr_extract.py",
        "scripts/baseer_ocr_extract.py",
        "scripts/configure_vercel_worker.py",
        "scripts/deploy_hf_space.py",
        "scripts/finish_live_deployment.py",
        "scripts/prepare_live_deployment.py",
        "scripts/validate_deployment_env.py",
        "scripts/refresh_research_evidence.py",
        "scripts/score_voice_listening.py",
        "scripts/score_tts_preprocessor.py",
        "docs/recommended-free-stack.md",
        "docs/recommended-decision-card.md",
        "docs/recommended-decision-card.json",
    )
    banned_paths = (
        ".env",
        "uploads",
        "outputs",
        "data",
        "test_pdfs",
        ".venv",
        ".venv-silma",
        ".venv-ocr",
    )

    problems = [f"missing:{name}" for name in expected_paths if not (output_dir / name).exists()]
    problems.extend(f"forbidden:{name}" for name in banned_paths if (output_dir / name).exists())
    return problems
def main() -> None:
    """Run the exporter from the command line and report the outcome.

    Parses ``--out``, ``--force`` and ``--json``, exports the bundle,
    validates the destination, and prints either a JSON document or a short
    summary. Exits with status 1 when validation reports issues.
    """
    cli = argparse.ArgumentParser(description="Export a clean Hugging Face Spaces Docker worker bundle.")
    cli.add_argument("--out", type=Path, default=DEFAULT_OUTPUT, help="Destination folder for the Space bundle.")
    cli.add_argument("--force", action="store_true", help="Replace the destination folder if it already exists.")
    cli.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    options = cli.parse_args()

    report = export_hf_space(options.out, force=options.force)
    problems = validate_export(options.out)
    report["ready"] = not problems
    report["issues"] = problems

    if options.json:
        print(json.dumps(report, indent=2))
    elif problems:
        print(f"Exported Hugging Face Space bundle to {report['outputDir']}")
        print("Issues:")
        for problem in problems:
            print(f"- {problem}")
    else:
        print(f"Exported Hugging Face Space bundle to {report['outputDir']}")
        print("Bundle is ready to push to a Docker Space.")

    # A non-zero exit status signals validation issues in either output mode.
    if problems:
        raise SystemExit(1)
# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()