# Source: Hugging Face Space "Syncre/arabic-audio-reader-worker" (status: Running).
# File size: 12,329 bytes.

from __future__ import annotations

import argparse
import hashlib
import json
import shutil
import uuid
from pathlib import Path
from typing import Iterable


# Repository root: the parent of the directory that contains this file.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Destination used when the caller does not choose one.
DEFAULT_OUTPUT = ROOT_DIR / "outputs" / "huggingface-space"

# Top-level requirement files copied into the bundle, in this order: the base
# file first, then one "requirements-<extra>.txt" per optional extra.
FILES = ["requirements.txt"] + [
    f"requirements-{extra}.txt"
    for extra in (
        "silma",
        "supertonic",
        "paddleocr",
        "paddleocr-vl",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
    )
]
# Directories copied recursively into the bundle.
DIRECTORIES = ["app", "api", "docs", "static", "scripts"]
# Entry names that are never copied, whether they are files or directories.
EXCLUDE_NAMES = {"__pycache__", ".pytest_cache", ".ruff_cache"}
# File suffixes that are never copied (compiled Python artifacts).
EXCLUDE_SUFFIXES = {".pyc", ".pyo", ".pyd"}
# File inside the bundle that records a SHA-256 digest per exported source.
MANIFEST_NAME = ".export-manifest.json"


def should_copy(path: Path) -> bool:
    """Report whether *path* is eligible for the exported bundle.

    Only the final path component is inspected: the path is rejected when its
    name is listed in ``EXCLUDE_NAMES`` or its suffix is listed in
    ``EXCLUDE_SUFFIXES``.
    """
    excluded_by_name = path.name in EXCLUDE_NAMES
    excluded_by_suffix = path.suffix in EXCLUDE_SUFFIXES
    return not (excluded_by_name or excluded_by_suffix)


def copy_tree(source: Path, destination: Path) -> None:
    """Replace *destination* with a filtered copy of the *source* tree.

    An existing *destination* is removed first. At every directory level,
    entries that ``should_copy`` rejects are left out of the copy.
    """

    def _skipped(_directory: str, names: list[str]) -> list[str]:
        # copytree asks for the names to leave out, so invert the predicate.
        return [entry for entry in names if not should_copy(Path(entry))]

    if destination.exists():
        shutil.rmtree(destination)
    shutil.copytree(source, destination, ignore=_skipped)


def iter_manifest_source_files(root: Path | None = None) -> Iterable[Path]:
    """Yield every source file that the export manifest should describe.

    The sequence is: each entry of ``FILES`` that exists under *root*, then
    ``Dockerfile.worker`` if it exists, then the files below each existing
    entry of ``DIRECTORIES`` in sorted order.

    Args:
        root: Directory to scan; falls back to ``ROOT_DIR`` when omitted.

    Yields:
        Paths of the selected source files, each located under *root*.
    """
    root = root or ROOT_DIR
    for relative in FILES:
        path = root / relative
        if path.exists():
            yield path
    dockerfile = root / "Dockerfile.worker"
    if dockerfile.exists():
        yield dockerfile
    for relative in DIRECTORIES:
        base = root / relative
        if not base.exists():
            continue
        for path in sorted(base.rglob("*")):
            if not path.is_file():
                continue
            # copy_tree drops an excluded directory together with everything
            # below it, so a file only reaches the bundle when every path
            # component under the copied directory passes should_copy.
            # Checking just the file name would list e.g.
            # ".pytest_cache/README.md" in the manifest although it is never
            # copied.
            components = path.relative_to(base).parts
            if all(should_copy(Path(component)) for component in components):
                yield path


def file_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces rather than loaded in one go.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks end of file.
                break
            hasher.update(block)
    return hasher.hexdigest()


def build_export_manifest(root: Path | None = None) -> dict[str, object]:
    """Describe the exportable sources under *root* as a manifest mapping.

    Args:
        root: Directory whose sources are hashed; ``ROOT_DIR`` when omitted.

    Returns:
        A dict with ``"version"``, ``"source"`` and ``"files"``, where
        ``"files"`` maps each POSIX-style path relative to *root* to the
        SHA-256 hex digest of that file.
    """
    base = root or ROOT_DIR
    digests: dict[str, str] = {}
    for source in iter_manifest_source_files(base):
        key = source.relative_to(base).as_posix()
        # The worker Dockerfile is recorded under the plain "Dockerfile" name.
        if key == "Dockerfile.worker":
            key = "Dockerfile"
        digests[key] = file_sha256(source)
    return {"version": 1, "source": "ArabicTranslator", "files": digests}


def build_hf_space_bundle(output_dir: Path) -> list[str]:
    """Assemble the Space bundle inside a freshly created *output_dir*.

    Steps, in order: copy every entry of ``FILES``, copy ``Dockerfile.worker``
    as ``Dockerfile``, copy every entry of ``DIRECTORIES``, then write
    ``.dockerignore``, ``README.md``, the export manifest and the
    ``.export-complete`` marker.

    Args:
        output_dir: Directory to create and fill. It is created with
            ``mkdir(parents=True)``, so an existing directory raises
            ``FileExistsError``.

    Returns:
        The bundle-relative names written, in the order they were produced.
    """
    output_dir.mkdir(parents=True)
    copied: list[str] = []

    for name in FILES:
        target = output_dir / name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(ROOT_DIR / name, target)
        copied.append(name)

    # The worker Dockerfile is published under the plain "Dockerfile" name.
    shutil.copy2(ROOT_DIR / "Dockerfile.worker", output_dir / "Dockerfile")
    copied.append("Dockerfile")

    for name in DIRECTORIES:
        copy_tree(ROOT_DIR / name, output_dir / name)
        copied.append(name)

    ignore_patterns = [
        ".git",
        ".env",
        ".venv",
        ".venv-*",
        "__pycache__",
        ".pytest_cache",
        "outputs",
        "uploads",
        "data",
        "test_pdfs",
        "tests",
        "*.pyc",
        "*.pyo",
        "*.pyd",
        "*.log",
    ]
    # One pattern per line, with a newline after the last pattern as well.
    dockerignore_text = "\n".join(ignore_patterns) + "\n"
    (output_dir / ".dockerignore").write_text(dockerignore_text, encoding="utf-8")
    copied.append(".dockerignore")

    write_space_readme(output_dir / "README.md")
    copied.append("README.md")

    manifest_text = json.dumps(build_export_manifest(), indent=2, sort_keys=True) + "\n"
    (output_dir / MANIFEST_NAME).write_text(manifest_text, encoding="utf-8")
    copied.append(MANIFEST_NAME)

    # Written last, after every other bundle file is in place.
    (output_dir / ".export-complete").write_text("ready\n", encoding="utf-8")
    copied.append(".export-complete")
    return copied


def export_hf_space(output_dir: Path = DEFAULT_OUTPUT, force: bool = False) -> dict[str, object]:
    """Build the Space bundle and swap it into *output_dir*.

    The bundle is assembled and validated in a hidden sibling staging
    directory and only then renamed into place, so a failed build does not
    leave a half-written *output_dir*. When *force* replaces an existing
    *output_dir*, the old directory is moved aside first and deleted only
    after the new bundle is in place.

    Args:
        output_dir: Final location of the bundle; resolved to an absolute path.
        force: Replace *output_dir* when it already exists.

    Returns:
        ``{"outputDir": <absolute path as str>, "copied": <names written>}``.

    Raises:
        FileExistsError: *output_dir* exists and *force* is false.
        ValueError: The staged bundle fails ``validate_export``.
    """
    output_dir = output_dir.resolve()
    if output_dir.exists() and not force:
        raise FileExistsError(f"{output_dir} already exists. Use --force to replace it.")

    parent = output_dir.parent
    parent.mkdir(parents=True, exist_ok=True)
    # Unique sibling names keep both renames on the same filesystem and avoid
    # clashes with leftovers from earlier runs.
    staging_dir = parent / f".{output_dir.name}.staging-{uuid.uuid4().hex}"
    backup_dir = parent / f".{output_dir.name}.previous-{uuid.uuid4().hex}"
    try:
        copied = build_hf_space_bundle(staging_dir)
        issues = validate_export(staging_dir)
        if issues:
            raise ValueError(f"Staged Hugging Face Space bundle is invalid: {', '.join(issues)}")
        if output_dir.exists():
            output_dir.rename(backup_dir)
        staging_dir.rename(output_dir)
    except BaseException:
        # BaseException on purpose: an interrupt between the two renames must
        # also roll back, otherwise the previous export would be left moved
        # aside with nothing in its place.
        shutil.rmtree(staging_dir, ignore_errors=True)
        if backup_dir.exists() and not output_dir.exists():
            # If this restore itself fails, the error propagates and the
            # backup stays on disk instead of being deleted.
            backup_dir.rename(output_dir)
        raise
    # Reached only after the new bundle is in place, so the previous export
    # is no longer needed.
    shutil.rmtree(backup_dir, ignore_errors=True)
    return {"outputDir": str(output_dir), "copied": copied}


def write_space_readme(path: Path) -> None:
    """Write the bundle's README, including its front-matter block, to *path*.

    The text is fixed and written as UTF-8; an existing file at *path* is
    overwritten.
    """
    readme_text = """---

title: Arabic Audio Reader Worker

colorFrom: green

colorTo: green

sdk: docker

app_port: 7860

---



# Arabic Audio Reader Worker



This is the Docker worker bundle for the Arabic PDF Reader.



## Hugging Face Space Settings



- SDK: Docker

- Hardware: free CPU is acceptable for demos, but cold starts and long books can be slow

- Free CPU Basic currently provides 2 vCPU, 16 GB RAM, and 50 GB non-persistent disk by default; treat generated audio as short-lived unless you add persistent/object storage

- Port: 7860

- Default build: installs SILMA, PaddleOCR Arabic, Tesseract Arabic, and eSpeak NG

- Optional fast CPU voice: set Docker build arg `INSTALL_SUPERTONIC=1` to add Supertonic 3 Arabic-capable local TTS

- Stronger OCR build: set Docker build arg `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` for Arabic-trained models, or `INSTALL_QARI_OCR=1` for the heavier Arabic-book model



Set these Space secrets:



```text

ACCESS_CODE=1234

SECRET_KEY=<generated by outputs\\deployment-handoff.md>

CORS_ORIGINS=https://your-vercel-app.vercel.app

COOKIE_SAMESITE=none

COOKIE_SECURE=1

OCR_ENGINE=tesseract
OCR_RENDER_ZOOM=2
TESSERACT_PSM=4
DEFAULT_VOICE_ID=silma-local

OUTPUT_RETENTION_DAYS=7

OUTPUT_MAX_FILES=25

AUDIO_FORMAT=mp3

MP3_BITRATE=96k

```



Generate the deployment handoff from the main repo to get the exact `SECRET_KEY`, worker secrets, Vercel environment variables, and final proof command:



```powershell

python scripts\\deployment_handoff.py https://your-space.hf.space --origin https://your-vercel-app.vercel.app --code 1234

```



Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.



The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.


Optional stronger-worker build args:



```text

INSTALL_QARI_OCR=1

INSTALL_TAWKEED_OCR=1

INSTALL_KATIB_OCR=1

INSTALL_ARABIC_QWEN_OCR=1

INSTALL_ARABIC_GLM_OCR=1

INSTALL_BASEER_OCR=1

INSTALL_PADDLEOCR_VL=1

INSTALL_SUPERTONIC=1

```



Use `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` first when you want an Arabic-trained OCR model. Use `INSTALL_QARI_OCR=1` when you want the strongest Arabic-book OCR and the worker has enough memory/GPU. Leave heavy options at `0` on free CPU Spaces unless a short benchmark proves the stronger model is worth the cold start, build time, memory, and runtime.



After the Space builds, verify it from your main repo:



```powershell

python scripts\\verify_worker.py https://your-space.hf.space --code 1234 --origin https://your-vercel-app.vercel.app --require-cors --smoke-upload --smoke-scanned --smoke-ocr-engine arabic

```

"""
    path.write_text(readme_text, encoding="utf-8")


def validate_export(output_dir: Path) -> list[str]:
    """Check a bundle directory for missing and forbidden entries.

    Args:
        output_dir: Bundle directory to inspect.

    Returns:
        One ``"missing:<path>"`` item per required path that does not exist,
        followed by one ``"forbidden:<name>"`` item per forbidden top-level
        entry that does exist. An empty list means no issue was found.
    """

    def _exists(relative: str) -> bool:
        return (output_dir / relative).exists()

    # Paths that must exist in the bundle.
    required = (
        "Dockerfile",
        "README.md",
        ".dockerignore",
        MANIFEST_NAME,
        "requirements.txt",
        "requirements-silma.txt",
        "requirements-supertonic.txt",
        "requirements-paddleocr.txt",
        "requirements-paddleocr-vl.txt",
        "requirements-qari-ocr.txt",
        "requirements-tawkeed-ocr.txt",
        "requirements-katib-ocr.txt",
        "requirements-arabic-qwen-ocr.txt",
        "requirements-arabic-glm-ocr.txt",
        "requirements-baseer-ocr.txt",
        ".export-complete",
        "app/main.py",
        "api/index.py",
        "static/index.html",
        "scripts/setup_silma.sh",
        "scripts/setup_supertonic.sh",
        "scripts/setup_paddleocr.sh",
        "scripts/setup_paddleocr_vl.sh",
        "scripts/setup_qari_ocr.sh",
        "scripts/setup_tawkeed_ocr.sh",
        "scripts/setup_katib_ocr.sh",
        "scripts/setup_arabic_qwen_ocr.sh",
        "scripts/setup_arabic_glm_ocr.sh",
        "scripts/setup_baseer_ocr.sh",
        "scripts/qari_ocr_extract.py",
        "scripts/tawkeed_ocr_extract.py",
        "scripts/katib_ocr_extract.py",
        "scripts/arabic_qwen_ocr_extract.py",
        "scripts/arabic_glm_ocr_extract.py",
        "scripts/baseer_ocr_extract.py",
        "scripts/configure_vercel_worker.py",
        "scripts/deploy_hf_space.py",
        "scripts/finish_live_deployment.py",
        "scripts/prepare_live_deployment.py",
        "scripts/validate_deployment_env.py",
        "scripts/refresh_research_evidence.py",
        "scripts/score_voice_listening.py",
        "scripts/score_tts_preprocessor.py",
        "docs/recommended-free-stack.md",
        "docs/recommended-decision-card.md",
        "docs/recommended-decision-card.json",
    )
    # Top-level entries that must not be present in the bundle.
    forbidden = (
        ".env",
        "uploads",
        "outputs",
        "data",
        "test_pdfs",
        ".venv",
        ".venv-silma",
        ".venv-ocr",
    )

    issues = [f"missing:{relative}" for relative in required if not _exists(relative)]
    issues.extend(f"forbidden:{relative}" for relative in forbidden if _exists(relative))
    return issues


def main() -> None:
    """Command-line entry point: export the bundle and report the outcome.

    Parses ``--out``, ``--force`` and ``--json``, runs the export, validates
    the destination again, and prints either a JSON document or a short
    summary. Exits with status 1 when validation reports issues.
    """
    parser = argparse.ArgumentParser(description="Export a clean Hugging Face Spaces Docker worker bundle.")
    parser.add_argument("--out", type=Path, default=DEFAULT_OUTPUT, help="Destination folder for the Space bundle.")
    parser.add_argument("--force", action="store_true", help="Replace the destination folder if it already exists.")
    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    options = parser.parse_args()

    result = export_hf_space(options.out, force=options.force)
    issues = validate_export(options.out)
    result["ready"] = not issues
    result["issues"] = issues

    if options.json:
        report = [json.dumps(result, indent=2)]
    else:
        report = [f"Exported Hugging Face Space bundle to {result['outputDir']}"]
        if issues:
            report.append("Issues:")
            report.extend(f"- {issue}" for issue in issues)
        else:
            report.append("Bundle is ready to push to a Docker Space.")
    for line in report:
        print(line)

    if issues:
        raise SystemExit(1)


# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()