File size: 12,329 Bytes
088795a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d5a99d
 
 
088795a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d5a99d
088795a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
from __future__ import annotations

import argparse
import hashlib
import json
import shutil
import uuid
from pathlib import Path
from typing import Iterable


# Two levels above this file: the parent of the directory that contains this script.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Default destination of the exported bundle when no output path is given.
DEFAULT_OUTPUT = ROOT_DIR / "outputs" / "huggingface-space"

# Individual files, relative to ROOT_DIR, that are copied into the bundle one by one
# and hashed into the export manifest.
FILES = [
    "requirements.txt",
    "requirements-silma.txt",
    "requirements-supertonic.txt",
    "requirements-paddleocr.txt",
    "requirements-paddleocr-vl.txt",
    "requirements-qari-ocr.txt",
    "requirements-tawkeed-ocr.txt",
    "requirements-katib-ocr.txt",
    "requirements-arabic-qwen-ocr.txt",
    "requirements-arabic-glm-ocr.txt",
    "requirements-baseer-ocr.txt",
]
# Directories, relative to ROOT_DIR, that are copied recursively into the bundle.
DIRECTORIES = [
    "app",
    "api",
    "docs",
    "static",
    "scripts",
]
# Entry names that should_copy() rejects outright (compared against Path.name).
EXCLUDE_NAMES = {
    "__pycache__",
    ".pytest_cache",
    ".ruff_cache",
}
# File suffixes that should_copy() rejects (compared against Path.suffix).
EXCLUDE_SUFFIXES = {
    ".pyc",
    ".pyo",
    ".pyd",
}
# File name of the JSON manifest written into the root of the exported bundle.
MANIFEST_NAME = ".export-manifest.json"


def should_copy(path: Path) -> bool:
    """Report whether ``path`` is allowed into the exported bundle.

    Only the final component of ``path`` is inspected: the entry is rejected
    when its name is listed in ``EXCLUDE_NAMES`` or its suffix is listed in
    ``EXCLUDE_SUFFIXES``; everything else is accepted.
    """
    rejected_by_name = path.name in EXCLUDE_NAMES
    rejected_by_suffix = path.suffix in EXCLUDE_SUFFIXES
    return not (rejected_by_name or rejected_by_suffix)


def copy_tree(source: Path, destination: Path) -> None:
    """Replace ``destination`` with a filtered recursive copy of ``source``.

    An existing ``destination`` is removed first. While copying, every entry
    whose name fails ``should_copy`` is skipped (directories included, so
    their whole subtree is left out).
    """

    def skip_excluded(_directory: str, names: list[str]) -> list[str]:
        # shutil.copytree expects the names to ignore, i.e. the rejected ones.
        return [entry for entry in names if not should_copy(Path(entry))]

    if destination.exists():
        shutil.rmtree(destination)
    shutil.copytree(source, destination, ignore=skip_excluded)


def iter_manifest_source_files(root: Path | None = None) -> Iterable[Path]:
    """Yield every source file that the export manifest should describe.

    The order is: the entries of ``FILES`` that exist, then
    ``Dockerfile.worker`` if it exists, then the files below each existing
    entry of ``DIRECTORIES`` in sorted path order.

    Args:
        root: Directory the relative names are resolved against. Falls back
            to ``ROOT_DIR`` when omitted or falsy.

    Yields:
        Paths of existing source files located under ``root``.
    """
    root = root or ROOT_DIR
    for relative in FILES:
        path = root / relative
        if path.exists():
            yield path
    dockerfile = root / "Dockerfile.worker"
    if dockerfile.exists():
        yield dockerfile
    for relative in DIRECTORIES:
        base = root / relative
        if not base.exists():
            continue
        for path in sorted(base.rglob("*")):
            if not path.is_file():
                continue
            # copy_tree() prunes excluded names at every directory level, so a
            # file is only exported when each component below ``base`` passes
            # should_copy(). Checking the file name alone would list files that
            # live inside an excluded directory (e.g. ".pytest_cache/README.md")
            # even though they never reach the bundle.
            components = path.relative_to(base).parts
            if all(should_copy(Path(component)) for component in components):
                yield path


def file_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and consumed in 1 MiB reads, so the
    whole content is never held in memory at once.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks the end of the file.
                break
            hasher.update(block)
    return hasher.hexdigest()


def build_export_manifest(root: Path | None = None) -> dict[str, object]:
    """Build the manifest mapping exported file names to SHA-256 digests.

    Args:
        root: Directory whose sources are hashed. Falls back to ``ROOT_DIR``
            when omitted or falsy.

    Returns:
        A dict with ``"version"``, ``"source"`` and ``"files"``; the latter
        maps POSIX-style paths relative to ``root`` to hex digests.
    """
    root = root or ROOT_DIR
    checksums: dict[str, str] = {}
    for source_path in iter_manifest_source_files(root):
        key = source_path.relative_to(root).as_posix()
        # The worker Dockerfile is recorded under the plain name "Dockerfile".
        checksums["Dockerfile" if key == "Dockerfile.worker" else key] = file_sha256(source_path)
    manifest: dict[str, object] = {"version": 1, "source": "ArabicTranslator", "files": checksums}
    return manifest


def build_hf_space_bundle(output_dir: Path) -> list[str]:
    """Assemble the Space bundle inside a freshly created ``output_dir``.

    The directory is created with ``mkdir(parents=True)`` and therefore must
    not exist yet. The bundle receives, in this order: every entry of
    ``FILES``, ``Dockerfile.worker`` renamed to ``Dockerfile``, every entry of
    ``DIRECTORIES`` (filtered through ``copy_tree``), a generated
    ``.dockerignore``, the generated ``README.md``, the export manifest and a
    ``.export-complete`` marker file.

    Args:
        output_dir: Destination directory to create and populate.

    Returns:
        The bundle-relative names of everything written, in creation order.
    """
    output_dir.mkdir(parents=True)
    copied: list[str] = []

    for name in FILES:
        target = output_dir / name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(ROOT_DIR / name, target)
        copied.append(name)

    shutil.copy2(ROOT_DIR / "Dockerfile.worker", output_dir / "Dockerfile")
    copied.append("Dockerfile")

    for name in DIRECTORIES:
        copy_tree(ROOT_DIR / name, output_dir / name)
        copied.append(name)

    ignore_patterns = (
        ".git",
        ".env",
        ".venv",
        ".venv-*",
        "__pycache__",
        ".pytest_cache",
        "outputs",
        "uploads",
        "data",
        "test_pdfs",
        "tests",
        "*.pyc",
        "*.pyo",
        "*.pyd",
        "*.log",
    )
    # One pattern per line, including a newline after the last pattern.
    dockerignore_text = "".join(f"{pattern}\n" for pattern in ignore_patterns)
    (output_dir / ".dockerignore").write_text(dockerignore_text, encoding="utf-8")
    copied.append(".dockerignore")

    write_space_readme(output_dir / "README.md")
    copied.append("README.md")

    # The manifest is computed from the sources under ROOT_DIR (default root).
    manifest_text = json.dumps(build_export_manifest(), indent=2, sort_keys=True) + "\n"
    (output_dir / MANIFEST_NAME).write_text(manifest_text, encoding="utf-8")
    copied.append(MANIFEST_NAME)

    (output_dir / ".export-complete").write_text("ready\n", encoding="utf-8")
    copied.append(".export-complete")
    return copied


def export_hf_space(output_dir: Path = DEFAULT_OUTPUT, force: bool = False) -> dict[str, object]:
    """Export the Space bundle to ``output_dir`` via a staging directory.

    The bundle is built and validated in a hidden sibling staging directory.
    Only after validation succeeds is an existing ``output_dir`` moved aside
    to a hidden sibling backup directory and the staging directory renamed
    into place.

    Args:
        output_dir: Final destination of the bundle; resolved to an absolute
            path before use.
        force: Allow replacing an ``output_dir`` that already exists.

    Returns:
        A dict with ``"outputDir"`` (the resolved destination as a string) and
        ``"copied"`` (the names reported by ``build_hf_space_bundle``).

    Raises:
        FileExistsError: ``output_dir`` exists and ``force`` is false.
        ValueError: The staged bundle fails ``validate_export``.
    """
    output_dir = output_dir.resolve()
    if output_dir.exists() and not force:
        raise FileExistsError(f"{output_dir} already exists. Use --force to replace it.")

    parent = output_dir.parent
    parent.mkdir(parents=True, exist_ok=True)
    staging_dir = parent / f".{output_dir.name}.staging-{uuid.uuid4().hex}"
    backup_dir = parent / f".{output_dir.name}.previous-{uuid.uuid4().hex}"
    try:
        copied = build_hf_space_bundle(staging_dir)
        issues = validate_export(staging_dir)
        if issues:
            raise ValueError(f"Staged Hugging Face Space bundle is invalid: {', '.join(issues)}")
        if output_dir.exists():
            output_dir.rename(backup_dir)
        staging_dir.rename(output_dir)
    except BaseException:
        # Roll back on every interruption (including KeyboardInterrupt), so a
        # half-finished swap never leaves the destination missing.
        shutil.rmtree(staging_dir, ignore_errors=True)
        if backup_dir.exists() and not output_dir.exists():
            backup_dir.rename(output_dir)
        raise
    finally:
        # Until output_dir exists again the backup is the only copy of the
        # previous bundle; keep it if the swap or the rollback did not finish.
        if output_dir.exists():
            shutil.rmtree(backup_dir, ignore_errors=True)
    return {"outputDir": str(output_dir), "copied": copied}


def write_space_readme(path: Path) -> None:
    """Write the bundle's ``README.md`` to ``path`` as UTF-8.

    The text starts with the ``---`` delimited metadata block (title, colors,
    ``sdk: docker``, ``app_port: 7860``) and continues with the Markdown
    instructions for configuring, building and verifying the worker Space.
    Any existing file at ``path`` is overwritten.
    """
    readme_text = """---

title: Arabic Audio Reader Worker

colorFrom: green

colorTo: green

sdk: docker

app_port: 7860

---



# Arabic Audio Reader Worker



This is the Docker worker bundle for the Arabic PDF Reader.



## Hugging Face Space Settings



- SDK: Docker

- Hardware: free CPU is acceptable for demos, but cold starts and long books can be slow

- Free CPU Basic currently provides 2 vCPU, 16 GB RAM, and 50 GB non-persistent disk by default; treat generated audio as short-lived unless you add persistent/object storage

- Port: 7860

- Default build: installs SILMA, PaddleOCR Arabic, Tesseract Arabic, and eSpeak NG

- Optional fast CPU voice: set Docker build arg `INSTALL_SUPERTONIC=1` to add Supertonic 3 Arabic-capable local TTS

- Stronger OCR build: set Docker build arg `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` for Arabic-trained models, or `INSTALL_QARI_OCR=1` for the heavier Arabic-book model



Set these Space secrets:



```text

ACCESS_CODE=1234

SECRET_KEY=<generated by outputs\\deployment-handoff.md>

CORS_ORIGINS=https://your-vercel-app.vercel.app

COOKIE_SAMESITE=none

COOKIE_SECURE=1

OCR_ENGINE=tesseract
OCR_RENDER_ZOOM=2
TESSERACT_PSM=4
DEFAULT_VOICE_ID=silma-local

OUTPUT_RETENTION_DAYS=7

OUTPUT_MAX_FILES=25

AUDIO_FORMAT=mp3

MP3_BITRATE=96k

```



Generate the deployment handoff from the main repo to get the exact `SECRET_KEY`, worker secrets, Vercel environment variables, and final proof command:



```powershell

python scripts\\deployment_handoff.py https://your-space.hf.space --origin https://your-vercel-app.vercel.app --code 1234

```



Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.



The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.


Optional stronger-worker build args:



```text

INSTALL_QARI_OCR=1

INSTALL_TAWKEED_OCR=1

INSTALL_KATIB_OCR=1

INSTALL_ARABIC_QWEN_OCR=1

INSTALL_ARABIC_GLM_OCR=1

INSTALL_BASEER_OCR=1

INSTALL_PADDLEOCR_VL=1

INSTALL_SUPERTONIC=1

```



Use `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` first when you want an Arabic-trained OCR model. Use `INSTALL_QARI_OCR=1` when you want the strongest Arabic-book OCR and the worker has enough memory/GPU. Leave heavy options at `0` on free CPU Spaces unless a short benchmark proves the stronger model is worth the cold start, build time, memory, and runtime.



After the Space builds, verify it from your main repo:



```powershell

python scripts\\verify_worker.py https://your-space.hf.space --code 1234 --origin https://your-vercel-app.vercel.app --require-cors --smoke-upload --smoke-scanned --smoke-ocr-engine arabic

```

"""
    path.write_text(readme_text, encoding="utf-8")


def validate_export(output_dir: Path) -> list[str]:
    """List the problems found in an exported bundle directory.

    Args:
        output_dir: Root of the bundle to inspect.

    Returns:
        ``"missing:<path>"`` for every required path that does not exist,
        followed by ``"forbidden:<path>"`` for every forbidden path that does
        exist. An empty list means no problem was found.
    """
    required_paths = (
        "Dockerfile",
        "README.md",
        ".dockerignore",
        MANIFEST_NAME,
        "requirements.txt",
        "requirements-silma.txt",
        "requirements-supertonic.txt",
        "requirements-paddleocr.txt",
        "requirements-paddleocr-vl.txt",
        "requirements-qari-ocr.txt",
        "requirements-tawkeed-ocr.txt",
        "requirements-katib-ocr.txt",
        "requirements-arabic-qwen-ocr.txt",
        "requirements-arabic-glm-ocr.txt",
        "requirements-baseer-ocr.txt",
        ".export-complete",
        "app/main.py",
        "api/index.py",
        "static/index.html",
        "scripts/setup_silma.sh",
        "scripts/setup_supertonic.sh",
        "scripts/setup_paddleocr.sh",
        "scripts/setup_paddleocr_vl.sh",
        "scripts/setup_qari_ocr.sh",
        "scripts/setup_tawkeed_ocr.sh",
        "scripts/setup_katib_ocr.sh",
        "scripts/setup_arabic_qwen_ocr.sh",
        "scripts/setup_arabic_glm_ocr.sh",
        "scripts/setup_baseer_ocr.sh",
        "scripts/qari_ocr_extract.py",
        "scripts/tawkeed_ocr_extract.py",
        "scripts/katib_ocr_extract.py",
        "scripts/arabic_qwen_ocr_extract.py",
        "scripts/arabic_glm_ocr_extract.py",
        "scripts/baseer_ocr_extract.py",
        "scripts/configure_vercel_worker.py",
        "scripts/deploy_hf_space.py",
        "scripts/finish_live_deployment.py",
        "scripts/prepare_live_deployment.py",
        "scripts/validate_deployment_env.py",
        "scripts/refresh_research_evidence.py",
        "scripts/score_voice_listening.py",
        "scripts/score_tts_preprocessor.py",
        "docs/recommended-free-stack.md",
        "docs/recommended-decision-card.md",
        "docs/recommended-decision-card.json",
    )
    forbidden_paths = (
        ".env",
        "uploads",
        "outputs",
        "data",
        "test_pdfs",
        ".venv",
        ".venv-silma",
        ".venv-ocr",
    )

    def present(relative: str) -> bool:
        return (output_dir / relative).exists()

    # Missing entries are reported first, then forbidden ones, each in list order.
    issues = [f"missing:{relative}" for relative in required_paths if not present(relative)]
    issues.extend(f"forbidden:{relative}" for relative in forbidden_paths if present(relative))
    return issues


def main() -> None:
    """Command-line entry point: export the bundle and report the outcome.

    Parses ``--out``, ``--force`` and ``--json``, runs ``export_hf_space``,
    validates the destination again and prints either a JSON document or a
    short text summary. Exits with status 1 when validation reports issues.
    """
    parser = argparse.ArgumentParser(description="Export a clean Hugging Face Spaces Docker worker bundle.")
    parser.add_argument("--out", type=Path, default=DEFAULT_OUTPUT, help="Destination folder for the Space bundle.")
    parser.add_argument("--force", action="store_true", help="Replace the destination folder if it already exists.")
    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    options = parser.parse_args()

    report = export_hf_space(options.out, force=options.force)
    problems = validate_export(options.out)
    report["ready"] = not problems
    report["issues"] = problems

    if options.json:
        print(json.dumps(report, indent=2))
    else:
        summary = [f"Exported Hugging Face Space bundle to {report['outputDir']}"]
        if problems:
            summary.append("Issues:")
            summary.extend(f"- {problem}" for problem in problems)
        else:
            summary.append("Bundle is ready to push to a Docker Space.")
        print("\n".join(summary))

    if problems:
        raise SystemExit(1)


# Run the exporter only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()