Spaces:

Syncre
/

arabic-audio-reader-worker

Running

App Files Files Community

arabic-audio-reader-worker / scripts /export_hf_space.py

Syncre

Deploy Arabic Audio Reader worker

6d5a99d verified about 5 hours ago

raw

history blame contribute delete

12.3 kB

	from __future__ import annotations

	import argparse
	import hashlib
	import json
	import shutil
	import uuid
	from pathlib import Path
	from typing import Iterable


	# Repository root: this script is expected to live one directory below it.
	ROOT_DIR = Path(__file__).resolve().parent.parent
	# Default destination of the exported bundle.
	DEFAULT_OUTPUT = ROOT_DIR.joinpath("outputs", "huggingface-space")

	# Root-level requirement files copied into the bundle: the base file first,
	# followed by one file per optional component, in this exact order.
	FILES = [
	    "requirements.txt",
	    *(
	        f"requirements-{variant}.txt"
	        for variant in (
	            "silma",
	            "supertonic",
	            "paddleocr",
	            "paddleocr-vl",
	            "qari-ocr",
	            "tawkeed-ocr",
	            "katib-ocr",
	            "arabic-qwen-ocr",
	            "arabic-glm-ocr",
	            "baseer-ocr",
	        )
	    ),
	]
	# Top-level directories that are copied recursively.
	DIRECTORIES = ["app", "api", "docs", "static", "scripts"]
	# Entry names that are never copied (checked against a single path component).
	EXCLUDE_NAMES = {"__pycache__", ".pytest_cache", ".ruff_cache"}
	# File suffixes that are never copied.
	EXCLUDE_SUFFIXES = {".pyc", ".pyo", ".pyd"}
	# File name of the JSON manifest written into the bundle.
	MANIFEST_NAME = ".export-manifest.json"


	def should_copy(path: Path) -> bool:
	    """Report whether ``path`` passes the export filters.

	    Only the final component is inspected: its name must not be listed in
	    ``EXCLUDE_NAMES`` and its suffix must not be listed in
	    ``EXCLUDE_SUFFIXES``. Parent directories are not examined here.
	    """
	    excluded_by_name = path.name in EXCLUDE_NAMES
	    excluded_by_suffix = path.suffix in EXCLUDE_SUFFIXES
	    return not (excluded_by_name or excluded_by_suffix)


	def copy_tree(source: Path, destination: Path) -> None:
	    """Replace ``destination`` with a filtered copy of the ``source`` tree.

	    An existing ``destination`` is removed first. While copying, every entry
	    (file or directory) whose name fails ``should_copy`` is skipped, so an
	    excluded directory is pruned together with everything beneath it.
	    """

	    def _rejected(_directory: str, names: list[str]) -> list[str]:
	        # shutil.copytree expects the names to skip in the visited directory.
	        return [entry for entry in names if not should_copy(Path(entry))]

	    if destination.exists():
	        shutil.rmtree(destination)
	    shutil.copytree(source, destination, ignore=_rejected)


	def iter_manifest_source_files(root: Path | None = None) -> Iterable[Path]:
	    """Yield the source files that belong in the export manifest.

	    Order: the entries of ``FILES`` that exist under ``root``, then
	    ``Dockerfile.worker`` if present, then every regular file below each
	    existing ``DIRECTORIES`` entry (sorted per directory).

	    Args:
	        root: Project root to scan; falls back to ``ROOT_DIR`` when omitted.

	    Yields:
	        Paths located under ``root``.
	    """
	    root = root or ROOT_DIR
	    for relative in FILES:
	        path = root / relative
	        if path.exists():
	            yield path
	    dockerfile = root / "Dockerfile.worker"
	    if dockerfile.exists():
	        yield dockerfile
	    for relative in DIRECTORIES:
	        base = root / relative
	        if not base.exists():
	            continue
	        for path in sorted(base.rglob("*")):
	            if not path.is_file():
	                continue
	            # copy_tree filters names at every directory level, so an excluded
	            # directory (e.g. .pytest_cache) is pruned with all its contents.
	            # Apply the same per-component filter here; otherwise the manifest
	            # would list files that never reach the bundle.
	            components = path.relative_to(base).parts
	            if all(should_copy(Path(component)) for component in components):
	                yield path


	def file_sha256(path: Path) -> str:
	    """Return the hexadecimal SHA-256 digest of the file at ``path``.

	    The file is read in binary mode, 1 MiB at a time, so it is never held in
	    memory as a whole.
	    """
	    block_size = 1024 * 1024
	    hasher = hashlib.sha256()
	    with path.open("rb") as stream:
	        while True:
	            block = stream.read(block_size)
	            if block == b"":
	                break
	            hasher.update(block)
	    return hasher.hexdigest()


	def build_export_manifest(root: Path | None = None) -> dict[str, object]:
	    """Build the manifest that maps exported files to their SHA-256 digests.

	    Args:
	        root: Project root to scan; falls back to ``ROOT_DIR`` when omitted.

	    Returns:
	        A dict with ``"version"``, ``"source"`` and ``"files"``. ``"files"``
	        maps POSIX-style paths relative to ``root`` to hex digests.
	    """
	    root = root or ROOT_DIR
	    files: dict[str, str] = {}
	    for path in iter_manifest_source_files(root):
	        relative = path.relative_to(root).as_posix()
	        # The bundle ships Dockerfile.worker under the name "Dockerfile", so
	        # the manifest records it under the exported name.
	        if relative == "Dockerfile.worker":
	            relative = "Dockerfile"
	        files[relative] = file_sha256(path)
	    return {
	        "version": 1,
	        "source": "ArabicTranslator",
	        "files": files,
	    }


	def build_hf_space_bundle(output_dir: Path) -> list[str]:
	    """Assemble the Space bundle inside a newly created ``output_dir``.

	    Written in this order: the ``FILES`` entries, ``Dockerfile`` (copied from
	    ``Dockerfile.worker``), the ``DIRECTORIES`` trees, ``.dockerignore``,
	    ``README.md``, the export manifest and the ``.export-complete`` marker.

	    Returns:
	        The bundle-relative names of everything written, in write order.
	    """
	    # No exist_ok here: the directory must not exist yet.
	    output_dir.mkdir(parents=True)
	    bundled: list[str] = []

	    for name in FILES:
	        target = output_dir / name
	        target.parent.mkdir(parents=True, exist_ok=True)
	        shutil.copy2(ROOT_DIR / name, target)
	        bundled.append(name)

	    shutil.copy2(ROOT_DIR / "Dockerfile.worker", output_dir / "Dockerfile")
	    bundled.append("Dockerfile")

	    for name in DIRECTORIES:
	        copy_tree(ROOT_DIR / name, output_dir / name)
	        bundled.append(name)

	    dockerignore_patterns = (
	        ".git",
	        ".env",
	        ".venv",
	        ".venv-*",
	        "__pycache__",
	        ".pytest_cache",
	        "outputs",
	        "uploads",
	        "data",
	        "test_pdfs",
	        "tests",
	        "*.pyc",
	        "*.pyo",
	        "*.pyd",
	        "*.log",
	    )
	    # One pattern per line, terminated by a final newline.
	    dockerignore_text = "\n".join(dockerignore_patterns) + "\n"
	    (output_dir / ".dockerignore").write_text(dockerignore_text, encoding="utf-8")
	    bundled.append(".dockerignore")

	    write_space_readme(output_dir / "README.md")
	    bundled.append("README.md")

	    manifest_text = json.dumps(build_export_manifest(), indent=2, sort_keys=True) + "\n"
	    (output_dir / MANIFEST_NAME).write_text(manifest_text, encoding="utf-8")
	    bundled.append(MANIFEST_NAME)

	    # Marker file written last, after every other bundle file is in place.
	    (output_dir / ".export-complete").write_text("ready\n", encoding="utf-8")
	    bundled.append(".export-complete")
	    return bundled


	def export_hf_space(output_dir: Path = DEFAULT_OUTPUT, force: bool = False) -> dict[str, object]:
	    """Export the Space bundle to ``output_dir`` via a staging directory.

	    The bundle is built and validated in a uniquely named sibling staging
	    directory. Only then is an existing ``output_dir`` moved aside and the
	    staging directory renamed into place.

	    Args:
	        output_dir: Destination folder for the bundle.
	        force: Replace ``output_dir`` if it already exists.

	    Returns:
	        ``{"outputDir": <resolved path as str>, "copied": <bundle entries>}``.

	    Raises:
	        FileExistsError: ``output_dir`` exists and ``force`` is false.
	        ValueError: The staged bundle failed ``validate_export``.
	    """
	    output_dir = output_dir.resolve()
	    if output_dir.exists() and not force:
	        raise FileExistsError(f"{output_dir} already exists. Use --force to replace it.")

	    parent = output_dir.parent
	    parent.mkdir(parents=True, exist_ok=True)
	    staging_dir = parent / f".{output_dir.name}.staging-{uuid.uuid4().hex}"
	    backup_dir = parent / f".{output_dir.name}.previous-{uuid.uuid4().hex}"
	    copied: list[str] = []
	    try:
	        copied = build_hf_space_bundle(staging_dir)
	        issues = validate_export(staging_dir)
	        if issues:
	            raise ValueError(f"Staged Hugging Face Space bundle is invalid: {', '.join(issues)}")
	        if output_dir.exists():
	            output_dir.rename(backup_dir)
	        staging_dir.rename(output_dir)
	    except BaseException:
	        # BaseException rather than Exception: an interrupt (e.g. Ctrl+C)
	        # between the two renames must also roll back, otherwise the previous
	        # export would be left only in the backup directory.
	        shutil.rmtree(staging_dir, ignore_errors=True)
	        if backup_dir.exists() and not output_dir.exists():
	            backup_dir.rename(output_dir)
	        raise
	    finally:
	        # Drop the backup only when an output directory is in place. If the
	        # rollback rename failed, the backup is the sole remaining copy of the
	        # previous export and must be kept.
	        if output_dir.exists():
	            shutil.rmtree(backup_dir, ignore_errors=True)
	    return {"outputDir": str(output_dir), "copied": copied}


	def write_space_readme(path: Path) -> None:
	    """Write the Space README (front matter plus setup notes) to ``path``.

	    The text is fixed and written as UTF-8; an existing file is overwritten.
	    """
	    readme_text = """---
	title: Arabic Audio Reader Worker
	colorFrom: green
	colorTo: green
	sdk: docker
	app_port: 7860
	---

	# Arabic Audio Reader Worker

	This is the Docker worker bundle for the Arabic PDF Reader.

	## Hugging Face Space Settings

	- SDK: Docker
	- Hardware: free CPU is acceptable for demos, but cold starts and long books can be slow
	- Free CPU Basic currently provides 2 vCPU, 16 GB RAM, and 50 GB non-persistent disk by default; treat generated audio as short-lived unless you add persistent/object storage
	- Port: 7860
	- Default build: installs SILMA, PaddleOCR Arabic, Tesseract Arabic, and eSpeak NG
	- Optional fast CPU voice: set Docker build arg `INSTALL_SUPERTONIC=1` to add Supertonic 3 Arabic-capable local TTS
	- Stronger OCR build: set Docker build arg `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` for Arabic-trained models, or `INSTALL_QARI_OCR=1` for the heavier Arabic-book model

	Set these Space secrets:

	```text
	ACCESS_CODE=1234
	SECRET_KEY=<generated by outputs\\deployment-handoff.md>
	CORS_ORIGINS=https://your-vercel-app.vercel.app
	COOKIE_SAMESITE=none
	COOKIE_SECURE=1
	OCR_ENGINE=tesseract
	OCR_RENDER_ZOOM=2
	TESSERACT_PSM=4
	DEFAULT_VOICE_ID=silma-local
	OUTPUT_RETENTION_DAYS=7
	OUTPUT_MAX_FILES=25
	AUDIO_FORMAT=mp3
	MP3_BITRATE=96k
	```

	Generate the deployment handoff from the main repo to get the exact `SECRET_KEY`, worker secrets, Vercel environment variables, and final proof command:

	```powershell
	python scripts\\deployment_handoff.py https://your-space.hf.space --origin https://your-vercel-app.vercel.app --code 1234
	```

	Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.

	The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.

	Optional stronger-worker build args:

	```text
	INSTALL_QARI_OCR=1
	INSTALL_TAWKEED_OCR=1
	INSTALL_KATIB_OCR=1
	INSTALL_ARABIC_QWEN_OCR=1
	INSTALL_ARABIC_GLM_OCR=1
	INSTALL_BASEER_OCR=1
	INSTALL_PADDLEOCR_VL=1
	INSTALL_SUPERTONIC=1
	```

	Use `INSTALL_TAWKEED_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_ARABIC_GLM_OCR=1`, or `INSTALL_BASEER_OCR=1` first when you want an Arabic-trained OCR model. Use `INSTALL_QARI_OCR=1` when you want the strongest Arabic-book OCR and the worker has enough memory/GPU. Leave heavy options at `0` on free CPU Spaces unless a short benchmark proves the stronger model is worth the cold start, build time, memory, and runtime.

	After the Space builds, verify it from your main repo:

	```powershell
	python scripts\\verify_worker.py https://your-space.hf.space --code 1234 --origin https://your-vercel-app.vercel.app --require-cors --smoke-upload --smoke-scanned --smoke-ocr-engine arabic
	```
	"""
	    path.write_text(readme_text, encoding="utf-8")


	def validate_export(output_dir: Path) -> list[str]:
	    """Check an exported bundle and return the problems found.

	    Returns:
	        ``"missing:<path>"`` for every required path absent from
	        ``output_dir``, followed by ``"forbidden:<path>"`` for every path that
	        must not be present but is. An empty list means the bundle passed.
	    """
	    required_paths = (
	        "Dockerfile",
	        "README.md",
	        ".dockerignore",
	        MANIFEST_NAME,
	        "requirements.txt",
	        "requirements-silma.txt",
	        "requirements-supertonic.txt",
	        "requirements-paddleocr.txt",
	        "requirements-paddleocr-vl.txt",
	        "requirements-qari-ocr.txt",
	        "requirements-tawkeed-ocr.txt",
	        "requirements-katib-ocr.txt",
	        "requirements-arabic-qwen-ocr.txt",
	        "requirements-arabic-glm-ocr.txt",
	        "requirements-baseer-ocr.txt",
	        ".export-complete",
	        "app/main.py",
	        "api/index.py",
	        "static/index.html",
	        "scripts/setup_silma.sh",
	        "scripts/setup_supertonic.sh",
	        "scripts/setup_paddleocr.sh",
	        "scripts/setup_paddleocr_vl.sh",
	        "scripts/setup_qari_ocr.sh",
	        "scripts/setup_tawkeed_ocr.sh",
	        "scripts/setup_katib_ocr.sh",
	        "scripts/setup_arabic_qwen_ocr.sh",
	        "scripts/setup_arabic_glm_ocr.sh",
	        "scripts/setup_baseer_ocr.sh",
	        "scripts/qari_ocr_extract.py",
	        "scripts/tawkeed_ocr_extract.py",
	        "scripts/katib_ocr_extract.py",
	        "scripts/arabic_qwen_ocr_extract.py",
	        "scripts/arabic_glm_ocr_extract.py",
	        "scripts/baseer_ocr_extract.py",
	        "scripts/configure_vercel_worker.py",
	        "scripts/deploy_hf_space.py",
	        "scripts/finish_live_deployment.py",
	        "scripts/prepare_live_deployment.py",
	        "scripts/validate_deployment_env.py",
	        "scripts/refresh_research_evidence.py",
	        "scripts/score_voice_listening.py",
	        "scripts/score_tts_preprocessor.py",
	        "docs/recommended-free-stack.md",
	        "docs/recommended-decision-card.md",
	        "docs/recommended-decision-card.json",
	    )
	    forbidden_paths = (
	        ".env",
	        "uploads",
	        "outputs",
	        "data",
	        "test_pdfs",
	        ".venv",
	        ".venv-silma",
	        ".venv-ocr",
	    )
	    missing_issues = (
	        f"missing:{entry}" for entry in required_paths if not (output_dir / entry).exists()
	    )
	    forbidden_issues = (
	        f"forbidden:{entry}" for entry in forbidden_paths if (output_dir / entry).exists()
	    )
	    # Missing entries are reported first, then forbidden ones.
	    return [*missing_issues, *forbidden_issues]


	def main() -> None:
	    """Command-line entry point: export the bundle and report the outcome.

	    Prints either a JSON document (``--json``) or a short summary, and exits
	    with status 1 when the exported bundle still has validation issues.
	    """
	    parser = argparse.ArgumentParser(description="Export a clean Hugging Face Spaces Docker worker bundle.")
	    parser.add_argument("--out", type=Path, default=DEFAULT_OUTPUT, help="Destination folder for the Space bundle.")
	    parser.add_argument("--force", action="store_true", help="Replace the destination folder if it already exists.")
	    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
	    options = parser.parse_args()

	    report = export_hf_space(options.out, force=options.force)
	    # Validate the final location again, after the staged bundle was moved.
	    problems = validate_export(options.out)
	    report["ready"] = not problems
	    report["issues"] = problems

	    if options.json:
	        print(json.dumps(report, indent=2))
	    else:
	        print(f"Exported Hugging Face Space bundle to {report['outputDir']}")
	        if not problems:
	            print("Bundle is ready to push to a Docker Space.")
	        else:
	            print("Issues:")
	            for problem in problems:
	                print(f"- {problem}")

	    if problems:
	        raise SystemExit(1)


	# Run the exporter only when this file is executed as a script.
	if __name__ == "__main__":
	    main()