# arabic-audio-reader-worker / scripts /deployment_handoff.py
# Syncre's picture
# Deploy Arabic Audio Reader worker
# 6d5a99d verified
from __future__ import annotations
import argparse
import json
import secrets
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
# Parent of the directory that contains this file.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Make ROOT_DIR importable before the `scripts.` import below; presumably this
# is what lets the file run directly as a script - confirm against how it is
# launched.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
from scripts.next_deployment_step import deployment_url_warnings
# Default destination for the generated Markdown handoff (see --out in main()).
DEFAULT_OUT = ROOT_DIR / "outputs" / "deployment-handoff.md"
def normalize_origin(value: str) -> str:
    """Return *value* cleaned up for use as an origin / base URL.

    Leading and trailing whitespace is dropped first, then any trailing
    slashes, so that ``"https://app.example/ "`` and ``"https://app.example"``
    produce the same string. The result is embedded verbatim in environment
    values and command lines, where stray whitespace or a trailing slash would
    make an exact-match comparison of origins fail.

    Args:
        value: URL or origin text as typed or pasted by the operator.

    Returns:
        The text without surrounding whitespace and without trailing ``/``.
    """
    # Whitespace must go first; otherwise "https://x/ " would keep its slash.
    return value.strip().rstrip("/")
def generate_secret_key() -> str:
    """Create a new random secret as URL-safe text.

    Returns:
        A string produced by ``secrets.token_urlsafe`` from 48 random bytes.
    """
    entropy_bytes = 48
    return secrets.token_urlsafe(entropy_bytes)
@dataclass
class DeploymentHandoff:
    """Everything the Markdown handoff and the JSON output are rendered from.

    Instances are produced by ``build_handoff``; ``main`` serialises them with
    ``dataclasses.asdict``.
    """

    # Worker URL after normalize_origin().
    worker_url: str
    # Vercel production origin after normalize_origin().
    vercel_origin: str
    # Access code exactly as passed to build_handoff().
    access_code: str
    # True when deployment_url_warnings() returned nothing.
    urls_look_real: bool
    # Value returned by deployment_url_warnings() for the two URLs.
    warnings: list[str]
    # INSTALL_* build-arg names mapped to "1" for the balanced worker.
    recommended_build_args: dict[str, str]
    # The recommended args plus INSTALL_QARI_OCR and INSTALL_PADDLEOCR_VL.
    maximum_quality_build_args: dict[str, str]
    # Name -> value pairs rendered under "Hugging Face Space Secrets".
    worker_secrets: dict[str, str]
    # Name -> value pairs rendered under "Vercel Environment Variables".
    vercel_env: dict[str, str]
    # Command lines keyed by step name (e.g. "verifyLive", "finalAudit").
    commands: dict[str, str]
    # Ordered command lines rendered under "Deploy Vercel Site With CLI".
    vercel_cli_commands: list[str]
    # `vercel env rm ...` lines for the direct cloud TTS fallback variables.
    vercel_cleanup_commands: list[str]
    # Ordered command lines rendered under "Deploy Worker With Hugging Face CLI".
    huggingface_cli_commands: list[str]
def build_handoff(
    worker_url: str,
    vercel_origin: str,
    access_code: str = "1234",
    secret_key: str | None = None,
) -> DeploymentHandoff:
    """Assemble every setting and command line that goes into the handoff.

    Args:
        worker_url: Worker URL; passed through ``normalize_origin``.
        vercel_origin: Vercel production origin; passed through
            ``normalize_origin``.
        access_code: Access code written into both environments and into the
            generated command lines.
        secret_key: Secret to use for ``SECRET_KEY``. When missing or empty a
            new one is obtained from ``generate_secret_key``.

    Returns:
        A populated ``DeploymentHandoff``.
    """

    def script(name: str, *args: str) -> str:
        # One command line of the form `python scripts\<name> <args...>`.
        return " ".join((f"python scripts\\{name}", *args))

    worker_url = normalize_origin(worker_url)
    vercel_origin = normalize_origin(vercel_origin)
    if not secret_key:
        secret_key = generate_secret_key()

    warnings = deployment_url_warnings(worker_url, vercel_origin)
    urls_look_real = not warnings

    # Docker build args: every listed installer flag is switched on with "1".
    recommended_build_args = dict.fromkeys(
        (
            "INSTALL_TAWKEED_OCR",
            "INSTALL_KATIB_OCR",
            "INSTALL_ARABIC_QWEN_OCR",
            "INSTALL_ARABIC_GLM_OCR",
            "INSTALL_BASEER_OCR",
            "INSTALL_SUPERTONIC",
        ),
        "1",
    )
    maximum_quality_build_args = dict(recommended_build_args)
    maximum_quality_build_args.update(
        INSTALL_QARI_OCR="1",
        INSTALL_PADDLEOCR_VL="1",
    )

    worker_secrets = dict(
        ACCESS_CODE=access_code,
        SECRET_KEY=secret_key,
        CORS_ORIGINS=vercel_origin,
        COOKIE_SAMESITE="none",
        COOKIE_SECURE="1",
        OCR_ENGINE="tesseract",
        OCR_RENDER_ZOOM="2",
        TESSERACT_PSM="4",
        DEFAULT_VOICE_ID="silma-local",
        OUTPUT_RETENTION_DAYS="7",
        OUTPUT_MAX_FILES="25",
        AUDIO_FORMAT="mp3",
        MP3_BITRATE="96k",
        SILMA_ENABLE_NORMALIZER="0",
        SILMA_FORCE_TASHKEEL="0",
        SILMA_NORMALIZE_NUMBERS="0",
    )
    vercel_env = dict(
        ACCESS_CODE=access_code,
        SECRET_KEY=secret_key,
        WORKER_BASE_URL=worker_url,
    )

    # Argument fragments shared by several of the command lines below.
    code_flag = f"--code {access_code}"
    worker_flag = f"--worker-url {worker_url}"
    origin_flag = f"--origin {vercel_origin}"
    hf_report = "outputs\\hf-model-metadata.md"

    commands = {
        "localReadiness": script("prove_local_readiness.py", "--refresh-research"),
        "refreshResearch": script("refresh_research_evidence.py"),
        "licensePolicy": script(
            "research_watchlist.py",
            "--check-license-policy",
            "--json",
        ),
        "researchSources": script(
            "check_research_sources.py",
            "--check-hf-metadata",
            f"--write-hf-metadata-report {hf_report}",
        ),
        "exportWorker": script("export_hf_space.py", "--force"),
        "prepareDeployment": script(
            "prepare_live_deployment.py",
            worker_flag,
            origin_flag,
            code_flag,
        ),
        "validateEnv": script(
            "validate_deployment_env.py",
            "--vercel-env outputs\\vercel-production.env",
            "--worker-env outputs\\worker-secrets.env",
            worker_flag,
            origin_flag,
        ),
        "deploymentStatus": script(
            "deployment_status.py",
            worker_flag,
            origin_flag,
            code_flag,
        ),
        "diagnoseVercelWorker": script(
            "hosted_preflight.py",
            vercel_origin,
            code_flag,
            worker_flag,
        ),
        "configureVercelWorker": script(
            "configure_vercel_worker.py",
            worker_url,
            f"--site-url {vercel_origin}",
            code_flag,
            "--verify",
        ),
        "verifyLive": script(
            "prove_live_deployment.py",
            worker_url,
            origin_flag,
            code_flag,
            "--smoke-ocr-engine arabic",
            "--check-hf-metadata",
            f"--hf-metadata-report {hf_report}",
            "--proof-out outputs\\live-deployment-proof.json",
        ),
        "finalAudit": script(
            "audit_goal_readiness.py",
            "--worker-report outputs\\worker-verification.json",
            "--site-report outputs\\site-verification.json",
        ),
    }

    vercel_cli_commands = [
        "npm i -g vercel",
        "vercel login",
        "vercel link --yes",
        # One `vercel env add` per Vercel variable, in vercel_env order.
        *(
            f'cmd /c "echo {value}| vercel env add {name} production"'
            for name, value in vercel_env.items()
        ),
        commands["configureVercelWorker"],
        "vercel --prod --yes",
    ]
    vercel_cleanup_commands = [
        f"vercel env rm {name} production --yes"
        for name in (
            "ENABLE_DIRECT_CLOUD_TTS",
            "HF_API_TOKEN",
            "HF_TTS_MODEL",
            "DEFAULT_VOICE_ID",
        )
    ]
    huggingface_cli_commands = [
        "python -m pip install -U huggingface_hub",
        commands["exportWorker"],
        script(
            "deploy_hf_space.py",
            "<your-hf-username>/<your-space-name>",
            "--bundle-dir outputs/huggingface-space",
            "--json",
        ),
    ]

    return DeploymentHandoff(
        worker_url=worker_url,
        vercel_origin=vercel_origin,
        access_code=access_code,
        urls_look_real=urls_look_real,
        warnings=warnings,
        recommended_build_args=recommended_build_args,
        maximum_quality_build_args=maximum_quality_build_args,
        worker_secrets=worker_secrets,
        vercel_env=vercel_env,
        commands=commands,
        vercel_cli_commands=vercel_cli_commands,
        vercel_cleanup_commands=vercel_cleanup_commands,
        huggingface_cli_commands=huggingface_cli_commands,
    )
def format_env_table(values: dict[str, str]) -> str:
    """Render *values* as a two-column Markdown table of code-formatted cells.

    Args:
        values: Mapping of variable name to value; rows follow its order.

    Returns:
        The header row, the separator row and one row per item, joined with
        newlines. An empty mapping yields only the header and separator.
    """

    def cell(text: str) -> str:
        # In a Markdown table a bare `|` ends the cell even inside a code
        # span, so it has to be written as `\|` to stay part of the value.
        return "`" + str(text).replace("|", "\\|") + "`"

    lines = ["| Name | Value |", "| --- | --- |"]
    lines.extend(f"| {cell(key)} | {cell(value)} |" for key, value in values.items())
    return "\n".join(lines)
def format_key_value_block(values: dict[str, str]) -> str:
    """Render *values* as ``NAME=value`` lines, one per item, in mapping order.

    Returns:
        The lines joined with newlines; an empty string for an empty mapping.
    """
    pairs = [f"{name}={setting}" for name, setting in values.items()]
    return "\n".join(pairs)
def write_markdown(path: Path, handoff: DeploymentHandoff) -> None:
    """Write the deployment handoff for *handoff* to *path* as Markdown.

    The parent directory of *path* is created when missing. The file is
    written as UTF-8 and replaces any existing content.

    Args:
        path: Destination file.
        handoff: Values and command lines to render.
    """

    def fenced(language: str, *body: str) -> list[str]:
        # Lines of one fenced code block tagged with *language*.
        return [f"```{language}", *body, "```"]

    def env_section(values: dict[str, str], caption: str) -> list[str]:
        # Table, caption line, then a copy/paste block for the same mapping.
        return [
            format_env_table(values),
            "",
            caption,
            "",
            *fenced("text", format_key_value_block(values)),
        ]

    path.parent.mkdir(parents=True, exist_ok=True)

    looks_real = "yes" if handoff.urls_look_real else "no"
    doc: list[str] = [
        "# Arabic Audio Reader Deployment Handoff",
        "",
        "Use this after the Hugging Face worker URL and Vercel production URL exist.",
        "",
        "## URLs",
        "",
        f"- Worker: {handoff.worker_url}",
        f"- Vercel site: {handoff.vercel_origin}",
        f"- URLs look real: {looks_real}",
        "",
    ]

    # The warnings section appears only when there is something to report.
    if handoff.warnings:
        doc.append("## URL Warnings")
        doc.append("")
        doc.extend(f"- {warning}" for warning in handoff.warnings)
        doc.append("")
        doc.append(
            "Replace placeholder/local/test URLs before running live proof. The final audit will not treat placeholder reports as completed deployment evidence."
        )
        doc.append("")

    doc += ["## Hugging Face Space Secrets", ""]
    doc += env_section(handoff.worker_secrets, "Copy/paste secret values:")
    doc.append("")

    doc += [
        "## Safety Checklist",
        "",
        "- Set the same `ACCESS_CODE` and `SECRET_KEY` on both Hugging Face and Vercel.",
        "- Set Hugging Face `CORS_ORIGINS` to the exact Vercel production origin shown above.",
        "- Set Vercel `WORKER_BASE_URL` to the exact Hugging Face worker URL shown above.",
        "- After both deployments finish, run the Vercel worker diagnostic command below before uploading a large PDF. It must show `site worker reachable from vercel` and `site worker CORS ready`.",
        "- Remove Vercel's temporary direct Hugging Face TTS fallback variables for production: `ENABLE_DIRECT_CLOUD_TTS`, `HF_API_TOKEN`, `HF_TTS_MODEL`, and `DEFAULT_VOICE_ID`.",
        "- Keep `OCR_ENGINE=tesseract`, `OCR_RENDER_ZOOM=2`, and `TESSERACT_PSM=4` for normal scanned Arabic books; use `arabic-max` only when a short sample needs the slower maximum comparison.",
        "- Do not commit this handoff; it contains the deployment `SECRET_KEY`.",
        "",
    ]

    doc += [
        "## Hugging Face Docker Build Args",
        "",
        "Balanced Arabic OCR worker for the first strong-worker deployment:",
        "",
    ]
    doc += env_section(
        handoff.recommended_build_args,
        "Copy/paste balanced build args:",
    )
    doc += [
        "",
        "Maximum quality worker for a GPU or larger paid/owned worker after a 5-page benchmark proves the heavier models help:",
        "",
    ]
    doc += env_section(
        handoff.maximum_quality_build_args,
        "Copy/paste maximum-quality build args:",
    )
    doc.append("")

    doc += ["## Vercel Environment Variables", ""]
    doc += env_section(handoff.vercel_env, "Copy/paste Vercel values:")
    doc.append("")

    doc += [
        "## Deploy Worker With Hugging Face CLI",
        "",
        "Create a Hugging Face Space with SDK **Docker**, then run these from the repo root. Replace `<your-hf-username>/<your-space-name>` with the Space repo id:",
        "",
    ]
    doc += fenced("powershell", *handoff.huggingface_cli_commands)
    doc += [
        "",
        "Set the Hugging Face Space secrets and Docker build args from the tables above in the Space settings before the final smoke test.",
        "The `SECRET_KEY` value in this handoff is generated for this deployment; keep it private and do not commit the handoff output.",
        "",
    ]

    doc += [
        "## Deploy Vercel Site With CLI",
        "",
        "Run these from the repo root after the worker URL is known:",
        "",
    ]
    doc += fenced("powershell", *handoff.vercel_cli_commands)
    doc += [
        "",
        "If you previously tested direct Hugging Face cloud TTS on Vercel, remove those temporary variables before or after setting `WORKER_BASE_URL`. It is normal if a remove command says the variable does not exist:",
        "",
    ]
    doc += fenced(
        "powershell",
        *handoff.vercel_cleanup_commands,
        "vercel --prod --yes",
    )
    doc.append("")

    # Steps listed under "Commands", in the order they are meant to be run.
    step_order = (
        "localReadiness",
        "refreshResearch",
        "licensePolicy",
        "researchSources",
        "exportWorker",
        "prepareDeployment",
        "validateEnv",
        "deploymentStatus",
        "diagnoseVercelWorker",
        "verifyLive",
        "finalAudit",
    )
    doc += ["## Commands", "", "Run these from the repo root:", ""]
    doc += fenced("powershell", *(handoff.commands[step] for step in step_order))
    doc += [
        "",
        "The hosted preflight command checks the hosted site's `/api/worker-diagnostics` endpoint and writes `outputs\\hosted-preflight.json`. If it says `cors-blocked`, set Hugging Face `CORS_ORIGINS` to the exact Vercel production URL, keep `COOKIE_SAMESITE=none` and `COOKIE_SECURE=1`, restart the Space, redeploy Vercel, and run the preflight again.",
        "",
        "The goal audit is complete only after `outputs\\site-verification.json` proves the Vercel shell login, worker routing, large-PDF readiness, hosted provider limits, recommended stack, disabled direct cloud fallback, worker diagnostics reachability, and worker CORS readiness; `outputs\\worker-verification.json` proves the worker recommended stack plus embedded-text and scanned-OCR smoke jobs, usable extracted text, scanned OCR extraction, audio URLs, download URLs, audio bytes, download bytes, and audio file signatures; and `outputs\\live-deployment-proof.json` records `complete: true`.",
        # Trailing empty entry so the file ends with a newline.
        "",
    ]

    path.write_text("\n".join(doc), encoding="utf-8")
def main() -> None:
    """Command-line entry point.

    Parses the arguments, builds the handoff, writes the Markdown file and
    prints either the full handoff as JSON (``--json``) or a short summary
    followed by the ``verifyLive`` command.
    """
    cli = argparse.ArgumentParser(
        description="Create a deployment handoff file with exact worker/Vercel settings and proof commands.",
    )
    cli.add_argument(
        "worker_url",
        help="Worker URL, for example https://your-space.hf.space",
    )
    cli.add_argument(
        "--origin",
        required=True,
        help="Vercel production origin, for example https://your-app.vercel.app",
    )
    cli.add_argument(
        "--code",
        default="1234",
        help="Access code for the site and worker.",
    )
    cli.add_argument(
        "--secret-key",
        help="Optional fixed cookie-signing secret. Omit to generate a random deployment secret.",
    )
    cli.add_argument(
        "--out",
        type=Path,
        default=DEFAULT_OUT,
        help="Markdown handoff destination.",
    )
    cli.add_argument(
        "--json",
        action="store_true",
        help="Print JSON instead of a short summary.",
    )
    options = cli.parse_args()

    handoff = build_handoff(
        options.worker_url,
        options.origin,
        options.code,
        options.secret_key,
    )
    write_markdown(options.out, handoff)

    payload = {"out": str(options.out), **asdict(handoff)}
    if options.json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    print(f"Wrote deployment handoff to {options.out}")
    print(handoff.commands["verifyLive"])
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()