| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import re |
| from dataclasses import asdict, dataclass |
| from datetime import date, datetime |
| from pathlib import Path |
| from urllib.error import HTTPError, URLError |
| from urllib.request import Request, urlopen |
|
|
|
|
# Two directory levels above this file's resolved location.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Default document set; every entry lives directly under ROOT_DIR / "docs".
# NOTE(review): the consumer of this list is outside the visible chunk.
DEFAULT_DOCS = [
    ROOT_DIR / "docs" / doc_name
    for doc_name in (
        "best-free-arabic-pdf-audio-stack.md",
        "source-evidence.md",
        "huggingface-model-metadata.md",
        "live-deployment-checklist.md",
        "production-worker-architecture.md",
        "research-watchlist.md",
        "recommended-free-stack.md",
        "recommended-decision-card.md",
        "recommended-decision-card.json",
    )
]
|
|
|
|
def load_env_file(path: Path) -> None:
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    The loader is best-effort: a missing file (or a path that is not a
    regular file) is silently ignored, as are blank lines, ``#`` comment
    lines, lines without ``=`` and lines whose key is empty. Variables that
    are already present in the environment are never overwritten.

    Args:
        path: Location of the env file to read.

    Returns:
        None. The only effect is on ``os.environ``.
    """
    # is_file() instead of exists(): a directory named ".env" must not raise.
    if not path.is_file():
        return
    # "utf-8-sig" drops a leading BOM (if any) so it cannot end up glued to
    # the first key; files without a BOM decode exactly as with "utf-8".
    content = path.read_text(encoding="utf-8-sig", errors="replace")
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        # Accept shell-style "export KEY=value" lines.
        if key.startswith("export "):
            key = key[len("export "):].strip()
        # A line such as "=value" has no usable name; an empty variable name
        # is rejected by os.environ, which would abort the whole load.
        if not key:
            continue
        # Surrounding double and/or single quotes are removed from the value.
        os.environ.setdefault(key, value.strip().strip('"').strip("'"))
|
|
|
|
# Runs at import time: seed os.environ from the repository's .env file.
# load_env_file uses setdefault, so variables that are already set win.
load_env_file(ROOT_DIR / ".env")
# Matches http:// and https:// URLs, stopping at whitespace or at any of the
# characters ) ] > ` (common closing delimiters in Markdown text).
URL_RE = re.compile(r"https?://[^\s)\]>`]+")
# Substrings that check_required_markers looks for in the source-evidence
# document; each marker yields one "present"/"missing" SourceCheck.
# Matching is a plain case-sensitive `in` test on the document text.
REQUIRED_SOURCE_MARKERS = [
    "QARI-OCR",
    "QARI-OCR 0.4",
    "Qari-OCR-0.4.0-VL-4B-Instruct",
    "no hosted inference provider",
    "worker runtime",
    "QARI-OCR 0.4 GGUF",
    "marwan-osama/Qari-OCR-0.4.0-VL-4B-Instruct-GGUF",
    "KATIB 0.8B",
    "Katib-Qwen3.5-0.8B-0.1",
    "Ketaba-OCR LoRA",
    "HassanB4/Ketaba-OCR-LoRA",
    "Qari-OCR-LoRA",
    "HassanB4/Qari-OCR-LoRA",
    "Tawkeed OCR",
    "tawkeed-sa/tawkeed-ocr",
    "PaddleOCR-VL",
    "PaddlePaddle/PaddleOCR-VL-1.6",
    "oi-OCR",
    "oi-uae/oi-OCR",
    "SILMA TTS",
    "silma-ai/silma-tts",
    "Apache-2.0 model weights",
    "SILMA open source Arabic TTS models",
    "SILMA Arabic TTS benchmark",
    "SILMA Hugging Face launch article",
    "Habibi-TTS",
    "Habibi-TTS paper",
    "2601.13802",
    "specialized MSA model is Apache-2.0",
    "Mishkala Tashkeel",
    "flokymind/mishkala",
    "Tashkeel-350M",
    "Etherll/Tashkeel-350M",
    "Mushkil",
    "riotu-lab/mushkil",
    "Thaka KSAA-2026 speech diacritization",
    "2605.25928",
    "KSAA-2026",
    "research signal only",
    "3arab-TTS 500M",
    "sherif1313/3arab-TTS-500M-v1",
    "3arab-TTS-500M-v1-VoiceDesign",
    "KaniTTS Arabic",
    "nineninesix/kani-tts-400m-ar",
    "Emirati VITS Male",
    "vadimbelsky/emirati-vits-male-1.0",
    "VoxCPM2",
    "openbmb/VoxCPM2",
    "Voxtral TTS",
    "mistralai/Voxtral-4B-TTS-2603",
    "cc-by-nc-4.0",
    "MOSS-TTS-Nano",
    "OpenMOSS/MOSS-TTS-Nano",
    "Supertonic 3",
    "Supertone/supertonic-3",
    "OpenRAIL model",
    "Kyutai Pocket TTS",
    "kyutai.org/tts",
    "not Arabic",
    "Falcon-OCR",
    "tiiuae/Falcon-OCR",
    "Baseer OCR",
    "AbdoTarek/Baseer-OCR-V1.0",
    "Arabic-GLM-OCR-v2",
    "sherif1313/Arabic-GLM-OCR-v2",
    "Arabic-Qwen3.5-OCR-v4",
    "sherif1313/Arabic-Qwen3.5-OCR-v4",
    "aNS Qwen3-VL Arabic OCR v3",
    "aNS2024/qwen3-vl-arabic-ocr-v3",
    "Waraqon v3 Arabic OCR HTML Qari",
    "FatimahEmadEldin/Waraqon-v3-Arabic-OCR-HTML-Qari",
    "DeepSeek-OCR-2",
    "deepseek-ai/DeepSeek-OCR-2",
    "DeepSeek Arabic OCR v6",
    "melsiddieg/deepseek_ocr_arabic_v6",
    "Loay Arabic-OCR-DeepSeek-OCR-2",
    "loay/Arabic-OCR-DeepSeek-OCR-2",
    "Arabic-English handwritten OCR Qwen3-VL",
    "sherif1313/Arabic-English-handwritten-OCR-Qwen3-VL-4B",
    "Arabic-English handwritten OCR v3",
    "sherif1313/Arabic-English-handwritten-OCR-v3",
    "Arabic handwritten OCR 4-bit Qwen2.5-VL",
    "sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v3",
    "NAKBA Arabic manuscript line OCR baseline",
    "U4RASD/ar-ms-baseline",
    "HAFITH",
    "mdnaseif/hafith",
    "Glimpse RTL OCR",
    "surfiniaburger/unsloth_finetune_ocr_arabic",
    "Arabic OCR Qwen2.5-VL GGUF",
    "mo1998/arabic-ocr-qwen2.5-vl",
    "Qwen3-VL Persian/Arabic line OCR",
    "mohajesmaeili/Qwen3-VL-2B-Persian-Arabic-Ocr-v1.0",
    "DIMI Arabic OCR v2",
    "AhmedZaky1/DIMI-Arabic-OCR-V2",
    "Raqim post-OCR correction",
    "Arabic Legal Documents OCR 1.0",
    "bakrianoo/arabic-legal-documents-ocr-1.0",
    "Loay Arabic-OCR-Qwen2.5-VL-7B",
    "loay/Arabic-OCR-Qwen2.5-VL-7B-Vision",
    "AtlasOCR",
    "atlasia/AtlasOCR",
    "NuExtract3",
    "numind/NuExtract3",
    "Qianfan-OCR",
    "baidu/Qianfan-OCR",
    "Chandra OCR 2",
    "datalab-to/chandra",
    "dots.ocr",
    "rednote-hilab/dots.ocr",
    "olmOCR Arabic LoRA v2",
    "hastyle/olmOCR-arabic-lora-v2",
    "Arabic Large Nougat",
    "MohamedRashad/arabic-large-nougat",
    "DocTR Arabic FAST/PARSEQ",
    "madskills/doctr-fast_base-arabic",
    "madskills/doctr-parseq-arabic",
    "Kraken/eScriptorium Arabic script",
    "kraken.re/main/index.html",
    "escriptorium.eu/about",
    "Kairawan/Qalamus manuscript OCR",
    "kairawan.org",
    "GLM-OCR Arabic/French documents",
    "maloukafer/GLM-OCR-finetuned-documents",
    "mimoha Arabic OCR",
    "mimoha/ocr",
    "OmniVoice",
    "k2-fsa/OmniVoice",
    "OmniVoice Arabic LoRA",
    "vivooglobal/omnivoice-lora-ar",
    "Arabic-text-to-speech OmniVoice",
    "bilalRHCH/Arabic-text-to-speech",
    "Lahgtna OmniVoice v2",
    "oddadmix/lahgtna-omnivoice-v2",
    "TADA multilingual TTS",
    "HumeAI/tada-3b-ml",
    "Lahgtna Chatterbox",
    "oddadmix/lahgtna-chatterbox-v1",
    "NAMAA-Saudi-TTS",
    "NAMAA-Space/NAMAA-Saudi-TTS",
    "NAMAA-Egyptian-TTS",
    "NAMAA-Space/NAMAA-Egyptian-TTS",
    "Saudi Chatterbox fine-tune",
    "FatimahEmadEldin/saudi-tts-chatterbox-finetuned",
    "Saudi TTS",
    "AhmedEladl/saudi-tts",
    "Egyptian Arabic Chatterbox",
    "AliAbdallah/egyptian-arabic-tts-chatterbox",
    "NileTTS-XTTS",
    "KickItLikeShika/NileTTS-XTTS",
    "Arabic XTTS-v2 Egyptian fine-tune",
    "Moeeldouma/arabic-tts-xtts-v2",
    "Coqui Public Model License",
    "Chatterbox-Multilingual",
    "resemble-ai/chatterbox",
    "Chatterbox Arabic fine-tune",
    "juliardi/chatterbox-multilingual-finetuned-arabic",
    "Chatterbox-Multilingual ONNX",
    "onnx-community/chatterbox-multilingual-ONNX",
    "tts-arabic-onnx",
    "nipponjo/tts-arabic-onnx",
    "Spark-TTS Arabic",
    "azeddinShr/Spark-TTS-Arabic-Complete",
    "Sofelia-TTS",
    "hamdallah/Sofelia-TTS",
    "Arabic-F5-TTS-v2",
    "IbrahimSalah/Arabic-F5-TTS-v2",
    "Qwen3-TTS",
    "Qwen3-TTS-12Hz-0.6B-Base",
    "Qwen3-TTS-12Hz-1.7B-Base",
    "Egyptian Arabic Qwen3-TTS",
    "itshamdi404/Egy_Arabic_Qwen3-TTS-12Hz-1.7B-Base",
    "Saudi Arabic Qwen3-TTS",
    "vadimbelsky/qwen3-TTS-KSA",
    "Emirati Qwen3.5-TTS",
    "vadimbelsky/qwen3.5-TTS-Emirati",
    "MMS Arabic TTS",
    "Vercel FastAPI",
    "Vercel Blob usage and pricing",
    "Vercel Functions limits",
    "4.5 MB request/response body limit",
    "Hugging Face Docker Spaces",
    "Hugging Face Hub storage limits",
    "2 vCPU",
    "16 GB RAM",
    "50 GB non-persistent disk",
]
# Default upper bound (in days) on the age of the "Last refreshed" date
# accepted by metadata_freshness_check.
MAX_METADATA_AGE_DAYS = 30
# Substrings that check_metadata_snapshot requires in the metadata snapshot
# document: repo ids, license identifiers and one explanatory sentence.
REQUIRED_METADATA_MARKERS = [
    "NAMAA-Space/Qari-OCR-0.4.0-VL-4B-Instruct",
    "silma-ai/silma-tts",
    "sherif1313/Arabic-Qwen3.5-OCR-v4",
    "deepseek-ai/DeepSeek-OCR-2",
    "melsiddieg/deepseek_ocr_arabic_v6",
    "sherif1313/Arabic-GLM-OCR-v2",
    "sherif1313/Arabic-English-handwritten-OCR-Qwen3-VL-4B",
    "sherif1313/Arabic-English-handwritten-OCR-v3",
    "mohajesmaeili/Qwen3-VL-2B-Persian-Arabic-Ocr-v1.0",
    "bakrianoo/arabic-legal-documents-ocr-1.0",
    "oi-uae/oi-OCR",
    "NAMAA-Space/NAMAA-Saudi-TTS",
    "AhmedEladl/saudi-tts",
    "AliAbdallah/egyptian-arabic-tts-chatterbox",
    "KickItLikeShika/NileTTS-XTTS",
    "Moeeldouma/arabic-tts-xtts-v2",
    "onnx-community/chatterbox-multilingual-ONNX",
    "itshamdi404/Egy_Arabic_Qwen3-TTS-12Hz-1.7B-Base",
    "vadimbelsky/qwen3-TTS-KSA",
    "vadimbelsky/qwen3.5-TTS-Emirati",
    "sherif1313/3arab-TTS-500M-v1-VoiceDesign",
    "numind/NuExtract3",
    "baidu/Qianfan-OCR",
    "datalab-to/chandra",
    "rednote-hilab/dots.ocr",
    "MohamedRashad/arabic-large-nougat",
    "apache-2.0",
    "gpl-3.0",
    "cc-by-nc-4.0",
    "fair-noncommercial-research-license",
    "coqui-public-model-license",
    "openrail",
    "Rows marked `page-only` use verified public model-page evidence",
]
# Marker strings for the recommendation report.
# NOTE(review): presumably consumed by check_recommendation_report, whose
# body lies outside the visible part of the file - confirm.
REQUIRED_RECOMMENDATION_MARKERS = [
    "Recommended Free Arabic PDF To Audio Stack",
    "PyMuPDF text extraction first",
    "`OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4`",
    "SILMA TTS",
    "Vercel shell plus Docker worker",
    "Benchmark Before Promoting",
    "model_promotion_gate.py",
    "PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS",
]
# Marker strings for the decision card.
# NOTE(review): the code that consumes this list is not visible here;
# presumably it checks the recommended-decision-card documents - confirm.
REQUIRED_DECISION_CARD_MARKERS = [
    "Recommended Free Arabic PDF To Audio Decision Card",
    "PyMuPDF embedded text first",
    "OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
    "SILMA TTS",
    "worker-local retained downloads",
    "Vercel shell plus Docker worker",
    "model_promotion_gate.py",
    "scoreJsonRequired",
]
# Script names and command-line fragments expected for the watchlist.
# The path fragments use Windows-style backslash separators.
# NOTE(review): the consumer of this list is not visible here - confirm
# which document it is matched against.
REQUIRED_WATCHLIST_COMMAND_MARKERS = [
    "model_promotion_gate.py",
    "score_external_ocr.py",
    "score_voice_listening.py",
    "score_tts_preprocessor.py",
    "--write-json outputs\\external-ocr-sample\\external-ocr-score.json",
    "--write-json outputs\\external-tts-sample\\voice-listening-score.json",
    "--write-json outputs\\external-tts-sample\\tts-preprocessor-score.json",
    "--candidate oi-ocr=outputs\\external-ocr-sample\\oi-ocr.txt",
    "--kind ocr",
    "--kind tts",
    "--kind preprocessor",
]
# Workflow documents: the top-level README and the stack overview document.
WORKFLOW_DOC_PATHS = [
    ROOT_DIR / "README.md",
    ROOT_DIR / "docs" / "best-free-arabic-pdf-audio-stack.md",
]
# Marker strings for the workflow documents.
# NOTE(review): presumably matched against WORKFLOW_DOC_PATHS; the consuming
# code is outside the visible chunk - confirm.
REQUIRED_WORKFLOW_MARKERS = [
    "QARI-OCR 0.4 GGUF",
    "Loay Arabic-OCR-DeepSeek-OCR-2",
    "NAMAA-Egyptian-TTS",
    "Chatterbox Arabic fine-tune",
    "not served as a simple hosted Hugging Face inference route",
    "not deployed by a hosted inference provider",
    "--candidate qari-gguf=outputs\\external-ocr-sample\\qari-gguf.txt",
    "--candidate loay-deepseek-ocr-2=outputs\\external-ocr-sample\\loay-deepseek-ocr-2.txt",
    "outputs\\external-tts-sample\\arabic-tts-sample.txt",
    "model_promotion_gate.py",
]
# Maps a human-readable source label to that source's URL.
# NOTE(review): how these URLs are checked (e.g. fetched or only listed) is
# decided by code outside the visible chunk.
KEY_SOURCE_URLS = {
    "QARI-OCR 0.4 model": "https://huggingface.co/NAMAA-Space/Qari-OCR-0.4.0-VL-4B-Instruct",
    "QARI-OCR 0.4 GGUF model": "https://huggingface.co/marwan-osama/Qari-OCR-0.4.0-VL-4B-Instruct-GGUF",
    "KATIB Arabic OCR model": "https://huggingface.co/oddadmix/Katib-Qwen3.5-0.8B-0.1",
    "Ketaba-OCR LoRA model": "https://huggingface.co/HassanB4/Ketaba-OCR-LoRA",
    "Qari-OCR-LoRA model": "https://huggingface.co/HassanB4/Qari-OCR-LoRA",
    "Tawkeed Arabic OCR model": "https://huggingface.co/tawkeed-sa/tawkeed-ocr",
    "PaddleOCR-VL 1.6 model": "https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.6",
    "oi-OCR model": "https://huggingface.co/oi-uae/oi-OCR",
    "Qianfan-OCR model": "https://huggingface.co/baidu/Qianfan-OCR",
    "PaddleOCR latest docs": "https://www.paddleocr.ai/latest/en/index.html",
    "SILMA TTS model": "https://huggingface.co/silma-ai/silma-tts",
    "SILMA open source Arabic TTS models": "https://silma.ai/open-source-arabic-tts-models",
    "SILMA Arabic TTS benchmark": "https://silma.ai/arabic-tts-benchmark",
    "SILMA Hugging Face launch article": "https://huggingface.co/blog/silma-ai/opensource-arabic-english-text-to-speech-model",
    "Habibi-TTS repository": "https://github.com/SWivid/Habibi-TTS",
    "Habibi-TTS paper": "https://arxiv.org/abs/2601.13802",
    "Mishkala Tashkeel model": "https://huggingface.co/flokymind/mishkala",
    "Tashkeel 350M model": "https://huggingface.co/Etherll/Tashkeel-350M",
    "Mushkil model": "https://huggingface.co/riotu-lab/mushkil",
    "Thaka KSAA 2026 speech diacritization paper": "https://arxiv.org/abs/2605.25928",
    "KSAA 2026 shared task": "https://www.codabench.org/competitions/11859/",
    "3arab-TTS 500M model": "https://huggingface.co/sherif1313/3arab-TTS-500M-v1",
    "3arab-TTS 500M VoiceDesign model": "https://huggingface.co/sherif1313/3arab-TTS-500M-v1-VoiceDesign",
    "KaniTTS Arabic model": "https://huggingface.co/nineninesix/kani-tts-400m-ar",
    "Emirati VITS Male model": "https://huggingface.co/vadimbelsky/emirati-vits-male-1.0",
    "VoxCPM2 model": "https://huggingface.co/openbmb/VoxCPM2",
    "VoxCPM paper": "https://arxiv.org/abs/2509.24650",
    "Voxtral TTS model": "https://huggingface.co/mistralai/Voxtral-4B-TTS-2603",
    "Voxtral TTS paper": "https://arxiv.org/abs/2603.25551",
    "MOSS-TTS-Nano repository": "https://github.com/OpenMOSS/MOSS-TTS-Nano",
    "Supertonic 3 model": "https://huggingface.co/Supertone/supertonic-3",
    "Kyutai TTS official page": "https://kyutai.org/tts",
    "Falcon OCR model": "https://huggingface.co/tiiuae/Falcon-OCR",
    "Falcon Perception paper": "https://arxiv.org/abs/2603.27365",
    "Baseer OCR model": "https://huggingface.co/AbdoTarek/Baseer-OCR-V1.0",
    "Arabic GLM OCR v2 model": "https://huggingface.co/sherif1313/Arabic-GLM-OCR-v2",
    "Arabic Qwen3.5 OCR v4 model": "https://huggingface.co/sherif1313/Arabic-Qwen3.5-OCR-v4",
    "aNS Qwen3 VL Arabic OCR v3 model": "https://huggingface.co/aNS2024/qwen3-vl-arabic-ocr-v3",
    "Waraqon v3 Arabic OCR HTML Qari model": "https://huggingface.co/FatimahEmadEldin/Waraqon-v3-Arabic-OCR-HTML-Qari",
    "DeepSeek OCR 2 model": "https://huggingface.co/deepseek-ai/DeepSeek-OCR-2",
    "DeepSeek Arabic OCR v6 model": "https://huggingface.co/melsiddieg/deepseek_ocr_arabic_v6",
    "Loay Arabic DeepSeek OCR 2 model": "https://huggingface.co/loay/Arabic-OCR-DeepSeek-OCR-2",
    "Arabic-English handwritten OCR Qwen3-VL model": "https://huggingface.co/sherif1313/Arabic-English-handwritten-OCR-Qwen3-VL-4B",
    "Arabic-English handwritten OCR v3 model": "https://huggingface.co/sherif1313/Arabic-English-handwritten-OCR-v3",
    "Arabic handwritten OCR 4-bit Qwen2.5 VL model": "https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v3",
    "NAKBA Arabic manuscript line OCR baseline": "https://huggingface.co/U4RASD/ar-ms-baseline",
    "HAFITH model": "https://huggingface.co/mdnaseif/hafith",
    "Glimpse RTL OCR model": "https://huggingface.co/surfiniaburger/unsloth_finetune_ocr_arabic",
    "Arabic OCR Qwen2.5 VL GGUF model": "https://huggingface.co/mo1998/arabic-ocr-qwen2.5-vl",
    "Qwen3-VL Persian Arabic line OCR model": "https://huggingface.co/mohajesmaeili/Qwen3-VL-2B-Persian-Arabic-Ocr-v1.0",
    "DIMI Arabic OCR v2 model": "https://huggingface.co/AhmedZaky1/DIMI-Arabic-OCR-V2",
    "Loay Arabic OCR Qwen2.5 VL 7B model": "https://huggingface.co/loay/Arabic-OCR-Qwen2.5-VL-7B-Vision",
    "Arabic Legal Documents OCR 1.0 model": "https://huggingface.co/bakrianoo/arabic-legal-documents-ocr-1.0",
    "AtlasOCR model": "https://huggingface.co/atlasia/AtlasOCR",
    "NuExtract3 model": "https://huggingface.co/numind/NuExtract3",
    "Chandra OCR repository": "https://github.com/datalab-to/chandra",
    "dots.ocr model": "https://huggingface.co/rednote-hilab/dots.ocr",
    "olmOCR Arabic LoRA v2 model": "https://huggingface.co/hastyle/olmOCR-arabic-lora-v2",
    "Arabic Large Nougat model": "https://huggingface.co/MohamedRashad/arabic-large-nougat",
    "DocTR Arabic FAST detector": "https://huggingface.co/madskills/doctr-fast_base-arabic",
    "DocTR Arabic PARSEQ recognizer": "https://huggingface.co/madskills/doctr-parseq-arabic",
    "Kraken OCR documentation": "https://kraken.re/main/index.html",
    "eScriptorium overview": "https://escriptorium.eu/about",
    "Kairawan manuscript OCR": "https://kairawan.org/",
    "GLM-OCR Arabic French documents model": "https://huggingface.co/maloukafer/GLM-OCR-finetuned-documents",
    "mimoha Arabic OCR model": "https://huggingface.co/mimoha/ocr",
    "OmniVoice model": "https://huggingface.co/k2-fsa/OmniVoice",
    "OmniVoice Arabic LoRA": "https://huggingface.co/vivooglobal/omnivoice-lora-ar",
    "Arabic text to speech OmniVoice model": "https://huggingface.co/bilalRHCH/Arabic-text-to-speech",
    "Lahgtna OmniVoice v2 model": "https://huggingface.co/oddadmix/lahgtna-omnivoice-v2",
    "TADA multilingual TTS model": "https://huggingface.co/HumeAI/tada-3b-ml",
    "Lahgtna Chatterbox model": "https://huggingface.co/oddadmix/lahgtna-chatterbox-v1",
    "NAMAA Saudi TTS model": "https://huggingface.co/NAMAA-Space/NAMAA-Saudi-TTS",
    "NAMAA Egyptian TTS model": "https://huggingface.co/NAMAA-Space/NAMAA-Egyptian-TTS",
    "Saudi Chatterbox fine-tune model": "https://huggingface.co/FatimahEmadEldin/saudi-tts-chatterbox-finetuned",
    "Saudi TTS model": "https://huggingface.co/AhmedEladl/saudi-tts",
    "Egyptian Arabic Chatterbox model": "https://huggingface.co/AliAbdallah/egyptian-arabic-tts-chatterbox",
    "NileTTS XTTS model": "https://huggingface.co/KickItLikeShika/NileTTS-XTTS",
    "Arabic XTTS v2 Egyptian fine-tune model": "https://huggingface.co/Moeeldouma/arabic-tts-xtts-v2",
    "NileTTS paper": "https://arxiv.org/abs/2602.15675",
    "Chatterbox repository": "https://github.com/resemble-ai/chatterbox",
    "Chatterbox Arabic fine-tune model": "https://huggingface.co/juliardi/chatterbox-multilingual-finetuned-arabic",
    "Chatterbox Multilingual ONNX model": "https://huggingface.co/onnx-community/chatterbox-multilingual-ONNX",
    "tts-arabic-onnx model": "https://huggingface.co/nipponjo/tts-arabic-onnx",
    "tts_arabic repository": "https://github.com/nipponjo/tts_arabic",
    "Spark-TTS Arabic model": "https://huggingface.co/azeddinShr/Spark-TTS-Arabic-Complete",
    "Sofelia-TTS model": "https://huggingface.co/hamdallah/Sofelia-TTS",
    "Arabic F5 TTS v2 model": "https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2",
    "Qwen3-TTS 0.6B Base": "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-Base",
    "Qwen3-TTS 1.7B Base": "https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "Egyptian Arabic Qwen3-TTS model": "https://huggingface.co/itshamdi404/Egy_Arabic_Qwen3-TTS-12Hz-1.7B-Base",
    "Saudi Arabic Qwen3-TTS model": "https://huggingface.co/vadimbelsky/qwen3-TTS-KSA",
    "Emirati Qwen3.5-TTS model": "https://huggingface.co/vadimbelsky/qwen3.5-TTS-Emirati",
    "Qwen3-TTS technical report": "https://arxiv.org/abs/2601.15621",
    "Vercel FastAPI deployment": "https://vercel.com/docs/frameworks/backend/fastapi",
    "Vercel Functions limits": "https://vercel.com/docs/functions/limitations/",
    "Vercel Blob usage and pricing": "https://vercel.com/docs/vercel-blob/usage-and-pricing",
    "Hugging Face Docker Spaces": "https://huggingface.co/docs/hub/main/en/spaces-sdks-docker",
    "Hugging Face Hub storage limits": "https://huggingface.co/docs/hub/main/storage-limits",
}
# Maps a Hugging Face repository id ("owner/name") to the license identifier
# expected for it.
# NOTE(review): presumably compared against fetched model metadata by code
# outside the visible chunk - confirm.
HF_EXPECTED_LICENSES = {
    "NAMAA-Space/Qari-OCR-0.4.0-VL-4B-Instruct": "apache-2.0",
    "marwan-osama/Qari-OCR-0.4.0-VL-4B-Instruct-GGUF": "apache-2.0",
    "oddadmix/Katib-Qwen3.5-0.8B-0.1": "apache-2.0",
    "HassanB4/Ketaba-OCR-LoRA": "apache-2.0",
    "HassanB4/Qari-OCR-LoRA": "apache-2.0",
    "tawkeed-sa/tawkeed-ocr": "apache-2.0",
    "PaddlePaddle/PaddleOCR-VL-1.6": "apache-2.0",
    "oi-uae/oi-OCR": "apache-2.0",
    "madskills/doctr-fast_base-arabic": "apache-2.0",
    "mimoha/ocr": "apache-2.0",
    "silma-ai/silma-tts": "apache-2.0",
    "flokymind/mishkala": "apache-2.0",
    "Etherll/Tashkeel-350M": "apache-2.0",
    "riotu-lab/mushkil": "apache-2.0",
    "sherif1313/3arab-TTS-500M-v1": "apache-2.0",
    "sherif1313/3arab-TTS-500M-v1-VoiceDesign": "apache-2.0",
    "vadimbelsky/emirati-vits-male-1.0": "apache-2.0",
    "openbmb/VoxCPM2": "apache-2.0",
    "mistralai/Voxtral-4B-TTS-2603": "cc-by-nc-4.0",
    "Supertone/supertonic-3": "openrail",
    "baidu/Qianfan-OCR": "apache-2.0",
    "tiiuae/Falcon-OCR": "apache-2.0",
    "AbdoTarek/Baseer-OCR-V1.0": "apache-2.0",
    "sherif1313/Arabic-GLM-OCR-v2": "apache-2.0",
    "sherif1313/Arabic-Qwen3.5-OCR-v4": "apache-2.0",
    "FatimahEmadEldin/Waraqon-v3-Arabic-OCR-HTML-Qari": "apache-2.0",
    "deepseek-ai/DeepSeek-OCR-2": "apache-2.0",
    "melsiddieg/deepseek_ocr_arabic_v6": "apache-2.0",
    "loay/Arabic-OCR-DeepSeek-OCR-2": "apache-2.0",
    "sherif1313/Arabic-English-handwritten-OCR-Qwen3-VL-4B": "apache-2.0",
    "sherif1313/Arabic-English-handwritten-OCR-v3": "apache-2.0",
    "sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v3": "apache-2.0",
    "mdnaseif/hafith": "apache-2.0",
    "surfiniaburger/unsloth_finetune_ocr_arabic": "apache-2.0",
    "mohajesmaeili/Qwen3-VL-2B-Persian-Arabic-Ocr-v1.0": "apache-2.0",
    "AhmedZaky1/DIMI-Arabic-OCR-V2": "apache-2.0",
    "hastyle/olmOCR-arabic-lora-v2": "apache-2.0",
    "MohamedRashad/arabic-large-nougat": "gpl-3.0",
    "bakrianoo/arabic-legal-documents-ocr-1.0": "gemma",
    "k2-fsa/OmniVoice": "apache-2.0",
    "bilalRHCH/Arabic-text-to-speech": "apache-2.0",
    "vivooglobal/omnivoice-lora-ar": "apache-2.0",
    "HumeAI/tada-3b-ml": "llama3.2",
    "oddadmix/lahgtna-chatterbox-v1": "mit",
    "NAMAA-Space/NAMAA-Saudi-TTS": "mit",
    "NAMAA-Space/NAMAA-Egyptian-TTS": "mit",
    "FatimahEmadEldin/saudi-tts-chatterbox-finetuned": "apache-2.0",
    "AhmedEladl/saudi-tts": "apache-2.0",
    "AliAbdallah/egyptian-arabic-tts-chatterbox": "apache-2.0",
    "juliardi/chatterbox-multilingual-finetuned-arabic": "mit",
    "KickItLikeShika/NileTTS-XTTS": "apache-2.0",
    "Moeeldouma/arabic-tts-xtts-v2": "coqui-public-model-license",
    "onnx-community/chatterbox-multilingual-ONNX": "mit",
    "azeddinShr/Spark-TTS-Arabic-Complete": "apache-2.0",
    "hamdallah/Sofelia-TTS": "apache-2.0",
    "Qwen/Qwen3-TTS-12Hz-0.6B-Base": "apache-2.0",
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base": "apache-2.0",
    "itshamdi404/Egy_Arabic_Qwen3-TTS-12Hz-1.7B-Base": "apache-2.0",
    "vadimbelsky/qwen3-TTS-KSA": "apache-2.0",
    "vadimbelsky/qwen3.5-TTS-Emirati": "apache-2.0",
}
# Maps a Hugging Face repository id to a record with two string fields:
#   "license" - the license identifier recorded for the repository
#   "reason"  - why the entry is tracked as "page-only"
# NOTE(review): judging by the reason texts, these entries stand in for live
# metadata fetches; the consuming code is outside the visible chunk - confirm.
HF_PAGE_ONLY_METADATA = {
    "NAMAA-Space/Qari-OCR-0.4.0-VL-4B-Instruct": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only so restricted local sockets do not erase the core Arabic-book OCR evidence.",
    },
    "oddadmix/Katib-Qwen3.5-0.8B-0.1": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because it is a wired optional Arabic OCR sidecar.",
    },
    "HassanB4/Ketaba-OCR-LoRA": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because it is an external Arabic manuscript benchmark candidate.",
    },
    "HassanB4/Qari-OCR-LoRA": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because it is a secondary external QARI-family manuscript benchmark.",
    },
    "silma-ai/silma-tts": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only so restricted local sockets do not erase the core Arabic TTS evidence.",
    },
    "NAMAA-Space/NAMAA-Saudi-TTS": {
        "license": "mit",
        "reason": "Hugging Face model page was verified in research, but raw metadata may not be available in restricted environments.",
    },
    "tawkeed-sa/tawkeed-ocr": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page is public, but raw API/HTTP requests can return 401/404 for this namespace.",
    },
    "MohamedRashad/arabic-large-nougat": {
        "license": "gpl-3.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because the public model card is enough for license/status tracking and the model is benchmark-only.",
    },
    "baidu/Qianfan-OCR": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because this large benchmark-only model does not need local metadata fetches to block/promote the default stack.",
    },
    "AhmedEladl/saudi-tts": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because it is a dialect benchmark candidate, not a default production voice.",
    },
    "Moeeldouma/arabic-tts-xtts-v2": {
        "license": "coqui-public-model-license",
        "reason": "Hugging Face model page was verified in research; keep page-only because it is an XTTS-v2 dialect benchmark and inherits CPML base-license caution.",
    },
    "datalab-to/chandra": {
        "license": "openrail",
        "reason": "Hugging Face/official project metadata was verified in research; keep page-only because Chandra is benchmark-only and the weights are not the permissive default path.",
    },
    "mistralai/Voxtral-4B-TTS-2603": {
        "license": "cc-by-nc-4.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because this voice is explicitly non-commercial and external-only.",
    },
    "IbrahimSalah/Arabic-F5-TTS-v2": {
        "license": "fair-noncommercial-research-license",
        "reason": "Hugging Face model page was verified in research; keep page-only because it is non-commercial and requires diacritized Arabic.",
    },
    "Supertone/supertonic-3": {
        "license": "openrail",
        "reason": "Hugging Face model page was verified in research; keep page-only so the optional CPU voice benchmark remains tracked when live metadata fetches are blocked.",
    },
    "bilalRHCH/Arabic-text-to-speech": {
        "license": "apache-2.0",
        "reason": "Hugging Face model page was verified in research; keep page-only because this Arabic-focused OmniVoice package is benchmark-only and raw metadata fetches can fail locally.",
    }
}
|
|
# Location of the research watchlist document.
WATCHLIST_POLICY_PATH = ROOT_DIR / "docs" / "research-watchlist.md"
# Names allowed to be marked as a default in the watchlist.
# NOTE(review): the policy check that uses these sets is not visible here;
# the descriptions follow the constant names - confirm against the checker.
WATCHLIST_ALLOWED_DEFAULTS = {
    "SILMA TTS",
}
# Names allowed to be marked as wired-but-optional in the watchlist.
WATCHLIST_ALLOWED_WIRED_OPTIONAL = {
    "QARI-OCR 0.4",
    "PaddleOCR-VL-1.6",
    "KATIB 0.8B",
    "Arabic-GLM-OCR-v2",
    "Arabic-Qwen3.5-OCR-v4",
    "Ketaba-OCR LoRA",
    "Tawkeed OCR",
    "Baseer OCR V1.0",
    "Habibi-TTS MSA",
    "Supertonic 3",
}
# Maps a watchlist entry name to a short reason why it is benchmark-only.
# NOTE(review): the policy check that reads this mapping is outside the
# visible chunk - confirm how names and reasons are matched.
WATCHLIST_BENCHMARK_ONLY_REASONS = {
    "DeepSeek-OCR-2": "external general OCR benchmark",
    "aNS Qwen3-VL Arabic OCR v3": "fresh Qwen3-VL Arabic OCR benchmark with sparse production evidence",
    "Waraqon v3 Arabic OCR HTML Qari": "external Qari-family structured HTML OCR benchmark",
    "DeepSeek Arabic OCR v6": "external Arabic OCR benchmark",
    "Loay Arabic-OCR-DeepSeek-OCR-2": "external Arabic DeepSeek-OCR-2 layout benchmark",
    "Arabic-English handwritten OCR v3": "external handwriting/manuscript benchmark",
    "Arabic handwritten OCR 4-bit Qwen2.5-VL": "external quantized handwriting/manuscript benchmark",
    "NAKBA Arabic manuscript line OCR baseline": "line-level manuscript OCR benchmark with license confirmation",
    "HAFITH": "historical Arabic manuscript line OCR benchmark",
    "Glimpse RTL OCR": "Arabic/Persian RTL text-line OCR benchmark",
    "Arabic OCR Qwen2.5-VL GGUF": "external GGUF Arabic OCR benchmark with license confirmation",
    "Qwen3-VL Persian/Arabic line OCR": "line-level OCR benchmark",
    "Loay Arabic-OCR-Qwen2.5-VL-7B": "large external Arabic OCR benchmark",
    "DIMI Arabic OCR v2": "large external Arabic OCR benchmark",
    "AtlasOCR": "license/content-specific OCR watchlist",
    "NuExtract3": "external multilingual document OCR benchmark",
    "Qianfan-OCR": "large external multilingual document OCR benchmark",
    "Chandra OCR 2": "modified OpenRAIL structured-document OCR benchmark",
    "dots.ocr": "external multilingual document-layout OCR benchmark",
    "olmOCR Arabic LoRA v2": "full-page Arabic manuscript OCR benchmark with base-license/runtime confirmation",
    "Arabic Large Nougat": "GPL Arabic book OCR-to-Markdown benchmark",
    "DocTR Arabic FAST/PARSEQ": "classic Arabic OCR benchmark with recognition license confirmation",
    "Kraken/eScriptorium Arabic script": "historical Arabic-script OCR benchmark with model-license confirmation",
    "Kairawan/Qalamus manuscript OCR": "service-only Arabic manuscript OCR benchmark signal",
    "GLM-OCR Arabic/French documents": "external Arabic/French document OCR benchmark",
    "mimoha Arabic OCR": "sparse-card Arabic OCR watchlist",
    "oi-OCR": "external document parser benchmark",
    "Falcon-OCR": "external OCR benchmark",
    "Raqim post-OCR correction": "correction-risk OCR caution",
    "Arabic Legal Documents OCR 1.0": "Gemma-licensed domain-specific OCR caution",
    "Mishkala Tashkeel": "pronunciation preprocessor benchmark",
    "Tashkeel-350M": "larger pronunciation preprocessor benchmark",
    "Mushkil": "AraT5V2 pronunciation preprocessor benchmark",
    "Thaka KSAA-2026 speech diacritization": "research signal only",
    "3arab-TTS 500M": "new Arabic voice benchmark",
    "KaniTTS Arabic": "metadata/license uncertainty",
    "Emirati VITS Male": "dialect voice benchmark",
    "VoxCPM2": "large strong-worker voice benchmark",
    "Voxtral TTS": "non-commercial license",
    "OmniVoice": "external multilingual voice benchmark",
    "OmniVoice Arabic LoRA": "external Arabic adapter benchmark",
    "Arabic-text-to-speech OmniVoice": "external Arabic-focused OmniVoice benchmark",
    "Lahgtna OmniVoice v2": "dialect and license-uncertain voice benchmark",
    "TADA multilingual TTS": "Llama-licensed strong-worker voice benchmark",
    "Lahgtna Chatterbox": "dialect voice benchmark",
    "NAMAA-Saudi-TTS": "Saudi dialect voice benchmark",
    "NAMAA-Egyptian-TTS": "Egyptian dialect voice benchmark",
    "Saudi Chatterbox fine-tune": "Saudi dialect voice benchmark",
    "Saudi TTS": "Saudi dialect voice benchmark",
    "Egyptian Arabic Chatterbox": "Egyptian dialect voice benchmark",
    "NileTTS-XTTS": "Egyptian dialect voice benchmark",
    "Arabic XTTS-v2 Egyptian fine-tune": "CPML/base-license dialect XTTS benchmark",
    "Chatterbox-Multilingual": "external multilingual voice benchmark",
    "Chatterbox Arabic fine-tune": "MSA-focused Chatterbox Arabic adapter benchmark",
    "Chatterbox-Multilingual ONNX": "external CPU/ONNX multilingual voice benchmark",
    "tts-arabic-onnx": "license-unclear compact Arabic ONNX voice benchmark",
    "Spark-TTS Arabic": "external Arabic voice-cloning benchmark",
    "Sofelia-TTS": "Palestinian dialect voice benchmark",
    "Arabic-F5-TTS-v2": "non-commercial voice caution",
    "MOSS-TTS-Nano": "external CPU-friendly multilingual benchmark",
    "Qwen3-TTS": "not Arabic-ready from official released model cards",
    "Saudi Arabic Qwen3-TTS": "Saudi/Gulf dialect voice benchmark",
    "Emirati Qwen3.5-TTS": "Emirati/Gulf dialect voice benchmark",
}
# Lower-case phrases that mark a watchlist status as benchmark-only.
# NOTE(review): all phrases are lower-case, so the (unseen) checker presumably
# lower-cases the status text before matching - confirm.
WATCHLIST_BENCHMARK_STATUS_MARKERS = (
    "not wired",
    "benchmark",
    "research signal",
    "not default",
    "not arabic-ready",
    "correction can alter",
)
# Lower-case phrases that a benchmark-only entry must not carry.
# NOTE(review): inferred from the constant name; the checker is not visible.
WATCHLIST_PROHIBITED_DEFAULT_MARKERS = (
    "wired default",
    "default local",
    "production default",
)
|
|
|
|
@dataclass
class SourceCheck:
    """Result of a single verification check."""

    # Label identifying the check, e.g. "metadata freshness" or
    # "source marker: <marker>".
    name: str
    # True when the check passed.
    ok: bool
    # Free-form explanation, e.g. "present", "missing", or space-separated
    # key=value fields.
    detail: str
|
|
|
|
def format_report_date(value: date | None = None) -> str:
    """Render a date as ``"<Month name> <day>, <year>"`` without a zero-padded day.

    Args:
        value: Date to render; today's date is used when omitted.

    Returns:
        The formatted date, e.g. ``"January 5, 2025"``.
    """
    chosen = date.today() if value is None else value
    padded = chosen.strftime("%B %d, %Y")
    # strftime pads the day to two digits ("January 05"); drop that zero.
    return padded.replace(" 0", " ")
|
|
|
|
def parse_report_date(value: str) -> date | None:
    """Parse a ``"<Month name> <day>, <year>"`` string into a date.

    Surrounding whitespace and trailing periods are ignored.

    Args:
        value: Text such as ``"January 5, 2025."``.

    Returns:
        The parsed date, or ``None`` when the text does not match the format.
    """
    cleaned = value.strip().rstrip(".")
    try:
        parsed = datetime.strptime(cleaned, "%B %d, %Y")
    except ValueError:
        return None
    return parsed.date()
|
|
|
|
def metadata_refresh_date(text: str) -> date | None:
    """Extract the date from the first ``Last refreshed:`` line in *text*.

    Args:
        text: Document text to search.

    Returns:
        The parsed date, or ``None`` when no such line exists or its value
        cannot be parsed by ``parse_report_date``.
    """
    # MULTILINE anchors ^/$ to individual lines, so the label must start a line.
    found = re.search(r"^Last refreshed:\s*(.+)$", text, re.MULTILINE)
    return parse_report_date(found.group(1)) if found else None
|
|
|
|
def metadata_freshness_check(text: str, max_age_days: int = MAX_METADATA_AGE_DAYS) -> SourceCheck:
    """Check that the ``Last refreshed`` date in *text* is recent enough.

    Args:
        text: Metadata document text containing a ``Last refreshed:`` line.
        max_age_days: Largest accepted age in days (inclusive).

    Returns:
        A ``SourceCheck`` named ``"metadata freshness"``. It fails when the
        date is missing or invalid, lies in the future, or is older than
        *max_age_days*.
    """
    check_name = "metadata freshness"
    refreshed_on = metadata_refresh_date(text)
    if refreshed_on is None:
        return SourceCheck(check_name, False, "missing or invalid Last refreshed date")

    stamp = refreshed_on.isoformat()
    age_days = (date.today() - refreshed_on).days
    if age_days < 0:
        # A refresh date after today cannot be right; report it as a failure.
        return SourceCheck(check_name, False, f"future refresh date {stamp}")

    within_limit = age_days <= max_age_days
    detail = f"refreshed={stamp} ageDays={age_days} maxAgeDays={max_age_days}"
    return SourceCheck(check_name, within_limit, detail)
|
|
|
|
def parse_detail_fields(detail: str) -> dict[str, str]:
    """Parse whitespace-separated ``key=value`` tokens into a dict.

    Tokens without ``=`` are ignored. Only the first ``=`` in a token splits
    it, and a repeated key keeps the last value seen.
    """
    pairs = (token.partition("=") for token in detail.split() if "=" in token)
    return {key: value for key, _, value in pairs}
|
|
|
|
def extract_urls(paths: list[Path]) -> list[str]:
    """Collect the unique URLs found in the given files, in first-seen order.

    Args:
        paths: Files to scan; paths that do not exist are skipped.

    Returns:
        De-duplicated URLs with trailing ``.`` / ``,`` punctuation removed and
        anything from the first double quote onwards cut off.
    """
    urls: list[str] = []
    seen: set[str] = set()
    for path in paths:
        if not path.exists():
            continue
        content = path.read_text(encoding="utf-8", errors="replace")
        for match in URL_RE.findall(content):
            # URL_RE stops at whitespace and ) ] > ` but not at a double quote,
            # so a URL written as a quoted string (JSON values, quoted
            # Markdown) is captured together with its closing quote. A raw
            # double quote is never part of a valid URL, so cut the match there
            # before trimming sentence punctuation.
            url = match.split('"', 1)[0].rstrip(".,")
            if url not in seen:
                seen.add(url)
                urls.append(url)
    return urls
|
|
|
|
def check_required_markers(path: Path = ROOT_DIR / "docs" / "source-evidence.md") -> list[SourceCheck]:
    """Check that every entry of REQUIRED_SOURCE_MARKERS occurs in *path*.

    A missing file is treated as empty text, so every marker is reported as
    missing.
    """
    text = ""
    if path.exists():
        text = path.read_text(encoding="utf-8", errors="replace")

    results: list[SourceCheck] = []
    for marker in REQUIRED_SOURCE_MARKERS:
        found = marker in text
        results.append(SourceCheck(f"source marker: {marker}", found, "present" if found else "missing"))
    return results
|
|
|
|
def check_metadata_snapshot(path: Path = ROOT_DIR / "docs" / "huggingface-model-metadata.md") -> list[SourceCheck]:
    """Check the metadata snapshot for existence, freshness and markers.

    Returns:
        An existence check, the result of ``metadata_freshness_check``, then
        one check per entry of REQUIRED_METADATA_MARKERS. A missing file is
        treated as empty text.
    """
    text = ""
    if path.exists():
        text = path.read_text(encoding="utf-8", errors="replace")

    exists = SourceCheck("metadata snapshot exists", bool(text), str(path) if text else "missing")
    marker_checks = [
        SourceCheck(f"metadata marker: {marker}", marker in text, "present" if marker in text else "missing")
        for marker in REQUIRED_METADATA_MARKERS
    ]
    return [exists, metadata_freshness_check(text), *marker_checks]
|
|
|
|
def check_recommendation_report(path: Path = ROOT_DIR / "docs" / "recommended-free-stack.md") -> list[SourceCheck]:
    """Check that the recommendation report exists and holds every marker.

    Returns:
        An existence check followed by one check per entry of
        REQUIRED_RECOMMENDATION_MARKERS. A missing file is treated as empty.
    """
    text = path.read_text(encoding="utf-8", errors="replace") if path.exists() else ""
    results = [SourceCheck("recommendation report exists", bool(text), str(path) if text else "missing")]
    for marker in REQUIRED_RECOMMENDATION_MARKERS:
        found = marker in text
        outcome = "present" if found else "missing"
        results.append(SourceCheck(f"recommendation marker: {marker}", found, outcome))
    return results
|
|
|
|
def check_decision_card(
    markdown_path: Path = ROOT_DIR / "docs" / "recommended-decision-card.md",
    json_path: Path = ROOT_DIR / "docs" / "recommended-decision-card.json",
) -> list[SourceCheck]:
    """Check the decision card files and their required markers.

    Markers are searched in the Markdown and JSON text joined together, so a
    marker may appear in either file.

    Returns:
        Two existence checks (Markdown, JSON) followed by one check per entry
        of REQUIRED_DECISION_CARD_MARKERS.
    """

    def read_or_empty(source: Path) -> str:
        # A missing file is treated as empty text rather than an error.
        if not source.exists():
            return ""
        return source.read_text(encoding="utf-8", errors="replace")

    markdown = read_or_empty(markdown_path)
    json_text = read_or_empty(json_path)
    combined = f"{markdown}\n{json_text}"

    results = [
        SourceCheck("decision card markdown exists", bool(markdown), str(markdown_path) if markdown else "missing"),
        SourceCheck("decision card json exists", bool(json_text), str(json_path) if json_text else "missing"),
    ]
    for marker in REQUIRED_DECISION_CARD_MARKERS:
        found = marker in combined
        results.append(SourceCheck(f"decision card marker: {marker}", found, "present" if found else "missing"))
    return results
|
|
|
|
def check_watchlist_command_markers(path: Path = WATCHLIST_POLICY_PATH) -> list[SourceCheck]:
    """Check the watchlist document for its benchmark command markers.

    Returns:
        A check for the ``## Benchmark Steps`` heading followed by one check
        per entry of REQUIRED_WATCHLIST_COMMAND_MARKERS. A missing file is
        treated as empty text.
    """
    text = ""
    if path.exists():
        text = path.read_text(encoding="utf-8", errors="replace")

    has_section = "## Benchmark Steps" in text
    # The detail shows the path whenever the file has content, whether or not
    # the heading was found.
    section_detail = str(path) if text else "missing"
    marker_checks = [
        SourceCheck(
            f"watchlist command marker: {marker}",
            marker in text,
            "present" if marker in text else "missing",
        )
        for marker in REQUIRED_WATCHLIST_COMMAND_MARKERS
    ]
    return [SourceCheck("watchlist command section exists", has_section, section_detail), *marker_checks]
|
|
|
|
def check_workflow_doc_markers(paths: list[Path] | None = None) -> list[SourceCheck]:
    """Check each workflow document for existence and required markers.

    Args:
        paths: Documents to inspect; WORKFLOW_DOC_PATHS is used when this is
            ``None`` or empty.

    Returns:
        For every document, an existence check followed by one check per entry
        of REQUIRED_WORKFLOW_MARKERS.
    """
    results: list[SourceCheck] = []
    for doc in paths or WORKFLOW_DOC_PATHS:
        body = doc.read_text(encoding="utf-8", errors="replace") if doc.exists() else ""
        results.append(SourceCheck(f"workflow doc exists: {doc.name}", bool(body), str(doc) if body else "missing"))
        results.extend(
            SourceCheck(
                f"workflow doc marker: {doc.name}: {marker}",
                marker in body,
                "present" if marker in body else "missing",
            )
            for marker in REQUIRED_WORKFLOW_MARKERS
        )
    return results
|
|
|
|
def parse_markdown_table_rows(text: str) -> list[dict[str, str]]:
    """Turn pipe-delimited Markdown table lines into row dictionaries.

    The first table-like line supplies the (lower-cased) column names, which
    are then used for every later table line in *text*. Separator rows such as
    ``| --- | :---: |`` and rows whose cell count differs from the header are
    skipped.

    Returns:
        One ``{column name: cell text}`` dict per accepted data row.
    """
    separator_cell = re.compile(r":?-{3,}:?")
    column_names: list[str] = []
    parsed: list[dict[str, str]] = []
    for candidate in map(str.strip, text.splitlines()):
        if not (candidate.startswith("|") and candidate.endswith("|")):
            continue
        cells = [piece.strip() for piece in candidate.strip("|").split("|")]
        if all(separator_cell.fullmatch(cell) for cell in cells):
            continue
        if not column_names:
            # First non-separator table line seen: treat it as the header.
            column_names = [cell.lower() for cell in cells]
        elif len(cells) == len(column_names):
            parsed.append(dict(zip(column_names, cells)))
    return parsed
|
|
|
|
def status_is_benchmark_only(status: str) -> bool:
    """Report whether a watchlist status marks a candidate as benchmark-only.

    The comparison is case-insensitive. A status qualifies when it contains at
    least one entry of WATCHLIST_BENCHMARK_STATUS_MARKERS and no entry of
    WATCHLIST_PROHIBITED_DEFAULT_MARKERS.
    """
    lowered = status.lower()
    for banned in WATCHLIST_PROHIBITED_DEFAULT_MARKERS:
        if banned in lowered:
            return False
    return any(marker in lowered for marker in WATCHLIST_BENCHMARK_STATUS_MARKERS)
|
|
|
|
def check_watchlist_policy(path: Path = WATCHLIST_POLICY_PATH) -> list[SourceCheck]:
    """Check the status of each watchlist candidate against its policy.

    The Markdown table in *path* is parsed and indexed by its ``candidate``
    column. Checks are emitted in this order:

    1. whether any table rows were parsed;
    2. each candidate in WATCHLIST_BENCHMARK_ONLY_REASONS must be listed with
       a benchmark-only status;
    3. each candidate in WATCHLIST_ALLOWED_DEFAULTS must be listed with a
       status containing ``wired default``;
    4. each candidate in WATCHLIST_ALLOWED_WIRED_OPTIONAL must be listed with
       a status that contains ``wired optional`` or is benchmark-only.
    """
    text = ""
    if path.exists():
        text = path.read_text(encoding="utf-8", errors="replace")
    rows = parse_markdown_table_rows(text)
    by_candidate = {row.get("candidate", ""): row for row in rows}
    results = [SourceCheck("research watchlist table parsed", bool(rows), f"{len(rows)} rows")]

    for candidate, reason in WATCHLIST_BENCHMARK_ONLY_REASONS.items():
        label = f"watchlist policy: {candidate}"
        row = by_candidate.get(candidate)
        if row is None:
            results.append(SourceCheck(label, False, f"missing; reason={reason}"))
        else:
            status = row.get("status", "")
            detail = f"reason={reason} status={status or '-'}"
            results.append(SourceCheck(label, status_is_benchmark_only(status), detail))

    for candidate in WATCHLIST_ALLOWED_DEFAULTS:
        row = by_candidate.get(candidate)
        status = row.get("status", "") if row else ""
        is_default = row is not None and "wired default" in status.lower()
        results.append(SourceCheck(f"watchlist default allowed: {candidate}", is_default, f"status={status or '-'}"))

    for candidate in WATCHLIST_ALLOWED_WIRED_OPTIONAL:
        row = by_candidate.get(candidate)
        status = row.get("status", "") if row else ""
        acceptable = row is not None and (
            "wired optional" in status.lower() or status_is_benchmark_only(status)
        )
        results.append(
            SourceCheck(f"watchlist optional/default policy: {candidate}", acceptable, f"status={status or '-'}")
        )

    return results
|
|
|
|
def check_url(url: str, timeout: float = 12.0) -> SourceCheck:
    """Fetch *url* and report whether it answered with HTTP 2xx/3xx.

    Every failure is turned into a failed check instead of an exception, so
    one bad link cannot abort a run over many URLs.

    Args:
        url: Address to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        A check named after the URL. The detail is ``HTTP <status>`` when a
        response was received, otherwise a short description of the failure.
    """
    # Imported locally because http.client is not among the file's top-level
    # imports.
    from http.client import HTTPException

    try:
        # Building the request can itself raise ValueError for a malformed
        # URL, so it belongs inside the guarded region.
        request = Request(url, headers={"User-Agent": "ArabicAudioReaderSourceCheck/1.0"})
        with urlopen(request, timeout=timeout) as response:
            status = getattr(response, "status", 200)
            ok = 200 <= int(status) < 400
    except HTTPError as exc:
        # HTTPError subclasses URLError, so it has to be handled first.
        return SourceCheck(url, False, f"HTTP {exc.code}")
    except URLError as exc:
        return SourceCheck(url, False, f"URL error: {exc.reason}")
    except TimeoutError:
        return SourceCheck(url, False, "timeout")
    except OSError as exc:
        # Connection resets, TLS failures and similar socket-level errors.
        return SourceCheck(url, False, f"connection error: {exc}")
    except (HTTPException, ValueError) as exc:
        # Protocol-level failures and URLs that cannot be encoded or parsed
        # (UnicodeEncodeError is a ValueError).
        return SourceCheck(url, False, f"invalid URL or response: {exc}")
    return SourceCheck(url, ok, f"HTTP {status}")
|
|
|
|
def huggingface_model_id(url: str) -> str | None:
    """Derive the ``owner/name`` model ID from a huggingface.co URL.

    Returns:
        The first two path segments joined by ``/``, or ``None`` when the URL
        does not start with ``https://huggingface.co/``, has fewer than two
        path segments, or its first segment is ``docs``, ``datasets`` or
        ``spaces``.
    """
    host_prefix = "https://huggingface.co/"
    if not url.startswith(host_prefix):
        return None
    segments = url[len(host_prefix):].strip("/").split("/")
    if len(segments) < 2:
        return None
    namespace, repo = segments[0], segments[1]
    if namespace in ("docs", "datasets", "spaces"):
        return None
    return f"{namespace}/{repo}"
|
|
|
|
def model_license(metadata: dict[str, object]) -> str:
    """Return the lower-cased license recorded in model metadata.

    A non-blank string under ``cardData["license"]`` takes priority. Failing
    that, the first string in ``tags`` that starts with ``license:`` is used.

    Returns:
        The license identifier, or ``""`` when neither source provides one.
    """
    card = metadata.get("cardData")
    declared = card.get("license") if isinstance(card, dict) else None
    if isinstance(declared, str):
        cleaned = declared.strip()
        if cleaned:
            return cleaned.lower()

    tag_list = metadata.get("tags")
    if not isinstance(tag_list, list):
        return ""
    license_tags = (entry for entry in tag_list if isinstance(entry, str) and entry.startswith("license:"))
    first_tag = next(license_tags, None)
    if first_tag is None:
        return ""
    return first_tag.split(":", 1)[1].strip().lower()
|
|
|
|
def fetch_huggingface_model(model_id: str, timeout: float = 12.0) -> dict[str, object]:
    """Fetch a model's JSON metadata from ``huggingface.co/api/models``.

    A bearer token is sent when ``HF_API_TOKEN`` or ``HUGGINGFACE_API_TOKEN``
    is set in the environment.

    Args:
        model_id: Model ID interpolated into the API URL.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON object, or ``{}`` when the payload is not an object.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
        Any exception raised by ``urlopen`` is propagated unchanged.
    """
    token = os.getenv("HF_API_TOKEN") or os.getenv("HUGGINGFACE_API_TOKEN")
    headers = {"User-Agent": "ArabicAudioReaderSourceCheck/1.0"}
    if token:
        headers["Authorization"] = f"Bearer {token}"

    api_request = Request(f"https://huggingface.co/api/models/{model_id}", headers=headers)
    with urlopen(api_request, timeout=timeout) as response:
        body = response.read().decode("utf-8", errors="replace")
        payload = json.loads(body)
    if isinstance(payload, dict):
        return payload
    return {}
|
|
|
|
def fetch_huggingface_model_page_metadata(model_id: str, timeout: float = 12.0) -> dict[str, object]:
    """Build minimal model metadata by scraping the public model page.

    The page HTML is searched for a ``License:`` label, first in a
    ``</span><span>`` layout and then as plain ``License: <id>`` text.

    Returns:
        A metadata dict with the requested ID, ``private`` and ``disabled``
        set to ``False``, ``lastModified`` set to ``"page-fallback"``, and
        ``cardData`` holding the lower-cased license when one was found.
    """
    page_request = Request(
        f"https://huggingface.co/{model_id}",
        headers={"User-Agent": "ArabicAudioReaderSourceCheck/1.0"},
    )
    with urlopen(page_request, timeout=timeout) as response:
        html = response.read().decode("utf-8", errors="replace")

    # Tried in order; the first pattern that matches supplies the license.
    license_patterns = (
        r"License:\s*</span>\s*<span[^>]*>\s*([^<\s]+)",
        r"License:\s*([A-Za-z0-9_.+-]+)",
    )
    license_value = ""
    for pattern in license_patterns:
        hit = re.search(pattern, html, re.IGNORECASE)
        if hit is not None:
            license_value = hit.group(1).strip().lower()
            break

    card_data = {"license": license_value} if license_value else {}
    return {
        "id": model_id,
        "private": False,
        "disabled": False,
        "lastModified": "page-fallback",
        "cardData": card_data,
    }
|
|
|
|
def _huggingface_license_check(model_id: str, license_value: str) -> SourceCheck | None:
    """Compare a license with HF_EXPECTED_LICENSES; ``None`` if none is expected."""
    expected_license = HF_EXPECTED_LICENSES.get(model_id)
    if not expected_license:
        return None
    return SourceCheck(
        f"huggingface license: {model_id}",
        license_value == expected_license,
        f"expected={expected_license} actual={license_value or '-'}",
    )


def collect_huggingface_metadata_checks(timeout: float = 12.0) -> list[SourceCheck]:
    """Check Hugging Face metadata for every model in KEY_SOURCE_URLS.

    For each distinct model ID derived from the key source URLs:

    * models listed in HF_PAGE_ONLY_METADATA are reported from that recorded
      data without any network request;
    * otherwise the API metadata is fetched, falling back to scraping the
      model page when the API answers 401 or 404;
    * the metadata check passes when the reported ID matches and the model is
      neither private nor disabled;
    * a license check is added when HF_EXPECTED_LICENSES has an entry.

    A fetch failure produces a failed check for that model instead of
    aborting the whole run.

    Args:
        timeout: Timeout in seconds passed to the fetch helpers.
    """
    # Imported locally because http.client is not among the file's top-level
    # imports.
    from http.client import HTTPException

    # dict.fromkeys drops duplicates while keeping first-seen order.
    model_ids = list(
        dict.fromkeys(
            model_id
            for model_id in (huggingface_model_id(url) for url in KEY_SOURCE_URLS.values())
            if model_id
        )
    )

    checks: list[SourceCheck] = []
    for model_id in model_ids:
        label = f"huggingface metadata: {model_id}"

        if model_id in HF_PAGE_ONLY_METADATA:
            fallback = HF_PAGE_ONLY_METADATA[model_id]
            license_value = str(fallback.get("license") or "")
            checks.append(
                SourceCheck(
                    label,
                    True,
                    f"id={model_id} license={license_value or '-'} lastModified=page-only",
                )
            )
            license_check = _huggingface_license_check(model_id, license_value)
            if license_check is not None:
                checks.append(license_check)
            continue

        try:
            metadata = fetch_huggingface_model(model_id, timeout=timeout)
        except HTTPError as exc:
            # HTTPError subclasses OSError, so it has to be handled first.
            if exc.code not in {401, 404}:
                checks.append(SourceCheck(label, False, f"HTTP {exc.code}"))
                continue
            try:
                metadata = fetch_huggingface_model_page_metadata(model_id, timeout=timeout)
            except Exception:
                # Best-effort fallback: whatever goes wrong here, report the
                # original API status for this model.
                checks.append(SourceCheck(label, False, f"HTTP {exc.code}"))
                continue
        except json.JSONDecodeError:
            checks.append(SourceCheck(label, False, "invalid JSON"))
            continue
        except (OSError, HTTPException) as exc:
            # Covers URLError and TimeoutError as before, plus connection
            # resets and protocol errors raised while reading the response.
            checks.append(SourceCheck(label, False, str(exc)))
            continue

        reported_id = str(metadata.get("id") or metadata.get("modelId") or "")
        private = bool(metadata.get("private"))
        disabled = bool(metadata.get("disabled"))
        last_modified = str(metadata.get("lastModified") or metadata.get("createdAt") or "unknown")
        license_value = model_license(metadata)
        checks.append(
            SourceCheck(
                label,
                reported_id == model_id and not private and not disabled,
                f"id={reported_id or '-'} license={license_value or '-'} lastModified={last_modified}",
            )
        )

        license_check = _huggingface_license_check(model_id, license_value)
        if license_check is not None:
            checks.append(license_check)
    return checks
|
|
|
|
def collect_checks(
    paths: list[Path] | None = None,
    check_links: bool = False,
    timeout: float = 12.0,
    metadata_path: Path | None = None,
) -> list[SourceCheck]:
    """Run the document checks and, optionally, fetch every extracted URL.

    Args:
        paths: Documents scanned for URLs; DEFAULT_DOCS when ``None`` or empty.
        check_links: When true, each extracted URL is requested via
            ``check_url``.
        timeout: Timeout in seconds for each link request.
        metadata_path: Metadata snapshot to inspect; the default snapshot
            under ``docs`` is used when omitted.

    Returns:
        Marker, snapshot, report, decision-card, watchlist and workflow
        checks, then a ``source urls found`` check and any link checks.
    """
    docs = paths or DEFAULT_DOCS
    snapshot_path = metadata_path or ROOT_DIR / "docs" / "huggingface-model-metadata.md"

    results: list[SourceCheck] = [
        *check_required_markers(),
        *check_metadata_snapshot(snapshot_path),
        *check_recommendation_report(),
        *check_decision_card(),
        *check_watchlist_policy(),
        *check_watchlist_command_markers(),
        *check_workflow_doc_markers(),
    ]

    urls = extract_urls(docs)
    results.append(SourceCheck("source urls found", bool(urls), f"{len(urls)} unique URLs"))
    if check_links:
        for url in urls:
            results.append(check_url(url, timeout=timeout))
    return results
|
|
|
|
def collect_command_checks(
    *,
    check_links: bool = False,
    check_key_links: bool = False,
    check_representative_links: bool = False,
    check_hf_metadata: bool = False,
    write_hf_metadata_report: Path | None = None,
    timeout: float = 12.0,
) -> list[SourceCheck]:
    """Assemble every check selected by the command-line options.

    Hugging Face metadata is collected first when it was requested or a
    report path was given. The report, if any, is written before the document
    checks run, and its path is passed on as the metadata snapshot to
    inspect.

    Returns:
        Document checks, key-source checks, optional representative-link
        checks, then the Hugging Face checks when they were collected.
    """
    hf_checks: list[SourceCheck] | None = None
    wants_hf = check_hf_metadata or write_hf_metadata_report
    if wants_hf:
        hf_checks = collect_huggingface_metadata_checks(timeout=timeout)
        if write_hf_metadata_report:
            write_huggingface_metadata_report(write_hf_metadata_report, hf_checks)

    combined = collect_checks(
        check_links=check_links,
        timeout=timeout,
        metadata_path=write_hf_metadata_report,
    )
    combined += collect_key_source_checks(check_links=check_key_links, timeout=timeout)
    if check_representative_links:
        combined += collect_representative_link_checks(timeout=timeout)
    if hf_checks is not None:
        combined += hf_checks
    return combined
|
|
|
|
def representative_source_urls(urls: list[str]) -> list[str]:
    """Pick one URL per major source domain, in preferred-host order.

    A URL belongs to a host when its parsed hostname equals that host or is a
    subdomain of it. Matching on the hostname rather than on the whole URL
    keeps a domain name that merely appears in a path or query string (for
    example ``https://example.com/?ref=github.com``) from being selected.

    Args:
        urls: Candidate URLs, in priority order.

    Returns:
        The first matching URL for each preferred host that has one.
    """
    from urllib.parse import urlsplit

    def hostname_of(url: str) -> str:
        # urlsplit raises ValueError for some malformed netlocs; such URLs
        # simply match no host.
        try:
            return (urlsplit(url).hostname or "").lower()
        except ValueError:
            return ""

    preferred_hosts = ["huggingface.co", "github.com", "paddleocr.ai", "arxiv.org", "vercel.com"]
    hosts_by_url = [(url, hostname_of(url)) for url in urls]
    selected: list[str] = []
    for host in preferred_hosts:
        match = next(
            (url for url, name in hosts_by_url if name == host or name.endswith("." + host)),
            None,
        )
        if match and match not in selected:
            selected.append(match)
    return selected
|
|
|
|
def collect_representative_link_checks(timeout: float = 8.0) -> list[SourceCheck]:
    """Fetch one representative URL per major domain found in DEFAULT_DOCS.

    Returns:
        A check recording how many URLs were selected, followed by one
        ``check_url`` result per selected URL.
    """
    chosen = representative_source_urls(extract_urls(DEFAULT_DOCS))
    results = [SourceCheck("representative source urls selected", bool(chosen), f"{len(chosen)} URLs")]
    for url in chosen:
        results.append(check_url(url, timeout=timeout))
    return results
|
|
|
|
def collect_key_source_checks(paths: list[Path] | None = None, timeout: float = 8.0, check_links: bool = False) -> list[SourceCheck]:
    """Check that each URL in KEY_SOURCE_URLS is cited in the documents.

    Args:
        paths: Documents scanned for URLs; DEFAULT_DOCS when ``None`` or empty.
        timeout: Timeout in seconds for each link request.
        check_links: When true, every listed key URL is also fetched.

    Returns:
        A ``key source listed`` check per key URL, each followed by a
        ``key source reachable`` check when the URL is listed and
        *check_links* is set.
    """
    listed_urls = set(extract_urls(paths or DEFAULT_DOCS))
    results: list[SourceCheck] = []
    for name, url in KEY_SOURCE_URLS.items():
        is_listed = url in listed_urls
        results.append(SourceCheck(f"key source listed: {name}", is_listed, url if is_listed else f"missing {url}"))
        if not (is_listed and check_links):
            continue
        probe = check_url(url, timeout=timeout)
        results.append(SourceCheck(f"key source reachable: {name}", probe.ok, probe.detail))
    return results
|
|
|
|
def summarize(checks: list[SourceCheck]) -> dict[str, object]:
    """Condense check results into a JSON-serialisable summary.

    Returns:
        A dict with ``ready`` (true when nothing failed), ``counts`` holding
        the ``PASS`` and ``FAIL`` totals, and ``checks`` with every check
        converted to a plain dict.
    """
    serialized = [asdict(entry) for entry in checks]
    pass_count = len([entry for entry in checks if entry.ok])
    fail_count = len(checks) - pass_count
    return {
        "ready": not fail_count,
        "counts": {"PASS": pass_count, "FAIL": fail_count},
        "checks": serialized,
    }
|
|
|
|
def build_huggingface_metadata_report(checks: list[SourceCheck], refreshed_at: date | None = None) -> str:
    """Render Hugging Face checks as a Markdown metadata report.

    ``huggingface metadata:`` checks create one table row per model, filled
    from the ``key=value`` fields in the check detail. ``huggingface
    license:`` checks set that row's license-check column. Models in
    HF_PAGE_ONLY_METADATA replace a missing or failed row with their recorded
    page-only data.

    Args:
        checks: Checks to render; names without either prefix are ignored.
        refreshed_at: Date for the ``Last refreshed`` line, passed to
            ``format_report_date``.

    Returns:
        The Markdown document, with rows sorted case-insensitively by model
        ID and a trailing newline.
    """
    metadata_prefix = "huggingface metadata: "
    license_prefix = "huggingface license: "

    table: dict[str, dict[str, str]] = {}
    for entry in checks:
        label = entry.name
        if label.startswith(metadata_prefix):
            model_id = label.removeprefix(metadata_prefix)
            parsed = parse_detail_fields(entry.detail)
            table[model_id] = {
                "model": model_id,
                "status": "PASS" if entry.ok else "FAIL",
                "reportedId": parsed.get("id", "-"),
                "license": parsed.get("license", "-"),
                "lastModified": parsed.get("lastModified", "-"),
                "licenseCheck": "-",
            }
            continue
        if not label.startswith(license_prefix):
            continue
        model_id = label.removeprefix(license_prefix)
        if model_id not in table:
            # License result without a metadata row: start from placeholders.
            table[model_id] = {
                "model": model_id,
                "status": "-",
                "reportedId": "-",
                "license": "-",
                "lastModified": "-",
                "licenseCheck": "-",
            }
        table[model_id]["licenseCheck"] = "PASS" if entry.ok else f"FAIL ({entry.detail})"

    for model_id, fallback in HF_PAGE_ONLY_METADATA.items():
        current = table.get(model_id)
        needs_page_only_row = not current or current.get("status") == "FAIL"
        if not needs_page_only_row:
            continue
        license_value = str(fallback.get("license") or "-")
        expected_license = HF_EXPECTED_LICENSES.get(model_id)
        table[model_id] = {
            "model": model_id,
            "status": "PASS",
            "reportedId": model_id,
            "license": license_value,
            "lastModified": "page-only",
            "licenseCheck": "PASS" if expected_license == license_value else "-",
        }

    header = [
        "# Hugging Face Model Metadata",
        "",
        f"Last refreshed: {format_report_date(refreshed_at)}.",
        "",
        "Generated by `scripts/check_research_sources.py --check-hf-metadata`.",
        "",
        "Rows marked `page-only` use verified public model-page evidence when live Hugging Face API/socket metadata is unavailable in the local environment.",
        "",
        "| Model | Status | Reported ID | License | License Check | Last Modified |",
        "| --- | --- | --- | --- | --- | --- |",
    ]
    column_order = ("model", "status", "reportedId", "license", "licenseCheck", "lastModified")
    ordered_rows = sorted(table.values(), key=lambda item: item["model"].lower())
    body = ["| " + " | ".join(row[column] for column in column_order) + " |" for row in ordered_rows]
    return "\n".join(header + body) + "\n"
|
|
|
|
def write_huggingface_metadata_report(path: Path, checks: list[SourceCheck]) -> None:
    """Write the Hugging Face metadata report for *checks* to *path*.

    Missing parent directories are created first; the file is written as
    UTF-8.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    report = build_huggingface_metadata_report(checks)
    path.write_text(report, encoding="utf-8")
|
|
|
|
def main() -> None:
    """Parse command-line options, run the selected checks and print results.

    Output is either the JSON summary (``--json``) or one ``PASS``/``FAIL``
    line per check.

    Raises:
        SystemExit: With status 1 when at least one check failed.
    """
    parser = argparse.ArgumentParser(description="Check research source coverage for the Arabic audio reader.")
    boolean_flags = (
        ("--check-links", "Fetch each source URL and require HTTP 2xx/3xx."),
        ("--check-representative-links", "Fetch one representative URL from the major source domains."),
        ("--check-key-links", "Fetch the exact key OCR/TTS/hosting source URLs used by the recommendation."),
        (
            "--check-hf-metadata",
            "Fetch Hugging Face model metadata for key source URLs and verify known licenses/private/disabled state.",
        ),
    )
    for flag, help_text in boolean_flags:
        parser.add_argument(flag, action="store_true", help=help_text)
    parser.add_argument(
        "--write-hf-metadata-report",
        type=Path,
        help="Write a Markdown table of Hugging Face model IDs, licenses, and last-modified dates.",
    )
    parser.add_argument("--timeout", type=float, default=12.0, help="Per-link timeout in seconds.")
    parser.add_argument("--json", action="store_true", help="Print JSON.")
    options = parser.parse_args()

    results = collect_command_checks(
        check_links=options.check_links,
        check_key_links=options.check_key_links,
        check_representative_links=options.check_representative_links,
        check_hf_metadata=options.check_hf_metadata,
        write_hf_metadata_report=options.write_hf_metadata_report,
        timeout=options.timeout,
    )
    summary = summarize(results)

    if options.json:
        print(json.dumps(summary, indent=2))
    else:
        for result in results:
            verdict = "PASS" if result.ok else "FAIL"
            print(f"{verdict:<4} {result.name} {result.detail}")

    if not summary["ready"]:
        raise SystemExit(1)
|
|
|
|
# Run the command-line checker only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|