Deploy Arabic Audio Reader worker
Browse files
- .export-manifest.json +23 -22
- Dockerfile +3 -1
- README.md +4 -2
- app/main.py +13 -7
- docs/best-free-arabic-pdf-audio-stack.md +1 -1
- docs/father-user-guide.md +5 -5
- docs/live-deployment-checklist.md +4 -2
- docs/ocr-readability-benchmark.md +42 -0
- docs/production-worker-architecture.md +2 -2
- docs/recommended-decision-card.json +1 -1
- docs/recommended-decision-card.md +1 -1
- docs/recommended-free-stack.md +2 -2
- scripts/audit_goal_readiness.py +3 -3
- scripts/check_deployment_readiness.py +1 -1
- scripts/check_research_sources.py +3 -3
- scripts/deployment_handoff.py +4 -2
- scripts/export_hf_space.py +4 -2
- scripts/research_watchlist.py +3 -3
- scripts/setup_paddleocr.ps1 +1 -1
- scripts/setup_paddleocr.sh +1 -1
- scripts/validate_deployment_env.py +16 -2
- scripts/verify_site.py +2 -1
- scripts/verify_worker.py +2 -1
- static/app.js +14 -9
- static/index.html +3 -3
.export-manifest.json
CHANGED
|
@@ -1,17 +1,18 @@
|
|
| 1 |
{
|
| 2 |
"files": {
|
| 3 |
-
"Dockerfile": "
|
| 4 |
"api/index.py": "b0fd5c43eadd241aea79131d12ea40fe032a97f06410ce1b607e81c45f33d6f2",
|
| 5 |
"app/__init__.py": "7eb70257593da06f682a3ddda54a9d260d4fc514f645237f5ca74b08f8da61a6",
|
| 6 |
-
"app/main.py": "
|
| 7 |
-
"docs/best-free-arabic-pdf-audio-stack.md": "
|
| 8 |
-
"docs/father-user-guide.md": "
|
| 9 |
"docs/huggingface-model-metadata.md": "4590229078c2048b184787e85e5a00dd687ef5fc90e8d8d0af32538b38363cc2",
|
| 10 |
-
"docs/live-deployment-checklist.md": "
|
| 11 |
-
"docs/
|
| 12 |
-
"docs/
|
| 13 |
-
"docs/recommended-decision-card.
|
| 14 |
-
"docs/recommended-
|
|
|
|
| 15 |
"docs/research-watchlist.md": "9ea43e6f3f1d434e514e451ccc8974faa469f4533d0ffe924c8db3d80755e592",
|
| 16 |
"docs/source-evidence.md": "f308cbd0dc83a5cf34ceb5a010c354cda9acbc690e9b2c93a80cdb519ec07976",
|
| 17 |
"requirements-arabic-glm-ocr.txt": "b4c950c1ef221bfe6e2deda1a93605377619059eb02019931dbfe1fe7bd49e10",
|
|
@@ -27,21 +28,21 @@
|
|
| 27 |
"requirements.txt": "59d736ba33b31a828a5987f3477bec3f7ed6f60ceafaf730ef027a0dbbcd0def",
|
| 28 |
"scripts/arabic_glm_ocr_extract.py": "f56578018b81ac8cd7928baa9576878791214e5659f972520e8817370a9d39ad",
|
| 29 |
"scripts/arabic_qwen_ocr_extract.py": "485e9f3cdf2ced92c666b2f483d4aa37a65cb34052a4967beac7183d02c9ddcc",
|
| 30 |
-
"scripts/audit_goal_readiness.py": "
|
| 31 |
"scripts/baseer_ocr_extract.py": "056ca9cc33591db804639030a16d9635931b720d0d499b444ed6e7d0a653605a",
|
| 32 |
"scripts/benchmark_ocr.py": "b5ffb17845a7945b2a5c52e38bfabb6d82f3a8fbc8f2cdd5528843e09ad4deb4",
|
| 33 |
"scripts/benchmark_voices.py": "705bdfb6260fe90a4a68d9d2455953ea7221d282bbf0cc1cc4fa32cd5ed10205",
|
| 34 |
-
"scripts/check_deployment_readiness.py": "
|
| 35 |
-
"scripts/check_research_sources.py": "
|
| 36 |
"scripts/check_test_environment.py": "7150b13aabad03a9b7ae2527f7cdd942511658eeafb76b41128eab7e0a6dc1ea",
|
| 37 |
"scripts/cleanup_outputs.py": "de8beacd9b8511dc3775d9c232a2c86dc5cbe91c532cb3c130c304117f0d6bfd",
|
| 38 |
"scripts/configure_vercel_worker.py": "76051d6853a60df2ff614b5aa629bac241a32c85baeaf6234f734bac1f49a61f",
|
| 39 |
"scripts/deploy_hf_space.py": "173be92b31c6bcb854eaf23004b0cf4029c79498bd1eff27e6b54c32370e2e22",
|
| 40 |
-
"scripts/deployment_handoff.py": "
|
| 41 |
"scripts/deployment_status.py": "0437afcb47147b3825978d63ff36d38157135b2faf1ee658c203ae77735a3418",
|
| 42 |
"scripts/dry_run_pdf.py": "f162b566fb51d824d484a479a1337d4ee7e9a6762c0f2ccb5acd3173c1cf1bf8",
|
| 43 |
"scripts/easyocr_extract.py": "5a728a80bac7d49281113410316b5004cb6538ad50d1bc7c431eaea9c006ada9",
|
| 44 |
-
"scripts/export_hf_space.py": "
|
| 45 |
"scripts/export_ocr_sample_images.py": "eaf2ed2dca63f649317d283a1339ddae64d79b8d79eb42fe601d3b4a92ce8f45",
|
| 46 |
"scripts/export_tts_sample.py": "477ae98e81c60bc3336012167355b0b8724cd1988047e2af41f443b23de7e9f3",
|
| 47 |
"scripts/finish_live_deployment.py": "e3e7e2600071be49b747cf61f9788427339da2d2017825bda12048285e5232f8",
|
|
@@ -58,7 +59,7 @@
|
|
| 58 |
"scripts/prove_local_readiness.py": "7ccb00fc2d4aa086f8cec5ac9886f87fe044eb46c6f9a7fa7c5eb095d44095ad",
|
| 59 |
"scripts/qari_ocr_extract.py": "82ac22dae63e415c9795f4f597c000beba32af028b0a5afc749ba11bfebe2b22",
|
| 60 |
"scripts/refresh_research_evidence.py": "52209edf8485ff459bda6ada6cb1b978f12b22cdacb03413d293dab9245303fc",
|
| 61 |
-
"scripts/research_watchlist.py": "
|
| 62 |
"scripts/score_external_ocr.py": "e4b36187949dd38eaa9395979a97ccef88f7aff24a1404e3bf0793776eea136f",
|
| 63 |
"scripts/score_tts_preprocessor.py": "7b9afce94bbc914b884a09bb83bd4d267770f6b712ffb5721cfef7c34e2718a2",
|
| 64 |
"scripts/score_voice_listening.py": "22287145ab5677c4e4383a01dc9cb2090f3f22e20a37e0902bcfed8df7c7e5f6",
|
|
@@ -73,8 +74,8 @@
|
|
| 73 |
"scripts/setup_habibi.sh": "a737e7a8266fa47eb1eba3deeed52ff2bc91042646fff3b721afe7edefaf41d8",
|
| 74 |
"scripts/setup_katib_ocr.ps1": "10b3e2a0781bcccec8a344f79b2639d13abcf26904025bf91ed1646fec34115e",
|
| 75 |
"scripts/setup_katib_ocr.sh": "ee59fccd22a76c1d773c40d1e734b6f33e12d04a3e65294c635892d019c9f673",
|
| 76 |
-
"scripts/setup_paddleocr.ps1": "
|
| 77 |
-
"scripts/setup_paddleocr.sh": "
|
| 78 |
"scripts/setup_paddleocr_vl.ps1": "0840fc9f181b246bff754bff6ad6c28a2aaf80fc0771a745f0577737dea3a806",
|
| 79 |
"scripts/setup_paddleocr_vl.sh": "38d048a154d8c55e9ab4c068a7177e33618d921951c753745ab54312ec24e1ea",
|
| 80 |
"scripts/setup_qari_ocr.ps1": "d8dc9e30df59476dfc737d2538d84523a694c97122196690d175f13e5b5e9e6b",
|
|
@@ -91,13 +92,13 @@
|
|
| 91 |
"scripts/supertonic_synthesize.py": "8223e3982de99e06091cff419d9b4584a56823b67c94b1493ce7143dd8c7f4f4",
|
| 92 |
"scripts/surya_extract.py": "7361a8a667779c46aed71fa67b7f869d16f8067b55591d202fa968b8fc7628d7",
|
| 93 |
"scripts/tawkeed_ocr_extract.py": "da554d5620237b70e234032b5525fcee8e9bebe9a924e5750746530c41972318",
|
| 94 |
-
"scripts/validate_deployment_env.py": "
|
| 95 |
"scripts/verify_pipeline.py": "57359e0e4399352976100f633ce780d9a022e96885e18d29d7f5bd4c4a43a857",
|
| 96 |
-
"scripts/verify_site.py": "
|
| 97 |
"scripts/verify_voice.py": "d8fb7e473e47060b2d2f957c5c230807a205e95b1469eef9c32b76d2bc8585b2",
|
| 98 |
-
"scripts/verify_worker.py": "
|
| 99 |
-
"static/app.js": "
|
| 100 |
-
"static/index.html": "
|
| 101 |
"static/styles.css": "a45485cf99eaae8a46e57437a736ce1ebad2528dbf219c5bc79f124ec3c47164"
|
| 102 |
},
|
| 103 |
"source": "ArabicTranslator",
|
|
|
|
| 1 |
{
|
| 2 |
"files": {
|
| 3 |
+
"Dockerfile": "eb8b1b840c8303bbcc2354a0e60896cf96593ac6828df6179877b19022e97c1c",
|
| 4 |
"api/index.py": "b0fd5c43eadd241aea79131d12ea40fe032a97f06410ce1b607e81c45f33d6f2",
|
| 5 |
"app/__init__.py": "7eb70257593da06f682a3ddda54a9d260d4fc514f645237f5ca74b08f8da61a6",
|
| 6 |
+
"app/main.py": "585db9d4acd34f7b69591ae0f6c0807154b9317bb2a6830e1a3642bf50414e47",
|
| 7 |
+
"docs/best-free-arabic-pdf-audio-stack.md": "08234106caacc0207f404b11023656cdc39525b28fedf526e97369edf926c48f",
|
| 8 |
+
"docs/father-user-guide.md": "a05534fa8ecc4bee94704b6691947ac189f6767a95fd12eb65ae27c4ede1182f",
|
| 9 |
"docs/huggingface-model-metadata.md": "4590229078c2048b184787e85e5a00dd687ef5fc90e8d8d0af32538b38363cc2",
|
| 10 |
+
"docs/live-deployment-checklist.md": "7fd21a9316c1d018e2bec0620defcaaca2a690f109e51b5902c7d157244834ac",
|
| 11 |
+
"docs/ocr-readability-benchmark.md": "f93f09729f5e8bd5f938afad9490b471452ca549d081ff7700161cc1dc961453",
|
| 12 |
+
"docs/production-worker-architecture.md": "1264c16b83948385026aca0fab18e7963fa5056a178fa381380659352274b4ff",
|
| 13 |
+
"docs/recommended-decision-card.json": "97e4607db20ac19cadc9b894d6406517bcb37f8ccc6ecbe6c0c41f5f2463398e",
|
| 14 |
+
"docs/recommended-decision-card.md": "f69bbe66d7977a4877f934212862159495ea5a4547997e059f5c4e1b8d6d6cb9",
|
| 15 |
+
"docs/recommended-free-stack.md": "6156deac80f5656ff4cd33d726061965b6e2a6fbc8db4ee4123b2b43e42aa40d",
|
| 16 |
"docs/research-watchlist.md": "9ea43e6f3f1d434e514e451ccc8974faa469f4533d0ffe924c8db3d80755e592",
|
| 17 |
"docs/source-evidence.md": "f308cbd0dc83a5cf34ceb5a010c354cda9acbc690e9b2c93a80cdb519ec07976",
|
| 18 |
"requirements-arabic-glm-ocr.txt": "b4c950c1ef221bfe6e2deda1a93605377619059eb02019931dbfe1fe7bd49e10",
|
|
|
|
| 28 |
"requirements.txt": "59d736ba33b31a828a5987f3477bec3f7ed6f60ceafaf730ef027a0dbbcd0def",
|
| 29 |
"scripts/arabic_glm_ocr_extract.py": "f56578018b81ac8cd7928baa9576878791214e5659f972520e8817370a9d39ad",
|
| 30 |
"scripts/arabic_qwen_ocr_extract.py": "485e9f3cdf2ced92c666b2f483d4aa37a65cb34052a4967beac7183d02c9ddcc",
|
| 31 |
+
"scripts/audit_goal_readiness.py": "4fe8f36c4ef9b8e3c492dcef894cabc7afe98b5396e1c4bd15bdcfef3da733d7",
|
| 32 |
"scripts/baseer_ocr_extract.py": "056ca9cc33591db804639030a16d9635931b720d0d499b444ed6e7d0a653605a",
|
| 33 |
"scripts/benchmark_ocr.py": "b5ffb17845a7945b2a5c52e38bfabb6d82f3a8fbc8f2cdd5528843e09ad4deb4",
|
| 34 |
"scripts/benchmark_voices.py": "705bdfb6260fe90a4a68d9d2455953ea7221d282bbf0cc1cc4fa32cd5ed10205",
|
| 35 |
+
"scripts/check_deployment_readiness.py": "c371706cf94f807354a1a08f274dc17b1c02d68347b98f70b177b4c14f73bf17",
|
| 36 |
+
"scripts/check_research_sources.py": "49bc5a15cddf040f134d21e042d064d64fce2235f2ff1dd01f6b9c69cdf0c3e0",
|
| 37 |
"scripts/check_test_environment.py": "7150b13aabad03a9b7ae2527f7cdd942511658eeafb76b41128eab7e0a6dc1ea",
|
| 38 |
"scripts/cleanup_outputs.py": "de8beacd9b8511dc3775d9c232a2c86dc5cbe91c532cb3c130c304117f0d6bfd",
|
| 39 |
"scripts/configure_vercel_worker.py": "76051d6853a60df2ff614b5aa629bac241a32c85baeaf6234f734bac1f49a61f",
|
| 40 |
"scripts/deploy_hf_space.py": "173be92b31c6bcb854eaf23004b0cf4029c79498bd1eff27e6b54c32370e2e22",
|
| 41 |
+
"scripts/deployment_handoff.py": "f11b974c9bd9661f6f2fb1f893385515676c4248fce11b1177ca2bac87ce9f71",
|
| 42 |
"scripts/deployment_status.py": "0437afcb47147b3825978d63ff36d38157135b2faf1ee658c203ae77735a3418",
|
| 43 |
"scripts/dry_run_pdf.py": "f162b566fb51d824d484a479a1337d4ee7e9a6762c0f2ccb5acd3173c1cf1bf8",
|
| 44 |
"scripts/easyocr_extract.py": "5a728a80bac7d49281113410316b5004cb6538ad50d1bc7c431eaea9c006ada9",
|
| 45 |
+
"scripts/export_hf_space.py": "5d6cd097cd7e251f6ced6c2198a9bb0de64000004e60d16b820a87824fe7c223",
|
| 46 |
"scripts/export_ocr_sample_images.py": "eaf2ed2dca63f649317d283a1339ddae64d79b8d79eb42fe601d3b4a92ce8f45",
|
| 47 |
"scripts/export_tts_sample.py": "477ae98e81c60bc3336012167355b0b8724cd1988047e2af41f443b23de7e9f3",
|
| 48 |
"scripts/finish_live_deployment.py": "e3e7e2600071be49b747cf61f9788427339da2d2017825bda12048285e5232f8",
|
|
|
|
| 59 |
"scripts/prove_local_readiness.py": "7ccb00fc2d4aa086f8cec5ac9886f87fe044eb46c6f9a7fa7c5eb095d44095ad",
|
| 60 |
"scripts/qari_ocr_extract.py": "82ac22dae63e415c9795f4f597c000beba32af028b0a5afc749ba11bfebe2b22",
|
| 61 |
"scripts/refresh_research_evidence.py": "52209edf8485ff459bda6ada6cb1b978f12b22cdacb03413d293dab9245303fc",
|
| 62 |
+
"scripts/research_watchlist.py": "9d50b16d7aeb7838e983e441032ea45e7ecb56015556c88a0fdf9ef1aa273649",
|
| 63 |
"scripts/score_external_ocr.py": "e4b36187949dd38eaa9395979a97ccef88f7aff24a1404e3bf0793776eea136f",
|
| 64 |
"scripts/score_tts_preprocessor.py": "7b9afce94bbc914b884a09bb83bd4d267770f6b712ffb5721cfef7c34e2718a2",
|
| 65 |
"scripts/score_voice_listening.py": "22287145ab5677c4e4383a01dc9cb2090f3f22e20a37e0902bcfed8df7c7e5f6",
|
|
|
|
| 74 |
"scripts/setup_habibi.sh": "a737e7a8266fa47eb1eba3deeed52ff2bc91042646fff3b721afe7edefaf41d8",
|
| 75 |
"scripts/setup_katib_ocr.ps1": "10b3e2a0781bcccec8a344f79b2639d13abcf26904025bf91ed1646fec34115e",
|
| 76 |
"scripts/setup_katib_ocr.sh": "ee59fccd22a76c1d773c40d1e734b6f33e12d04a3e65294c635892d019c9f673",
|
| 77 |
+
"scripts/setup_paddleocr.ps1": "1bc345d3d0f6bc0614a1b2d50fd6c3325b725a807d98c25d6b9b57c0f363ac49",
|
| 78 |
+
"scripts/setup_paddleocr.sh": "9ee6d8aa3107bd040a16e84d2b5a62e2084546d847e5b827dcf0483fd464476a",
|
| 79 |
"scripts/setup_paddleocr_vl.ps1": "0840fc9f181b246bff754bff6ad6c28a2aaf80fc0771a745f0577737dea3a806",
|
| 80 |
"scripts/setup_paddleocr_vl.sh": "38d048a154d8c55e9ab4c068a7177e33618d921951c753745ab54312ec24e1ea",
|
| 81 |
"scripts/setup_qari_ocr.ps1": "d8dc9e30df59476dfc737d2538d84523a694c97122196690d175f13e5b5e9e6b",
|
|
|
|
| 92 |
"scripts/supertonic_synthesize.py": "8223e3982de99e06091cff419d9b4584a56823b67c94b1493ce7143dd8c7f4f4",
|
| 93 |
"scripts/surya_extract.py": "7361a8a667779c46aed71fa67b7f869d16f8067b55591d202fa968b8fc7628d7",
|
| 94 |
"scripts/tawkeed_ocr_extract.py": "da554d5620237b70e234032b5525fcee8e9bebe9a924e5750746530c41972318",
|
| 95 |
+
"scripts/validate_deployment_env.py": "d42531933369e541cc451dabdb2542b9a8cc9b8739a1be292252a8b054613f37",
|
| 96 |
"scripts/verify_pipeline.py": "57359e0e4399352976100f633ce780d9a022e96885e18d29d7f5bd4c4a43a857",
|
| 97 |
+
"scripts/verify_site.py": "7a09c02f0063f913ac76f0793dcf359684cb6d210c3c851e86934527b277295d",
|
| 98 |
"scripts/verify_voice.py": "d8fb7e473e47060b2d2f957c5c230807a205e95b1469eef9c32b76d2bc8585b2",
|
| 99 |
+
"scripts/verify_worker.py": "73329f87852ce805ab7144df6faaab4e081099f7ebc9a2e66e93735ee7fa82cc",
|
| 100 |
+
"static/app.js": "735d2ba288d8f96b7e99d4009d0ad5ef2db845562ea5defb5a6725b3c4dc6993",
|
| 101 |
+
"static/index.html": "0877f04c78afa4078c92fea23a93ff2f97851a8c3d17dd005e3c5a56b8508288",
|
| 102 |
"static/styles.css": "a45485cf99eaae8a46e57437a736ce1ebad2528dbf219c5bc79f124ec3c47164"
|
| 103 |
},
|
| 104 |
"source": "ArabicTranslator",
|
Dockerfile
CHANGED
|
@@ -6,7 +6,9 @@ ENV PYTHONUNBUFFERED=1 \
|
|
| 6 |
DATABASE_PATH=/data/arabic-translator/data/arabic_reader.sqlite3 \
|
| 7 |
TESSDATA_DIR=/usr/share/tesseract-ocr/5/tessdata \
|
| 8 |
ESPEAK_NG_EXE=/usr/bin/espeak-ng \
|
| 9 |
-
OCR_ENGINE=
|
|
|
|
|
|
|
| 10 |
DEFAULT_VOICE_ID=silma-local \
|
| 11 |
MAX_UPLOAD_MB=512 \
|
| 12 |
OUTPUT_RETENTION_DAYS=7 \
|
|
|
|
| 6 |
DATABASE_PATH=/data/arabic-translator/data/arabic_reader.sqlite3 \
|
| 7 |
TESSDATA_DIR=/usr/share/tesseract-ocr/5/tessdata \
|
| 8 |
ESPEAK_NG_EXE=/usr/bin/espeak-ng \
|
| 9 |
+
OCR_ENGINE=tesseract \
|
| 10 |
+
OCR_RENDER_ZOOM=2 \
|
| 11 |
+
TESSERACT_PSM=4 \
|
| 12 |
DEFAULT_VOICE_ID=silma-local \
|
| 13 |
MAX_UPLOAD_MB=512 \
|
| 14 |
OUTPUT_RETENTION_DAYS=7 \
|
README.md
CHANGED
|
@@ -28,7 +28,9 @@ SECRET_KEY=<generated by outputs\deployment-handoff.md>
|
|
| 28 |
CORS_ORIGINS=https://your-vercel-app.vercel.app
|
| 29 |
COOKIE_SAMESITE=none
|
| 30 |
COOKIE_SECURE=1
|
| 31 |
-
OCR_ENGINE=
|
|
|
|
|
|
|
| 32 |
DEFAULT_VOICE_ID=silma-local
|
| 33 |
OUTPUT_RETENTION_DAYS=7
|
| 34 |
OUTPUT_MAX_FILES=25
|
|
@@ -44,7 +46,7 @@ python scripts\deployment_handoff.py https://your-space.hf.space --origin https:
|
|
| 44 |
|
| 45 |
Keep `outputs\deployment-handoff.md` private because it contains deployment secrets.
|
| 46 |
|
| 47 |
-
The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=
|
| 48 |
|
| 49 |
Optional stronger-worker build args:
|
| 50 |
|
|
|
|
| 28 |
CORS_ORIGINS=https://your-vercel-app.vercel.app
|
| 29 |
COOKIE_SAMESITE=none
|
| 30 |
COOKIE_SECURE=1
|
| 31 |
+
OCR_ENGINE=tesseract
|
| 32 |
+
OCR_RENDER_ZOOM=2
|
| 33 |
+
TESSERACT_PSM=4
|
| 34 |
DEFAULT_VOICE_ID=silma-local
|
| 35 |
OUTPUT_RETENTION_DAYS=7
|
| 36 |
OUTPUT_MAX_FILES=25
|
|
|
|
| 46 |
|
| 47 |
Keep `outputs\deployment-handoff.md` private because it contains deployment secrets.
|
| 48 |
|
| 49 |
+
The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for scanned pages (the most readable OCR setting in the 5-page Arabic benchmark), SILMA TTS for the first clean voice, and downloadable worker audio.
|
| 50 |
|
| 51 |
Optional stronger-worker build args:
|
| 52 |
|
app/main.py
CHANGED
|
@@ -104,7 +104,7 @@ PIPER_MODEL = os.getenv("PIPER_MODEL")
|
|
| 104 |
ESPEAK_NG_EXE = os.getenv("ESPEAK_NG_EXE")
|
| 105 |
TESSERACT_EXE = os.getenv("TESSERACT_EXE")
|
| 106 |
TESSDATA_DIR = Path(os.getenv("TESSDATA_DIR", str(DATA_DIR / "tessdata")))
|
| 107 |
-
OCR_ENGINE = os.getenv("OCR_ENGINE", "
|
| 108 |
OCR_ENGINE_CHOICES = {
|
| 109 |
"arabic",
|
| 110 |
"arabic-max",
|
|
@@ -1203,7 +1203,7 @@ def get_engine_status() -> dict[str, object]:
|
|
| 1203 |
or easyocr_ready
|
| 1204 |
or tesseract_path
|
| 1205 |
),
|
| 1206 |
-
"label": "Arabic OCR -
|
| 1207 |
"trainedFor": "Arabic printed text",
|
| 1208 |
"models": [
|
| 1209 |
"QARI-OCR Arabic book VLM",
|
|
@@ -1251,10 +1251,10 @@ def get_engine_status() -> dict[str, object]:
|
|
| 1251 |
"easyocr": {"available": easyocr_ready, "label": "General Arabic OCR"},
|
| 1252 |
"paddleocr": {
|
| 1253 |
"available": paddleocr_ready,
|
| 1254 |
-
"label": "PaddleOCR Arabic -
|
| 1255 |
"trainedFor": "Arabic printed text",
|
| 1256 |
"model": "arabic_PP-OCRv5_mobile_rec",
|
| 1257 |
-
"recommendedFor": "
|
| 1258 |
},
|
| 1259 |
"paddleocrVl": {
|
| 1260 |
"available": paddleocr_vl_ready,
|
|
@@ -1311,7 +1311,12 @@ def get_engine_status() -> dict[str, object]:
|
|
| 1311 |
"model": "Surya OCR 2",
|
| 1312 |
"recommendedFor": "Hard scans on a real worker, not Vercel serverless",
|
| 1313 |
},
|
| 1314 |
-
"tesseract": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1315 |
"language": os.getenv("OCR_LANGUAGE", "ara"),
|
| 1316 |
},
|
| 1317 |
"readyForArabic": bool(
|
|
@@ -1327,7 +1332,8 @@ def get_engine_status() -> dict[str, object]:
|
|
| 1327 |
},
|
| 1328 |
"recommendedStack": {
|
| 1329 |
"pdf": "PyMuPDF embedded text first",
|
| 1330 |
-
"ocrEngine": "
|
|
|
|
| 1331 |
"voiceId": "silma-local",
|
| 1332 |
"audioStorage": "worker-local retained downloads",
|
| 1333 |
"benchmarkRule": "Run a representative 5-page Arabic sample before full-book audio.",
|
|
@@ -2627,7 +2633,7 @@ def ocr_pdf_text_with_tesseract(pdf_path: Path, job: Job, render_zoom: float | N
|
|
| 2627 |
)
|
| 2628 |
variant = render_zoom is not None or psm is not None
|
| 2629 |
render_zoom = render_zoom or float(os.getenv("OCR_RENDER_ZOOM", "2.0"))
|
| 2630 |
-
psm = psm or int(os.getenv("TESSERACT_PSM", "
|
| 2631 |
temp_dir = UPLOAD_DIR / f"ocr_{uuid.uuid4().hex}"
|
| 2632 |
temp_dir.mkdir(parents=True, exist_ok=True)
|
| 2633 |
pieces: list[str] = []
|
|
|
|
| 104 |
ESPEAK_NG_EXE = os.getenv("ESPEAK_NG_EXE")
|
| 105 |
TESSERACT_EXE = os.getenv("TESSERACT_EXE")
|
| 106 |
TESSDATA_DIR = Path(os.getenv("TESSDATA_DIR", str(DATA_DIR / "tessdata")))
|
| 107 |
+
OCR_ENGINE = os.getenv("OCR_ENGINE", "tesseract").lower()
|
| 108 |
OCR_ENGINE_CHOICES = {
|
| 109 |
"arabic",
|
| 110 |
"arabic-max",
|
|
|
|
| 1203 |
or easyocr_ready
|
| 1204 |
or tesseract_path
|
| 1205 |
),
|
| 1206 |
+
"label": "Arabic OCR comparison - slower",
|
| 1207 |
"trainedFor": "Arabic printed text",
|
| 1208 |
"models": [
|
| 1209 |
"QARI-OCR Arabic book VLM",
|
|
|
|
| 1251 |
"easyocr": {"available": easyocr_ready, "label": "General Arabic OCR"},
|
| 1252 |
"paddleocr": {
|
| 1253 |
"available": paddleocr_ready,
|
| 1254 |
+
"label": "PaddleOCR Arabic - faster, less readable",
|
| 1255 |
"trainedFor": "Arabic printed text",
|
| 1256 |
"model": "arabic_PP-OCRv5_mobile_rec",
|
| 1257 |
+
"recommendedFor": "Usable fallback, but the 5-page benchmark produced more fragmented text than Tesseract",
|
| 1258 |
},
|
| 1259 |
"paddleocrVl": {
|
| 1260 |
"available": paddleocr_vl_ready,
|
|
|
|
| 1311 |
"model": "Surya OCR 2",
|
| 1312 |
"recommendedFor": "Hard scans on a real worker, not Vercel serverless",
|
| 1313 |
},
|
| 1314 |
+
"tesseract": {
|
| 1315 |
+
"available": bool(tesseract_path),
|
| 1316 |
+
"label": "Tesseract Arabic - Recommended readable",
|
| 1317 |
+
"trainedFor": "Arabic printed text",
|
| 1318 |
+
"recommendedFor": "Best readable output on the 5-page Arabic benchmark; uses OCR_RENDER_ZOOM=2 and TESSERACT_PSM=4 by default",
|
| 1319 |
+
},
|
| 1320 |
"language": os.getenv("OCR_LANGUAGE", "ara"),
|
| 1321 |
},
|
| 1322 |
"readyForArabic": bool(
|
|
|
|
| 1332 |
},
|
| 1333 |
"recommendedStack": {
|
| 1334 |
"pdf": "PyMuPDF embedded text first",
|
| 1335 |
+
"ocrEngine": "tesseract",
|
| 1336 |
+
"ocrSettings": "OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
|
| 1337 |
"voiceId": "silma-local",
|
| 1338 |
"audioStorage": "worker-local retained downloads",
|
| 1339 |
"benchmarkRule": "Run a representative 5-page Arabic sample before full-book audio.",
|
|
|
|
| 2633 |
)
|
| 2634 |
variant = render_zoom is not None or psm is not None
|
| 2635 |
render_zoom = render_zoom or float(os.getenv("OCR_RENDER_ZOOM", "2.0"))
|
| 2636 |
+
psm = psm or int(os.getenv("TESSERACT_PSM", "4"))
|
| 2637 |
temp_dir = UPLOAD_DIR / f"ocr_{uuid.uuid4().hex}"
|
| 2638 |
temp_dir.mkdir(parents=True, exist_ok=True)
|
| 2639 |
pieces: list[str] = []
|
docs/best-free-arabic-pdf-audio-stack.md
CHANGED
|
@@ -9,7 +9,7 @@ The source evidence is summarized in `docs/source-evidence.md`; verify the resea
|
|
| 9 |
For this project, the best practical free local stack is:
|
| 10 |
|
| 11 |
1. PyMuPDF for embedded PDF text.
|
| 12 |
-
2. `OCR_ENGINE=
|
| 13 |
3. EasyOCR Arabic as a strong alternate for older scans and difficult layouts.
|
| 14 |
4. `OCR_ENGINE=best` for short quality tests, which compares the free local OCR engines and picks the best-looking Arabic text.
|
| 15 |
5. QARI-OCR as the optional Arabic-native heavy OCR path for strong workers.
|
|
|
|
| 9 |
For this project, the best practical free local stack is:
|
| 10 |
|
| 11 |
1. PyMuPDF for embedded PDF text.
|
| 12 |
+
2. `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for scanned pages by default; it produced the most readable text on the 5-page Arabic benchmark. Use `OCR_ENGINE=arabic-max` only on short samples when the default reads a specific book badly.
|
| 13 |
3. EasyOCR Arabic as a strong alternate for older scans and difficult layouts.
|
| 14 |
4. `OCR_ENGINE=best` for short quality tests, which compares the free local OCR engines and picks the best-looking Arabic text.
|
| 15 |
5. QARI-OCR as the optional Arabic-native heavy OCR path for strong workers.
|
docs/father-user-guide.md
CHANGED
|
@@ -8,7 +8,7 @@ This guide is for the person using the website, not for setup.
|
|
| 8 |
2. Enter the access code.
|
| 9 |
3. Choose the Arabic PDF.
|
| 10 |
4. Leave **Voice** on the best Arabic voice unless someone tells you to change it.
|
| 11 |
-
5. Leave **Text quality** on **Arabic
|
| 12 |
6. Leave **Pages** on **Quick test** first.
|
| 13 |
7. Press **Create Audio**.
|
| 14 |
8. Wait until the status says the audio is ready.
|
|
@@ -18,9 +18,9 @@ This guide is for the person using the website, not for setup.
|
|
| 18 |
|
| 19 |
## Which Text Quality To Choose
|
| 20 |
|
| 21 |
-
Use **Arabic
|
| 22 |
|
| 23 |
-
Use **Maximum Arabic OCR - slower** on a short sample when the recommended option reads badly.
|
| 24 |
|
| 25 |
Use **QARI Arabic books** for a difficult scanned book when the normal option reads badly and the worker is strong enough. It is trained for Arabic books and manuscripts, but it can be much slower.
|
| 26 |
|
|
@@ -28,13 +28,13 @@ Use **KATIB Arabic OCR** when QARI is too slow or too heavy. It is also trained
|
|
| 28 |
|
| 29 |
Use **Best scan test** only on a short sample. It is useful for deciding which OCR engine works best for one book, but it is too slow for most full books.
|
| 30 |
|
| 31 |
-
Use **PaddleOCR Arabic -
|
| 32 |
|
| 33 |
Use **Tesseract Arabic fallback** when the other options are broken or when a benchmark says Tesseract worked best for that book.
|
| 34 |
|
| 35 |
## If Something Fails
|
| 36 |
|
| 37 |
-
If the app says the text quality is poor, do not make full-book audio yet. Try **Arabic
|
| 38 |
|
| 39 |
If the first pages are title pages or blank pages, make a 5-page test PDF from better pages and test that before the full book.
|
| 40 |
|
|
|
|
| 8 |
2. Enter the access code.
|
| 9 |
3. Choose the Arabic PDF.
|
| 10 |
4. Leave **Voice** on the best Arabic voice unless someone tells you to change it.
|
| 11 |
+
5. Leave **Text quality** on **Tesseract Arabic - Recommended readable** for a new scanned book.
|
| 12 |
6. Leave **Pages** on **Quick test** first.
|
| 13 |
7. Press **Create Audio**.
|
| 14 |
8. Wait until the status says the audio is ready.
|
|
|
|
| 18 |
|
| 19 |
## Which Text Quality To Choose
|
| 20 |
|
| 21 |
+
Use **Tesseract Arabic - Recommended readable** first. It produced the most readable text in the 5-page Arabic OCR benchmark and is much faster than the comparison modes.
|
| 22 |
|
| 23 |
+
Use **Arabic OCR comparison - slower** or **Maximum Arabic OCR - slower** on a short sample when the recommended option reads badly. They compare more OCR results and keep the cleanest text, but they can take much longer.
|
| 24 |
|
| 25 |
Use **QARI Arabic books** for a difficult scanned book when the normal option reads badly and the worker is strong enough. It is trained for Arabic books and manuscripts, but it can be much slower.
|
| 26 |
|
|
|
|
| 28 |
|
| 29 |
Use **Best scan test** only on a short sample. It is useful for deciding which OCR engine works best for one book, but it is too slow for most full books.
|
| 30 |
|
| 31 |
+
Use **PaddleOCR Arabic - faster, less readable** only when Tesseract is unavailable or a short test sounds better for that book.
|
| 32 |
|
| 33 |
Go back to **Tesseract Arabic - Recommended readable** when the other options are broken or when a benchmark says Tesseract worked best for that book.
|
| 34 |
|
| 35 |
## If Something Fails
|
| 36 |
|
| 37 |
+
If the app says the text quality is poor, do not make full-book audio yet. Try **Tesseract Arabic - Recommended readable**, then **Arabic OCR comparison - slower**, then **Best scan test** on a short sample.
|
| 38 |
|
| 39 |
If the first pages are title pages or blank pages, make a 5-page test PDF from better pages and test that before the full book.
|
| 40 |
|
docs/live-deployment-checklist.md
CHANGED
|
@@ -157,7 +157,9 @@ SECRET_KEY=<generated by outputs\deployment-handoff.md>
|
|
| 157 |
CORS_ORIGINS=https://your-vercel-app.vercel.app
|
| 158 |
COOKIE_SAMESITE=none
|
| 159 |
COOKIE_SECURE=1
|
| 160 |
-
OCR_ENGINE=
|
|
|
|
|
|
|
| 161 |
DEFAULT_VOICE_ID=silma-local
|
| 162 |
OUTPUT_RETENTION_DAYS=7
|
| 163 |
OUTPUT_MAX_FILES=25
|
|
@@ -168,7 +170,7 @@ SILMA_FORCE_TASHKEEL=0
|
|
| 168 |
SILMA_NORMALIZE_NUMBERS=0
|
| 169 |
```
|
| 170 |
|
| 171 |
-
Keep `OCR_ENGINE=
|
| 172 |
|
| 173 |
## 3. Vercel Website
|
| 174 |
|
|
|
|
| 157 |
CORS_ORIGINS=https://your-vercel-app.vercel.app
|
| 158 |
COOKIE_SAMESITE=none
|
| 159 |
COOKIE_SECURE=1
|
| 160 |
+
OCR_ENGINE=tesseract
|
| 161 |
+
OCR_RENDER_ZOOM=2
|
| 162 |
+
TESSERACT_PSM=4
|
| 163 |
DEFAULT_VOICE_ID=silma-local
|
| 164 |
OUTPUT_RETENTION_DAYS=7
|
| 165 |
OUTPUT_MAX_FILES=25
|
|
|
|
| 170 |
SILMA_NORMALIZE_NUMBERS=0
|
| 171 |
```
|
| 172 |
|
| 173 |
+
Keep `OCR_ENGINE=tesseract`, `OCR_RENDER_ZOOM=2`, and `TESSERACT_PSM=4` for the first real deployment. That setting produced the most readable text on the 5-page Arabic benchmark. Use `arabic-max` only on short samples when this setting reads a specific book badly.
|
| 174 |
|
| 175 |
## 3. Vercel Website
|
| 176 |
|
docs/ocr-readability-benchmark.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Arabic OCR Readability Benchmark
|
| 2 |
+
|
| 3 |
+
Last run: June 8, 2026.
|
| 4 |
+
|
| 5 |
+
Benchmark file: `test_pdfs/arabic-reader-5-page-test.pdf`
|
| 6 |
+
|
| 7 |
+
Scoring uses the app's `assess_text_quality` and speech-readiness metrics: Arabic word count, common Arabic word hits, one-letter fragment ratio, low-information line ratio, placeholder ratio, and total quality score. Higher score is better; `good` is preferred over `warning`.
|
| 8 |
+
|
| 9 |
+
## Result
|
| 10 |
+
|
| 11 |
+
Recommended OCR:
|
| 12 |
+
|
| 13 |
+
```text
|
| 14 |
+
OCR_ENGINE=tesseract
|
| 15 |
+
OCR_RENDER_ZOOM=2
|
| 16 |
+
TESSERACT_PSM=4
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
This setting produced the most readable 5-page output while staying practical for full-book jobs.
|
| 20 |
+
|
| 21 |
+
| OCR setting | Pages | Seconds | Quality | Score | Arabic words | Fragment line ratio | Extraction |
|
| 22 |
+
| --- | ---: | ---: | --- | ---: | ---: | ---: | --- |
|
| 23 |
+
| Tesseract 2x PSM 4 | 5 | 37.30 | good | 11919.05 | 3120 | 0.0433 | `tesseract@2x-psm4` |
|
| 24 |
+
| Tesseract default PSM 6 | 5 | 28.88 | good | 11510.50 | 3284 | 0.0166 | `tesseract@1.5x-psm6` |
|
| 25 |
+
| PaddleOCR Arabic | 5 | 106.91 | warning | 8105.80 | 2251 | 0.3133 | `paddleocr` |
|
| 26 |
+
| Auto fallback | 5 | 104.47 | warning | 8105.80 | 2251 | 0.3133 | `paddleocr` |
|
| 27 |
+
| EasyOCR mode | 5 | 102.39 | warning | 8105.80 | 2251 | 0.3133 | `paddleocr` |
|
| 28 |
+
|
| 29 |
+
The slower comparison modes were tested on the 1-page sample because the full 5-page comparison exceeded the 10-minute run window. Both selected the same underlying winner, `tesseract@2x-psm4`, but took about 4.5 minutes for one page:
|
| 30 |
+
|
| 31 |
+
| OCR setting | Pages | Seconds | Quality | Score | Arabic words | Extraction |
|
| 32 |
+
| --- | ---: | ---: | --- | ---: | ---: | --- |
|
| 33 |
+
| Arabic OCR comparison | 1 | 280.76 | good | 3565.85 | 719 | `arabic:tesseract@2x-psm4` |
|
| 34 |
+
| Maximum Arabic OCR | 1 | 268.47 | good | 3565.85 | 719 | `arabic-max:tesseract@2x-psm4` |
|
| 35 |
+
|
| 36 |
+
## Interpretation
|
| 37 |
+
|
| 38 |
+
`arabic` and `arabic-max` are useful short-sample diagnostics because they can compare installed OCR engines and pick the cleanest text. They are not the right default for long PDFs on the current free worker because they spend minutes per page and selected Tesseract anyway.
|
| 39 |
+
|
| 40 |
+
PaddleOCR is available and works, but on this book sample it returned many low-information lines and more fragmented Arabic text. It remains a fallback, not the recommendation.
|
| 41 |
+
|
| 42 |
+
The live/default website setting should therefore be `Tesseract Arabic - Recommended readable`.
|
docs/production-worker-architecture.md
CHANGED
|
@@ -93,7 +93,7 @@ The worker bundle also includes setup scripts for optional heavy paths, but they
|
|
| 93 |
|
| 94 |
`Dockerfile.worker` exposes `INSTALL_QARI_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_BASEER_OCR=1`, `INSTALL_PADDLEOCR_VL=1`, and `INSTALL_SUPERTONIC=1` build args so stronger workers can install QARI-OCR, KATIB, Arabic-Qwen, Baseer, PaddleOCR-VL, and the optional Supertonic CPU voice without editing the Dockerfile. This keeps the free CPU image practical while making the higher-quality free OCR paths and fast voice comparison path deployable.
|
| 95 |
|
| 96 |
-
QARI-OCR 0.4 is the strongest Arabic-native OCR upgrade to test for a stronger worker. It is a 4B VLM fine-tuned for Islamic books and Arabic manuscripts, so keep it out of the default free CPU family-site worker unless a short Arabic-book benchmark proves it improves the actual pages and the worker has enough RAM/GPU. KATIB 0.8B and Arabic-Qwen3.5-OCR-v4 are the smaller Arabic-trained OCR upgrades to try when QARI is too heavy. If the worker is too small for QARI, set `QARI_OCR_MODEL=NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct` to test the lighter older QARI path. PaddleOCR-VL-1.6 remains the main general document-parser upgrade to watch. In the website, start with `Arabic
|
| 97 |
|
| 98 |
The repo includes optional KATIB, Arabic-Qwen, QARI-OCR, and PaddleOCR-VL sidecar scripts for this evaluation path:
|
| 99 |
|
|
@@ -131,7 +131,7 @@ Create a new Space:
|
|
| 131 |
- `CORS_ORIGINS=https://your-vercel-app.vercel.app`
|
| 132 |
- `COOKIE_SAMESITE=none`
|
| 133 |
- `COOKIE_SECURE=1`
|
| 134 |
-
5. Keep `OCR_ENGINE=
|
| 135 |
|
| 136 |
Then set this on Vercel:
|
| 137 |
|
|
|
|
| 93 |
|
| 94 |
`Dockerfile.worker` exposes `INSTALL_QARI_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_BASEER_OCR=1`, `INSTALL_PADDLEOCR_VL=1`, and `INSTALL_SUPERTONIC=1` build args so stronger workers can install QARI-OCR, KATIB, Arabic-Qwen, Baseer, PaddleOCR-VL, and the optional Supertonic CPU voice without editing the Dockerfile. This keeps the free CPU image practical while making the higher-quality free OCR paths and fast voice comparison path deployable.
|
| 95 |
|
| 96 |
+
QARI-OCR 0.4 is the strongest Arabic-native OCR upgrade to test for a stronger worker. It is a 4B VLM fine-tuned for Islamic books and Arabic manuscripts, so keep it out of the default free CPU family-site worker unless a short Arabic-book benchmark proves it improves the actual pages and the worker has enough RAM/GPU. KATIB 0.8B and Arabic-Qwen3.5-OCR-v4 are the smaller Arabic-trained OCR upgrades to try when QARI is too heavy. If the worker is too small for QARI, set `QARI_OCR_MODEL=NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct` to test the lighter, older QARI path. PaddleOCR-VL-1.6 remains the main general document-parser upgrade to watch. On the website, start with `Tesseract Arabic - Recommended readable`; use `Arabic OCR comparison - slower` or `Maximum Arabic OCR - slower` only for short tests or difficult pages.
|
| 97 |
|
| 98 |
The repo includes optional KATIB, Arabic-Qwen, QARI-OCR, and PaddleOCR-VL sidecar scripts for this evaluation path:
|
| 99 |
|
|
|
|
| 131 |
- `CORS_ORIGINS=https://your-vercel-app.vercel.app`
|
| 132 |
- `COOKIE_SAMESITE=none`
|
| 133 |
- `COOKIE_SECURE=1`
|
| 134 |
+
5. Keep `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for readable full-book runs, or switch to a slower comparison mode only when a short sample proves it is better for that book.
|
| 135 |
|
| 136 |
Then set this on Vercel:
|
| 137 |
|
docs/recommended-decision-card.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"title": "Recommended Free Arabic PDF To Audio Decision Card",
|
| 3 |
"currentDefault": {
|
| 4 |
"pdf": "PyMuPDF embedded text first",
|
| 5 |
-
"scannedPdfOcr": "OCR_ENGINE=
|
| 6 |
"voice": "SILMA TTS (silma-local)",
|
| 7 |
"audioStorage": "worker-local retained downloads",
|
| 8 |
"hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL"
|
|
|
|
| 2 |
"title": "Recommended Free Arabic PDF To Audio Decision Card",
|
| 3 |
"currentDefault": {
|
| 4 |
"pdf": "PyMuPDF embedded text first",
|
| 5 |
+
"scannedPdfOcr": "OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
|
| 6 |
"voice": "SILMA TTS (silma-local)",
|
| 7 |
"audioStorage": "worker-local retained downloads",
|
| 8 |
"hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL"
|
docs/recommended-decision-card.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
| Layer | Choice |
|
| 6 |
| --- | --- |
|
| 7 |
| pdf | PyMuPDF embedded text first |
|
| 8 |
-
| scannedPdfOcr | OCR_ENGINE=
|
| 9 |
| voice | SILMA TTS (silma-local) |
|
| 10 |
| audioStorage | worker-local retained downloads |
|
| 11 |
| hosting | Vercel shell plus Docker worker via WORKER_BASE_URL |
|
|
|
|
| 5 |
| Layer | Choice |
|
| 6 |
| --- | --- |
|
| 7 |
| pdf | PyMuPDF embedded text first |
|
| 8 |
+
| scannedPdfOcr | OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4 |
|
| 9 |
| voice | SILMA TTS (silma-local) |
|
| 10 |
| audioStorage | worker-local retained downloads |
|
| 11 |
| hosting | Vercel shell plus Docker worker via WORKER_BASE_URL |
|
docs/recommended-free-stack.md
CHANGED
|
@@ -7,7 +7,7 @@ This is the compact decision report generated from the current research watchlis
|
|
| 7 |
| Layer | Recommendation | Why |
|
| 8 |
| --- | --- | --- |
|
| 9 |
| Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |
|
| 10 |
-
| Scanned PDFs | `OCR_ENGINE=
|
| 11 |
| Default voice | SILMA TTS | Arabic-focused Fusha/MSA voice with normalization and tashkeel options. |
|
| 12 |
| Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |
|
| 13 |
| Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |
|
|
@@ -83,4 +83,4 @@ Promote a model only when all of these are true:
|
|
| 83 |
5. Its runtime is acceptable for the target worker.
|
| 84 |
6. The generated JSON score passes `scripts\model_promotion_gate.py` after human review.
|
| 85 |
|
| 86 |
-
Current practical default: PyMuPDF -> `
|
|
|
|
| 7 |
| Layer | Recommendation | Why |
|
| 8 |
| --- | --- | --- |
|
| 9 |
| Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |
|
| 10 |
+
| Scanned PDFs | `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` | It produced the most readable text on the 5-page Arabic OCR benchmark while staying much faster than the comparison modes. |
|
| 11 |
| Default voice | SILMA TTS | Arabic-focused Fusha/MSA voice with normalization and tashkeel options. |
|
| 12 |
| Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |
|
| 13 |
| Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |
|
|
|
|
| 83 |
5. Its runtime is acceptable for the target worker.
|
| 84 |
6. The generated JSON score passes `scripts\model_promotion_gate.py` after human review.
|
| 85 |
|
| 86 |
+
Current practical default: PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS -> downloadable worker audio.
|
scripts/audit_goal_readiness.py
CHANGED
|
@@ -259,7 +259,7 @@ def collect_checks(
|
|
| 259 |
"PASS"
|
| 260 |
if has_all(
|
| 261 |
readme + production + deployment_checklist + dockerfile,
|
| 262 |
-
["WORKER_BASE_URL", "Docker", "OCR_ENGINE=
|
| 263 |
)
|
| 264 |
and has_all(deployment_handoff, ["WORKER_BASE_URL", "prove_live_deployment.py", "worker-verification.json"])
|
| 265 |
and has_all(
|
|
@@ -395,12 +395,12 @@ def collect_checks(
|
|
| 395 |
[
|
| 396 |
"Recommended Free Arabic PDF To Audio Stack",
|
| 397 |
"PyMuPDF text extraction first",
|
| 398 |
-
"`OCR_ENGINE=
|
| 399 |
"SILMA TTS",
|
| 400 |
"4.5 MB function payload limit",
|
| 401 |
"50 GB but non-persistent",
|
| 402 |
"Benchmark Before Promoting",
|
| 403 |
-
"PyMuPDF -> `
|
| 404 |
],
|
| 405 |
)
|
| 406 |
and has_all(
|
|
|
|
| 259 |
"PASS"
|
| 260 |
if has_all(
|
| 261 |
readme + production + deployment_checklist + dockerfile,
|
| 262 |
+
["WORKER_BASE_URL", "Docker", "OCR_ENGINE=tesseract", "OCR_RENDER_ZOOM=2", "TESSERACT_PSM=4", "AUDIO_FORMAT=mp3", "worker-verification.json"],
|
| 263 |
)
|
| 264 |
and has_all(deployment_handoff, ["WORKER_BASE_URL", "prove_live_deployment.py", "worker-verification.json"])
|
| 265 |
and has_all(
|
|
|
|
| 395 |
[
|
| 396 |
"Recommended Free Arabic PDF To Audio Stack",
|
| 397 |
"PyMuPDF text extraction first",
|
| 398 |
+
"`OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4`",
|
| 399 |
"SILMA TTS",
|
| 400 |
"4.5 MB function payload limit",
|
| 401 |
"50 GB but non-persistent",
|
| 402 |
"Benchmark Before Promoting",
|
| 403 |
+
"PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS",
|
| 404 |
],
|
| 405 |
)
|
| 406 |
and has_all(
|
scripts/check_deployment_readiness.py
CHANGED
|
@@ -216,7 +216,7 @@ def check_worker(root: Path = ROOT_DIR) -> list[Check]:
|
|
| 216 |
add(checks, "Worker", "base image", "PASS" if "python:3.10" in dockerfile else "WARN", "Python 3.10 is expected")
|
| 217 |
for package in ["tesseract-ocr-ara", "espeak-ng", "ffmpeg"]:
|
| 218 |
add(checks, "Worker", f"apt package {package}", "PASS" if package in dockerfile else "FAIL", package)
|
| 219 |
-
for env_key in ["WORK_DIR", "DATABASE_PATH", "OCR_ENGINE=
|
| 220 |
add(checks, "Worker", f"env {env_key}", "PASS" if env_key in dockerfile else "WARN", env_key)
|
| 221 |
for arg in [
|
| 222 |
"ARG INSTALL_QARI_OCR=0",
|
|
|
|
| 216 |
add(checks, "Worker", "base image", "PASS" if "python:3.10" in dockerfile else "WARN", "Python 3.10 is expected")
|
| 217 |
for package in ["tesseract-ocr-ara", "espeak-ng", "ffmpeg"]:
|
| 218 |
add(checks, "Worker", f"apt package {package}", "PASS" if package in dockerfile else "FAIL", package)
|
| 219 |
+
for env_key in ["WORK_DIR", "DATABASE_PATH", "OCR_ENGINE=tesseract", "OCR_RENDER_ZOOM=2", "TESSERACT_PSM=4", "AUDIO_FORMAT=mp3"]:
|
| 220 |
add(checks, "Worker", f"env {env_key}", "PASS" if env_key in dockerfile else "WARN", env_key)
|
| 221 |
for arg in [
|
| 222 |
"ARG INSTALL_QARI_OCR=0",
|
scripts/check_research_sources.py
CHANGED
|
@@ -265,17 +265,17 @@ REQUIRED_METADATA_MARKERS = [
|
|
| 265 |
REQUIRED_RECOMMENDATION_MARKERS = [
|
| 266 |
"Recommended Free Arabic PDF To Audio Stack",
|
| 267 |
"PyMuPDF text extraction first",
|
| 268 |
-
"`OCR_ENGINE=
|
| 269 |
"SILMA TTS",
|
| 270 |
"Vercel shell plus Docker worker",
|
| 271 |
"Benchmark Before Promoting",
|
| 272 |
"model_promotion_gate.py",
|
| 273 |
-
"PyMuPDF -> `
|
| 274 |
]
|
| 275 |
REQUIRED_DECISION_CARD_MARKERS = [
|
| 276 |
"Recommended Free Arabic PDF To Audio Decision Card",
|
| 277 |
"PyMuPDF embedded text first",
|
| 278 |
-
"OCR_ENGINE=
|
| 279 |
"SILMA TTS",
|
| 280 |
"worker-local retained downloads",
|
| 281 |
"Vercel shell plus Docker worker",
|
|
|
|
| 265 |
REQUIRED_RECOMMENDATION_MARKERS = [
|
| 266 |
"Recommended Free Arabic PDF To Audio Stack",
|
| 267 |
"PyMuPDF text extraction first",
|
| 268 |
+
"`OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4`",
|
| 269 |
"SILMA TTS",
|
| 270 |
"Vercel shell plus Docker worker",
|
| 271 |
"Benchmark Before Promoting",
|
| 272 |
"model_promotion_gate.py",
|
| 273 |
+
"PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS",
|
| 274 |
]
|
| 275 |
REQUIRED_DECISION_CARD_MARKERS = [
|
| 276 |
"Recommended Free Arabic PDF To Audio Decision Card",
|
| 277 |
"PyMuPDF embedded text first",
|
| 278 |
+
"OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
|
| 279 |
"SILMA TTS",
|
| 280 |
"worker-local retained downloads",
|
| 281 |
"Vercel shell plus Docker worker",
|
scripts/deployment_handoff.py
CHANGED
|
@@ -72,7 +72,9 @@ def build_handoff(
|
|
| 72 |
"CORS_ORIGINS": vercel_origin,
|
| 73 |
"COOKIE_SAMESITE": "none",
|
| 74 |
"COOKIE_SECURE": "1",
|
| 75 |
-
"OCR_ENGINE": "
|
|
|
|
|
|
|
| 76 |
"DEFAULT_VOICE_ID": "silma-local",
|
| 77 |
"OUTPUT_RETENTION_DAYS": "7",
|
| 78 |
"OUTPUT_MAX_FILES": "25",
|
|
@@ -223,7 +225,7 @@ def write_markdown(path: Path, handoff: DeploymentHandoff) -> None:
|
|
| 223 |
"- Set Vercel `WORKER_BASE_URL` to the exact Hugging Face worker URL shown above.",
|
| 224 |
"- After both deployments finish, run the Vercel worker diagnostic command below before uploading a large PDF. It must show `site worker reachable from vercel` and `site worker CORS ready`.",
|
| 225 |
"- Remove Vercel's temporary direct Hugging Face TTS fallback variables for production: `ENABLE_DIRECT_CLOUD_TTS`, `HF_API_TOKEN`, `HF_TTS_MODEL`, and `DEFAULT_VOICE_ID`.",
|
| 226 |
-
"- Keep `OCR_ENGINE=
|
| 227 |
"- Do not commit this handoff; it contains the deployment `SECRET_KEY`.",
|
| 228 |
"",
|
| 229 |
"## Hugging Face Docker Build Args",
|
|
|
|
| 72 |
"CORS_ORIGINS": vercel_origin,
|
| 73 |
"COOKIE_SAMESITE": "none",
|
| 74 |
"COOKIE_SECURE": "1",
|
| 75 |
+
"OCR_ENGINE": "tesseract",
|
| 76 |
+
"OCR_RENDER_ZOOM": "2",
|
| 77 |
+
"TESSERACT_PSM": "4",
|
| 78 |
"DEFAULT_VOICE_ID": "silma-local",
|
| 79 |
"OUTPUT_RETENTION_DAYS": "7",
|
| 80 |
"OUTPUT_MAX_FILES": "25",
|
|
|
|
| 225 |
"- Set Vercel `WORKER_BASE_URL` to the exact Hugging Face worker URL shown above.",
|
| 226 |
"- After both deployments finish, run the Vercel worker diagnostic command below before uploading a large PDF. It must show `site worker reachable from vercel` and `site worker CORS ready`.",
|
| 227 |
"- Remove Vercel's temporary direct Hugging Face TTS fallback variables for production: `ENABLE_DIRECT_CLOUD_TTS`, `HF_API_TOKEN`, `HF_TTS_MODEL`, and `DEFAULT_VOICE_ID`.",
|
| 228 |
+
"- Keep `OCR_ENGINE=tesseract`, `OCR_RENDER_ZOOM=2`, and `TESSERACT_PSM=4` for normal scanned Arabic books; use `arabic-max` only when a short sample needs the slower maximum comparison.",
|
| 229 |
"- Do not commit this handoff; it contains the deployment `SECRET_KEY`.",
|
| 230 |
"",
|
| 231 |
"## Hugging Face Docker Build Args",
|
scripts/export_hf_space.py
CHANGED
|
@@ -216,7 +216,9 @@ SECRET_KEY=<generated by outputs\\deployment-handoff.md>
|
|
| 216 |
CORS_ORIGINS=https://your-vercel-app.vercel.app
|
| 217 |
COOKIE_SAMESITE=none
|
| 218 |
COOKIE_SECURE=1
|
| 219 |
-
OCR_ENGINE=
|
|
|
|
|
|
|
| 220 |
DEFAULT_VOICE_ID=silma-local
|
| 221 |
OUTPUT_RETENTION_DAYS=7
|
| 222 |
OUTPUT_MAX_FILES=25
|
|
@@ -232,7 +234,7 @@ python scripts\\deployment_handoff.py https://your-space.hf.space --origin https
|
|
| 232 |
|
| 233 |
Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.
|
| 234 |
|
| 235 |
-
The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=
|
| 236 |
|
| 237 |
Optional stronger-worker build args:
|
| 238 |
|
|
|
|
| 216 |
CORS_ORIGINS=https://your-vercel-app.vercel.app
|
| 217 |
COOKIE_SAMESITE=none
|
| 218 |
COOKIE_SECURE=1
|
| 219 |
+
OCR_ENGINE=tesseract
|
| 220 |
+
OCR_RENDER_ZOOM=2
|
| 221 |
+
TESSERACT_PSM=4
|
| 222 |
DEFAULT_VOICE_ID=silma-local
|
| 223 |
OUTPUT_RETENTION_DAYS=7
|
| 224 |
OUTPUT_MAX_FILES=25
|
|
|
|
| 234 |
|
| 235 |
Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.
|
| 236 |
|
| 237 |
+
The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.
|
| 238 |
|
| 239 |
Optional stronger-worker build args:
|
| 240 |
|
scripts/research_watchlist.py
CHANGED
|
@@ -1488,7 +1488,7 @@ def build_recommendation_report(candidates: list[Candidate]) -> str:
|
|
| 1488 |
"| Layer | Recommendation | Why |",
|
| 1489 |
"| --- | --- | --- |",
|
| 1490 |
"| Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |",
|
| 1491 |
-
"| Scanned PDFs | `OCR_ENGINE=
|
| 1492 |
f"| Default voice | {default_voice.name} | {default_voice.why} |",
|
| 1493 |
"| Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |",
|
| 1494 |
"| Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |",
|
|
@@ -1544,7 +1544,7 @@ def build_recommendation_report(candidates: list[Candidate]) -> str:
|
|
| 1544 |
"5. Its runtime is acceptable for the target worker.",
|
| 1545 |
"6. The generated JSON score passes `scripts\\model_promotion_gate.py` after human review.",
|
| 1546 |
"",
|
| 1547 |
-
"Current practical default: PyMuPDF -> `
|
| 1548 |
"",
|
| 1549 |
]
|
| 1550 |
)
|
|
@@ -1563,7 +1563,7 @@ def build_decision_card(candidates: list[Candidate]) -> dict[str, object]:
|
|
| 1563 |
"title": "Recommended Free Arabic PDF To Audio Decision Card",
|
| 1564 |
"currentDefault": {
|
| 1565 |
"pdf": "PyMuPDF embedded text first",
|
| 1566 |
-
"scannedPdfOcr": "OCR_ENGINE=
|
| 1567 |
"voice": "SILMA TTS (silma-local)",
|
| 1568 |
"audioStorage": "worker-local retained downloads",
|
| 1569 |
"hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL",
|
|
|
|
| 1488 |
"| Layer | Recommendation | Why |",
|
| 1489 |
"| --- | --- | --- |",
|
| 1490 |
"| Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |",
|
| 1491 |
+
"| Scanned PDFs | `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` | It produced the most readable text on the 5-page Arabic OCR benchmark while staying much faster than the comparison modes. |",
|
| 1492 |
f"| Default voice | {default_voice.name} | {default_voice.why} |",
|
| 1493 |
"| Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |",
|
| 1494 |
"| Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |",
|
|
|
|
| 1544 |
"5. Its runtime is acceptable for the target worker.",
|
| 1545 |
"6. The generated JSON score passes `scripts\\model_promotion_gate.py` after human review.",
|
| 1546 |
"",
|
| 1547 |
+
"Current practical default: PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS -> downloadable worker audio.",
|
| 1548 |
"",
|
| 1549 |
]
|
| 1550 |
)
|
|
|
|
| 1563 |
"title": "Recommended Free Arabic PDF To Audio Decision Card",
|
| 1564 |
"currentDefault": {
|
| 1565 |
"pdf": "PyMuPDF embedded text first",
|
| 1566 |
+
"scannedPdfOcr": "OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
|
| 1567 |
"voice": "SILMA TTS (silma-local)",
|
| 1568 |
"audioStorage": "worker-local retained downloads",
|
| 1569 |
"hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL",
|
scripts/setup_paddleocr.ps1
CHANGED
|
@@ -15,4 +15,4 @@ $pythonExe = Join-Path $venv "Scripts\python.exe"
|
|
| 15 |
& $pythonExe -m pip install -r (Join-Path $root "requirements-paddleocr.txt")
|
| 16 |
|
| 17 |
Write-Host "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $venv"
|
| 18 |
-
Write-Host "Use OCR_ENGINE=
|
|
|
|
| 15 |
& $pythonExe -m pip install -r (Join-Path $root "requirements-paddleocr.txt")
|
| 16 |
|
| 17 |
Write-Host "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $venv"
|
| 18 |
+
Write-Host "Use OCR_ENGINE=paddleocr only as a fallback; the website's readable default is Tesseract Arabic."
|
scripts/setup_paddleocr.sh
CHANGED
|
@@ -13,4 +13,4 @@ fi
|
|
| 13 |
"$VENV/bin/python" -m pip install -r "$ROOT/requirements-paddleocr.txt"
|
| 14 |
|
| 15 |
echo "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $VENV"
|
| 16 |
-
echo "Use OCR_ENGINE=
|
|
|
|
| 13 |
"$VENV/bin/python" -m pip install -r "$ROOT/requirements-paddleocr.txt"
|
| 14 |
|
| 15 |
echo "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $VENV"
|
| 16 |
+
echo "Use OCR_ENGINE=paddleocr only as a fallback; the website's readable default is Tesseract Arabic."
|
scripts/validate_deployment_env.py
CHANGED
|
@@ -228,8 +228,22 @@ def validate_worker_env(
|
|
| 228 |
checks,
|
| 229 |
"Worker",
|
| 230 |
"OCR_ENGINE",
|
| 231 |
-
"PASS" if env.get("OCR_ENGINE", "
|
| 232 |
-
env.get("OCR_ENGINE", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
)
|
| 234 |
add(
|
| 235 |
checks,
|
|
|
|
| 228 |
checks,
|
| 229 |
"Worker",
|
| 230 |
"OCR_ENGINE",
|
| 231 |
+
"PASS" if env.get("OCR_ENGINE", "tesseract") == "tesseract" else "WARN",
|
| 232 |
+
env.get("OCR_ENGINE", "tesseract"),
|
| 233 |
+
)
|
| 234 |
+
add(
|
| 235 |
+
checks,
|
| 236 |
+
"Worker",
|
| 237 |
+
"OCR_RENDER_ZOOM",
|
| 238 |
+
"PASS" if env.get("OCR_RENDER_ZOOM", "2") == "2" else "WARN",
|
| 239 |
+
env.get("OCR_RENDER_ZOOM", "2"),
|
| 240 |
+
)
|
| 241 |
+
add(
|
| 242 |
+
checks,
|
| 243 |
+
"Worker",
|
| 244 |
+
"TESSERACT_PSM",
|
| 245 |
+
"PASS" if env.get("TESSERACT_PSM", "4") == "4" else "WARN",
|
| 246 |
+
env.get("TESSERACT_PSM", "4"),
|
| 247 |
)
|
| 248 |
add(
|
| 249 |
checks,
|
scripts/verify_site.py
CHANGED
|
@@ -75,7 +75,8 @@ def verify_site(
|
|
| 75 |
checks,
|
| 76 |
"site recommended stack documented",
|
| 77 |
recommended_stack.get("pdf") == "PyMuPDF embedded text first"
|
| 78 |
-
and recommended_stack.get("ocrEngine") == "
|
|
|
|
| 79 |
and recommended_stack.get("voiceId") == "silma-local"
|
| 80 |
and recommended_stack.get("audioStorage") == "worker-local retained downloads",
|
| 81 |
json.dumps(recommended_stack),
|
|
|
|
| 75 |
checks,
|
| 76 |
"site recommended stack documented",
|
| 77 |
recommended_stack.get("pdf") == "PyMuPDF embedded text first"
|
| 78 |
+
and recommended_stack.get("ocrEngine") == "tesseract"
|
| 79 |
+
and recommended_stack.get("ocrSettings") == "OCR_RENDER_ZOOM=2 TESSERACT_PSM=4"
|
| 80 |
and recommended_stack.get("voiceId") == "silma-local"
|
| 81 |
and recommended_stack.get("audioStorage") == "worker-local retained downloads",
|
| 82 |
json.dumps(recommended_stack),
|
scripts/verify_worker.py
CHANGED
|
@@ -113,7 +113,8 @@ def has_recommended_stack(engines: dict[str, Any]) -> bool:
|
|
| 113 |
stack = recommended_stack_summary(engines)
|
| 114 |
return bool(
|
| 115 |
stack.get("pdf") == "PyMuPDF embedded text first"
|
| 116 |
-
and stack.get("ocrEngine") == "
|
|
|
|
| 117 |
and stack.get("voiceId") == "silma-local"
|
| 118 |
and stack.get("audioStorage") == "worker-local retained downloads"
|
| 119 |
)
|
|
|
|
| 113 |
stack = recommended_stack_summary(engines)
|
| 114 |
return bool(
|
| 115 |
stack.get("pdf") == "PyMuPDF embedded text first"
|
| 116 |
+
and stack.get("ocrEngine") == "tesseract"
|
| 117 |
+
and stack.get("ocrSettings") == "OCR_RENDER_ZOOM=2 TESSERACT_PSM=4"
|
| 118 |
and stack.get("voiceId") == "silma-local"
|
| 119 |
and stack.get("audioStorage") == "worker-local retained downloads"
|
| 120 |
)
|
static/app.js
CHANGED
|
@@ -84,19 +84,19 @@ let browserSpeechSourceName = "";
|
|
| 84 |
|
| 85 |
const ocrModeLabels = {
|
| 86 |
"arabic-max": "Maximum Arabic OCR - slower",
|
| 87 |
-
arabic: "Arabic OCR -
|
| 88 |
"qari-ocr": "QARI Arabic books (best)",
|
| 89 |
"tawkeed-ocr": "Tawkeed Arabic OCR",
|
| 90 |
"katib-ocr": "KATIB Arabic OCR (lighter)",
|
| 91 |
"arabic-qwen-ocr": "Arabic-Qwen OCR",
|
| 92 |
"arabic-glm-ocr": "Arabic-GLM OCR v2",
|
| 93 |
"baseer-ocr": "Baseer Arabic OCR",
|
| 94 |
-
paddleocr: "PaddleOCR Arabic -
|
| 95 |
"paddleocr-vl": "PaddleOCR-VL heavy",
|
| 96 |
best: "Best scan test",
|
| 97 |
surya: "Surya heavy OCR",
|
| 98 |
easyocr: "General Arabic OCR",
|
| 99 |
-
tesseract: "Tesseract Arabic
|
| 100 |
auto: "Auto fallback",
|
| 101 |
};
|
| 102 |
|
|
@@ -316,7 +316,7 @@ async function loadHealth() {
|
|
| 316 |
engines.ocr?.preferred === "arabic-max"
|
| 317 |
? "Maximum Arabic OCR is ready, but slower"
|
| 318 |
: engines.ocr?.preferred === "arabic"
|
| 319 |
-
? "
|
| 320 |
: engines.ocr?.preferred === "qari-ocr"
|
| 321 |
? "QARI Arabic book OCR is ready"
|
| 322 |
: engines.ocr?.preferred === "tawkeed-ocr"
|
|
@@ -330,11 +330,13 @@ async function loadHealth() {
|
|
| 330 |
: engines.ocr?.preferred === "baseer-ocr"
|
| 331 |
? "Baseer Arabic OCR is ready"
|
| 332 |
: engines.ocr?.preferred === "paddleocr"
|
| 333 |
-
? "
|
| 334 |
: engines.ocr?.preferred === "paddleocr-vl"
|
| 335 |
? "PaddleOCR-VL heavy OCR is ready"
|
| 336 |
: engines.ocr?.preferred === "surya"
|
| 337 |
? "Surya heavy OCR is ready"
|
|
|
|
|
|
|
| 338 |
: engines.ocr?.preferred === "best"
|
| 339 |
? "Best Arabic OCR test mode is ready"
|
| 340 |
: engines.ocr?.preferred
|
|
@@ -1389,7 +1391,7 @@ function describeOcrMode() {
|
|
| 1389 |
engineNotice.textContent = `Maximum Arabic OCR selected. It tries the most engines and keeps the cleanest text, but it is slower. Use Quick test first.${installedText}`;
|
| 1390 |
engineNotice.classList.remove("warning");
|
| 1391 |
} else if (ocrModeSelect.value === "arabic") {
|
| 1392 |
-
engineNotice.textContent = `
|
| 1393 |
engineNotice.classList.remove("warning");
|
| 1394 |
} else if (ocrModeSelect.value === "qari-ocr") {
|
| 1395 |
engineNotice.textContent = "QARI Arabic books selected. Use this on a short sample or strong worker; it is trained for Arabic books, Islamic texts, manuscripts, and layout-aware Arabic transcription.";
|
|
@@ -1413,7 +1415,10 @@ function describeOcrMode() {
|
|
| 1413 |
engineNotice.textContent = "Best scan test selected. Use this on a short sample, then run the winning engine for the full book.";
|
| 1414 |
engineNotice.classList.remove("warning");
|
| 1415 |
} else if (ocrModeSelect.value === "paddleocr") {
|
| 1416 |
-
engineNotice.textContent = "PaddleOCR Arabic selected.
|
|
|
|
|
|
|
|
|
|
| 1417 |
engineNotice.classList.remove("warning");
|
| 1418 |
} else if (ocrModeSelect.value === "paddleocr-vl") {
|
| 1419 |
engineNotice.textContent = "PaddleOCR-VL selected. Use this only on a short sample or strong worker; it is much heavier than normal Arabic OCR.";
|
|
@@ -1644,8 +1649,8 @@ function showQualityHint(quality) {
|
|
| 1644 |
}
|
| 1645 |
const reasons = quality.reasons?.length ? ` ${quality.reasons.join("; ")}.` : "";
|
| 1646 |
const action = quality.quality === "poor"
|
| 1647 |
-
? "Try Arabic
|
| 1648 |
-
: "Listen to a short sample before running the full book. If it sounds wrong, try Arabic
|
| 1649 |
qualityHint.textContent = `Text needs checking.${reasons} ${action}`;
|
| 1650 |
qualityHint.classList.remove("hidden");
|
| 1651 |
qualityHint.classList.toggle("poor", quality.quality === "poor");
|
|
|
|
| 84 |
|
| 85 |
const ocrModeLabels = {
|
| 86 |
"arabic-max": "Maximum Arabic OCR - slower",
|
| 87 |
+
arabic: "Arabic OCR comparison - slower",
|
| 88 |
"qari-ocr": "QARI Arabic books (best)",
|
| 89 |
"tawkeed-ocr": "Tawkeed Arabic OCR",
|
| 90 |
"katib-ocr": "KATIB Arabic OCR (lighter)",
|
| 91 |
"arabic-qwen-ocr": "Arabic-Qwen OCR",
|
| 92 |
"arabic-glm-ocr": "Arabic-GLM OCR v2",
|
| 93 |
"baseer-ocr": "Baseer Arabic OCR",
|
| 94 |
+
paddleocr: "PaddleOCR Arabic - faster, less readable",
|
| 95 |
"paddleocr-vl": "PaddleOCR-VL heavy",
|
| 96 |
best: "Best scan test",
|
| 97 |
surya: "Surya heavy OCR",
|
| 98 |
easyocr: "General Arabic OCR",
|
| 99 |
+
tesseract: "Tesseract Arabic - Recommended readable",
|
| 100 |
auto: "Auto fallback",
|
| 101 |
};
|
| 102 |
|
|
|
|
| 316 |
engines.ocr?.preferred === "arabic-max"
|
| 317 |
? "Maximum Arabic OCR is ready, but slower"
|
| 318 |
: engines.ocr?.preferred === "arabic"
|
| 319 |
+
? "Arabic OCR comparison is ready, but slower"
|
| 320 |
: engines.ocr?.preferred === "qari-ocr"
|
| 321 |
? "QARI Arabic book OCR is ready"
|
| 322 |
: engines.ocr?.preferred === "tawkeed-ocr"
|
|
|
|
| 330 |
: engines.ocr?.preferred === "baseer-ocr"
|
| 331 |
? "Baseer Arabic OCR is ready"
|
| 332 |
: engines.ocr?.preferred === "paddleocr"
|
| 333 |
+
? "PaddleOCR Arabic is ready, but less readable"
|
| 334 |
: engines.ocr?.preferred === "paddleocr-vl"
|
| 335 |
? "PaddleOCR-VL heavy OCR is ready"
|
| 336 |
: engines.ocr?.preferred === "surya"
|
| 337 |
? "Surya heavy OCR is ready"
|
| 338 |
+
: engines.ocr?.preferred === "tesseract"
|
| 339 |
+
? "Recommended readable Tesseract Arabic OCR is ready"
|
| 340 |
: engines.ocr?.preferred === "best"
|
| 341 |
? "Best Arabic OCR test mode is ready"
|
| 342 |
: engines.ocr?.preferred
|
|
|
|
| 1391 |
engineNotice.textContent = `Maximum Arabic OCR selected. It tries the most engines and keeps the cleanest text, but it is slower. Use Quick test first.${installedText}`;
|
| 1392 |
engineNotice.classList.remove("warning");
|
| 1393 |
} else if (ocrModeSelect.value === "arabic") {
|
| 1394 |
+
engineNotice.textContent = `Arabic OCR comparison selected. It compares installed OCR paths and can be much slower than the recommended Tesseract setting.${installedText}`;
|
| 1395 |
engineNotice.classList.remove("warning");
|
| 1396 |
} else if (ocrModeSelect.value === "qari-ocr") {
|
| 1397 |
engineNotice.textContent = "QARI Arabic books selected. Use this on a short sample or strong worker; it is trained for Arabic books, Islamic texts, manuscripts, and layout-aware Arabic transcription.";
|
|
|
|
| 1415 |
engineNotice.textContent = "Best scan test selected. Use this on a short sample, then run the winning engine for the full book.";
|
| 1416 |
engineNotice.classList.remove("warning");
|
| 1417 |
} else if (ocrModeSelect.value === "paddleocr") {
|
| 1418 |
+
engineNotice.textContent = "PaddleOCR Arabic selected. It works, but the 5-page benchmark produced more fragmented text than Tesseract.";
|
| 1419 |
+
engineNotice.classList.remove("warning");
|
| 1420 |
+
} else if (ocrModeSelect.value === "tesseract") {
|
| 1421 |
+
engineNotice.textContent = "Tesseract Arabic selected. This is the recommended readable option from the 5-page OCR benchmark.";
|
| 1422 |
engineNotice.classList.remove("warning");
|
| 1423 |
} else if (ocrModeSelect.value === "paddleocr-vl") {
|
| 1424 |
engineNotice.textContent = "PaddleOCR-VL selected. Use this only on a short sample or strong worker; it is much heavier than normal Arabic OCR.";
|
|
|
|
| 1649 |
}
|
| 1650 |
const reasons = quality.reasons?.length ? ` ${quality.reasons.join("; ")}.` : "";
|
| 1651 |
const action = quality.quality === "poor"
|
| 1652 |
+
? "Try Tesseract Arabic - Recommended readable, Best scan test, or another OCR mode before creating audio."
|
| 1653 |
+
: "Listen to a short sample before running the full book. If it sounds wrong, try Tesseract Arabic - Recommended readable, Best scan test, or another OCR mode.";
|
| 1654 |
qualityHint.textContent = `Text needs checking.${reasons} ${action}`;
|
| 1655 |
qualityHint.classList.remove("hidden");
|
| 1656 |
qualityHint.classList.toggle("poor", quality.quality === "poor");
|
static/index.html
CHANGED
|
@@ -67,7 +67,8 @@
|
|
| 67 |
<div class="field-group">
|
| 68 |
<label for="ocrModeSelect">Text quality</label>
|
| 69 |
<select id="ocrModeSelect" name="ocrMode">
|
| 70 |
-
<option value="
|
|
|
|
| 71 |
<option value="arabic-max">Maximum Arabic OCR - slower</option>
|
| 72 |
<option value="qari-ocr">QARI Arabic books (best)</option>
|
| 73 |
<option value="tawkeed-ocr">Tawkeed Arabic OCR</option>
|
|
@@ -76,11 +77,10 @@
|
|
| 76 |
<option value="arabic-glm-ocr">Arabic-GLM OCR v2</option>
|
| 77 |
<option value="baseer-ocr">Baseer Arabic OCR</option>
|
| 78 |
<option value="best">Best scan test</option>
|
| 79 |
-
<option value="paddleocr">PaddleOCR Arabic -
|
| 80 |
<option value="paddleocr-vl">PaddleOCR-VL heavy</option>
|
| 81 |
<option value="surya">Surya heavy OCR</option>
|
| 82 |
<option value="easyocr">General Arabic OCR</option>
|
| 83 |
-
<option value="tesseract">Tesseract Arabic fallback</option>
|
| 84 |
<option value="auto">Auto fallback</option>
|
| 85 |
</select>
|
| 86 |
</div>
|
|
|
|
| 67 |
<div class="field-group">
|
| 68 |
<label for="ocrModeSelect">Text quality</label>
|
| 69 |
<select id="ocrModeSelect" name="ocrMode">
|
| 70 |
+
<option value="tesseract">Tesseract Arabic - Recommended readable</option>
|
| 71 |
+
<option value="arabic">Arabic OCR comparison - slower</option>
|
| 72 |
<option value="arabic-max">Maximum Arabic OCR - slower</option>
|
| 73 |
<option value="qari-ocr">QARI Arabic books (best)</option>
|
| 74 |
<option value="tawkeed-ocr">Tawkeed Arabic OCR</option>
|
|
|
|
| 77 |
<option value="arabic-glm-ocr">Arabic-GLM OCR v2</option>
|
| 78 |
<option value="baseer-ocr">Baseer Arabic OCR</option>
|
| 79 |
<option value="best">Best scan test</option>
|
| 80 |
+
<option value="paddleocr">PaddleOCR Arabic - faster, less readable</option>
|
| 81 |
<option value="paddleocr-vl">PaddleOCR-VL heavy</option>
|
| 82 |
<option value="surya">Surya heavy OCR</option>
|
| 83 |
<option value="easyocr">General Arabic OCR</option>
|
|
|
|
| 84 |
<option value="auto">Auto fallback</option>
|
| 85 |
</select>
|
| 86 |
</div>
|