Spaces:

Syncre
/

arabic-audio-reader-worker

Running

App Files Files Community

Syncre committed about 3 hours ago

Commit

6d5a99d

verified ·

1 Parent(s): 088795a

Deploy Arabic Audio Reader worker

Browse files

Files changed (25) hide show

.export-manifest.json +23 -22
Dockerfile +3 -1
README.md +4 -2
app/main.py +13 -7
docs/best-free-arabic-pdf-audio-stack.md +1 -1
docs/father-user-guide.md +5 -5
docs/live-deployment-checklist.md +4 -2
docs/ocr-readability-benchmark.md +42 -0
docs/production-worker-architecture.md +2 -2
docs/recommended-decision-card.json +1 -1
docs/recommended-decision-card.md +1 -1
docs/recommended-free-stack.md +2 -2
scripts/audit_goal_readiness.py +3 -3
scripts/check_deployment_readiness.py +1 -1
scripts/check_research_sources.py +3 -3
scripts/deployment_handoff.py +4 -2
scripts/export_hf_space.py +4 -2
scripts/research_watchlist.py +3 -3
scripts/setup_paddleocr.ps1 +1 -1
scripts/setup_paddleocr.sh +1 -1
scripts/validate_deployment_env.py +16 -2
scripts/verify_site.py +2 -1
scripts/verify_worker.py +2 -1
static/app.js +14 -9
static/index.html +3 -3

.export-manifest.json CHANGED Viewed

@@ -1,17 +1,18 @@
 {
   "files": {
-    "Dockerfile": "6d7affbb656f2fc67c333ee8059486fbff99f965e72f10a2c9aec0be5436837d",
     "api/index.py": "b0fd5c43eadd241aea79131d12ea40fe032a97f06410ce1b607e81c45f33d6f2",
     "app/__init__.py": "7eb70257593da06f682a3ddda54a9d260d4fc514f645237f5ca74b08f8da61a6",
-    "app/main.py": "73c5814e8fdb13e919ebb6f8efbbc6431021a5a1fe2f58b01bf6b275ac12653c",
-    "docs/best-free-arabic-pdf-audio-stack.md": "d73c1c1fca7a04517ae1abecb893175e238d36318c39c0755e9b4fa570d3910a",
-    "docs/father-user-guide.md": "8ecc38f999b58942cffd4f7626b324a32f62c421cebd280651a6e884b8ae4ecf",
     "docs/huggingface-model-metadata.md": "4590229078c2048b184787e85e5a00dd687ef5fc90e8d8d0af32538b38363cc2",
-    "docs/live-deployment-checklist.md": "9ba8b210b1c37f2dc6cad2556e17fa54621951be095394df595c373956f9a420",
-    "docs/production-worker-architecture.md": "8af1b5e5d9a586957e137f711e4a217a98580efba7848294fa9d3e23b62e0846",
-    "docs/recommended-decision-card.json": "dcedfce18b5c064b9e1f43e42d334e04b42e921e1298f573a672fd0b0ba37b66",
-    "docs/recommended-decision-card.md": "c87d8071e4d4bbd68d6a80dcd477821c38aa8919492efad5b30e32a083c0b8df",
-    "docs/recommended-free-stack.md": "f73d1fdf0e1d1edbab6b8b22c9424a420859d692f69cfc6914dc1bf348802516",
     "docs/research-watchlist.md": "9ea43e6f3f1d434e514e451ccc8974faa469f4533d0ffe924c8db3d80755e592",
     "docs/source-evidence.md": "f308cbd0dc83a5cf34ceb5a010c354cda9acbc690e9b2c93a80cdb519ec07976",
     "requirements-arabic-glm-ocr.txt": "b4c950c1ef221bfe6e2deda1a93605377619059eb02019931dbfe1fe7bd49e10",
@@ -27,21 +28,21 @@
     "requirements.txt": "59d736ba33b31a828a5987f3477bec3f7ed6f60ceafaf730ef027a0dbbcd0def",
     "scripts/arabic_glm_ocr_extract.py": "f56578018b81ac8cd7928baa9576878791214e5659f972520e8817370a9d39ad",
     "scripts/arabic_qwen_ocr_extract.py": "485e9f3cdf2ced92c666b2f483d4aa37a65cb34052a4967beac7183d02c9ddcc",
-    "scripts/audit_goal_readiness.py": "b05d40b8299521a9e98005e51886b5d68993e3ec7541558bef6fbfe7dd4c8e8d",
     "scripts/baseer_ocr_extract.py": "056ca9cc33591db804639030a16d9635931b720d0d499b444ed6e7d0a653605a",
     "scripts/benchmark_ocr.py": "b5ffb17845a7945b2a5c52e38bfabb6d82f3a8fbc8f2cdd5528843e09ad4deb4",
     "scripts/benchmark_voices.py": "705bdfb6260fe90a4a68d9d2455953ea7221d282bbf0cc1cc4fa32cd5ed10205",
-    "scripts/check_deployment_readiness.py": "475e24adaff28c4378d3eea325f4b09f30c1566355e9ea5e2d5b1b07064c9df4",
-    "scripts/check_research_sources.py": "670565c01cb48462eefe3171996963f326a22d4b106889e87a6644f3bf3aca71",
     "scripts/check_test_environment.py": "7150b13aabad03a9b7ae2527f7cdd942511658eeafb76b41128eab7e0a6dc1ea",
     "scripts/cleanup_outputs.py": "de8beacd9b8511dc3775d9c232a2c86dc5cbe91c532cb3c130c304117f0d6bfd",
     "scripts/configure_vercel_worker.py": "76051d6853a60df2ff614b5aa629bac241a32c85baeaf6234f734bac1f49a61f",
     "scripts/deploy_hf_space.py": "173be92b31c6bcb854eaf23004b0cf4029c79498bd1eff27e6b54c32370e2e22",
-    "scripts/deployment_handoff.py": "4b24f16b79b71c53419c1e75ee62dd44fdb024eee8e13094b0f6efaf4e50551e",
     "scripts/deployment_status.py": "0437afcb47147b3825978d63ff36d38157135b2faf1ee658c203ae77735a3418",
     "scripts/dry_run_pdf.py": "f162b566fb51d824d484a479a1337d4ee7e9a6762c0f2ccb5acd3173c1cf1bf8",
     "scripts/easyocr_extract.py": "5a728a80bac7d49281113410316b5004cb6538ad50d1bc7c431eaea9c006ada9",
-    "scripts/export_hf_space.py": "52eb44509cfcf9c9a27f1fc78e907e83d7495bd1dc0c1451e96fb54f219e6110",
     "scripts/export_ocr_sample_images.py": "eaf2ed2dca63f649317d283a1339ddae64d79b8d79eb42fe601d3b4a92ce8f45",
     "scripts/export_tts_sample.py": "477ae98e81c60bc3336012167355b0b8724cd1988047e2af41f443b23de7e9f3",
     "scripts/finish_live_deployment.py": "e3e7e2600071be49b747cf61f9788427339da2d2017825bda12048285e5232f8",
@@ -58,7 +59,7 @@
     "scripts/prove_local_readiness.py": "7ccb00fc2d4aa086f8cec5ac9886f87fe044eb46c6f9a7fa7c5eb095d44095ad",
     "scripts/qari_ocr_extract.py": "82ac22dae63e415c9795f4f597c000beba32af028b0a5afc749ba11bfebe2b22",
     "scripts/refresh_research_evidence.py": "52209edf8485ff459bda6ada6cb1b978f12b22cdacb03413d293dab9245303fc",
-    "scripts/research_watchlist.py": "b29d34c8a5915ec8d2c210feac84186362d170eabd411baef468759ba08b9a25",
     "scripts/score_external_ocr.py": "e4b36187949dd38eaa9395979a97ccef88f7aff24a1404e3bf0793776eea136f",
     "scripts/score_tts_preprocessor.py": "7b9afce94bbc914b884a09bb83bd4d267770f6b712ffb5721cfef7c34e2718a2",
     "scripts/score_voice_listening.py": "22287145ab5677c4e4383a01dc9cb2090f3f22e20a37e0902bcfed8df7c7e5f6",
@@ -73,8 +74,8 @@
     "scripts/setup_habibi.sh": "a737e7a8266fa47eb1eba3deeed52ff2bc91042646fff3b721afe7edefaf41d8",
     "scripts/setup_katib_ocr.ps1": "10b3e2a0781bcccec8a344f79b2639d13abcf26904025bf91ed1646fec34115e",
     "scripts/setup_katib_ocr.sh": "ee59fccd22a76c1d773c40d1e734b6f33e12d04a3e65294c635892d019c9f673",
-    "scripts/setup_paddleocr.ps1": "afaaa71132582aa7bbaaa6cf45ebc6bfffe716ca542da66c0f9e7bb849dca689",
-    "scripts/setup_paddleocr.sh": "2365608707febc705a88df84d4323fae619020011e71cb1f3d0346bcf6deb548",
     "scripts/setup_paddleocr_vl.ps1": "0840fc9f181b246bff754bff6ad6c28a2aaf80fc0771a745f0577737dea3a806",
     "scripts/setup_paddleocr_vl.sh": "38d048a154d8c55e9ab4c068a7177e33618d921951c753745ab54312ec24e1ea",
     "scripts/setup_qari_ocr.ps1": "d8dc9e30df59476dfc737d2538d84523a694c97122196690d175f13e5b5e9e6b",
@@ -91,13 +92,13 @@
     "scripts/supertonic_synthesize.py": "8223e3982de99e06091cff419d9b4584a56823b67c94b1493ce7143dd8c7f4f4",
     "scripts/surya_extract.py": "7361a8a667779c46aed71fa67b7f869d16f8067b55591d202fa968b8fc7628d7",
     "scripts/tawkeed_ocr_extract.py": "da554d5620237b70e234032b5525fcee8e9bebe9a924e5750746530c41972318",
-    "scripts/validate_deployment_env.py": "2acf5dbbed165b549e0f3b4bb72fd47aa49ed705a4a4ac49e908a63592467491",
     "scripts/verify_pipeline.py": "57359e0e4399352976100f633ce780d9a022e96885e18d29d7f5bd4c4a43a857",
-    "scripts/verify_site.py": "275e71cbb1ac19bf18286136601184dd1ab866a8e0318c8263e76e513002d071",
     "scripts/verify_voice.py": "d8fb7e473e47060b2d2f957c5c230807a205e95b1469eef9c32b76d2bc8585b2",
-    "scripts/verify_worker.py": "16b5cbef6c1ea89ada2c9df476033eda93753d0a70830b637b24cf1c833f6054",
-    "static/app.js": "375131fab5c6d9cfc3ff0d46f6443cb350451e994cec168a831f7444ef87812d",
-    "static/index.html": "eb3b3f5eb45eb58b10186f145a62822c8ae352d0783e7cb914eba6221291e853",
     "static/styles.css": "a45485cf99eaae8a46e57437a736ce1ebad2528dbf219c5bc79f124ec3c47164"
   },
   "source": "ArabicTranslator",

 {
   "files": {
+    "Dockerfile": "eb8b1b840c8303bbcc2354a0e60896cf96593ac6828df6179877b19022e97c1c",
     "api/index.py": "b0fd5c43eadd241aea79131d12ea40fe032a97f06410ce1b607e81c45f33d6f2",
     "app/__init__.py": "7eb70257593da06f682a3ddda54a9d260d4fc514f645237f5ca74b08f8da61a6",
+    "app/main.py": "585db9d4acd34f7b69591ae0f6c0807154b9317bb2a6830e1a3642bf50414e47",
+    "docs/best-free-arabic-pdf-audio-stack.md": "08234106caacc0207f404b11023656cdc39525b28fedf526e97369edf926c48f",
+    "docs/father-user-guide.md": "a05534fa8ecc4bee94704b6691947ac189f6767a95fd12eb65ae27c4ede1182f",
     "docs/huggingface-model-metadata.md": "4590229078c2048b184787e85e5a00dd687ef5fc90e8d8d0af32538b38363cc2",
+    "docs/live-deployment-checklist.md": "7fd21a9316c1d018e2bec0620defcaaca2a690f109e51b5902c7d157244834ac",
+    "docs/ocr-readability-benchmark.md": "f93f09729f5e8bd5f938afad9490b471452ca549d081ff7700161cc1dc961453",
+    "docs/production-worker-architecture.md": "1264c16b83948385026aca0fab18e7963fa5056a178fa381380659352274b4ff",
+    "docs/recommended-decision-card.json": "97e4607db20ac19cadc9b894d6406517bcb37f8ccc6ecbe6c0c41f5f2463398e",
+    "docs/recommended-decision-card.md": "f69bbe66d7977a4877f934212862159495ea5a4547997e059f5c4e1b8d6d6cb9",
+    "docs/recommended-free-stack.md": "6156deac80f5656ff4cd33d726061965b6e2a6fbc8db4ee4123b2b43e42aa40d",
     "docs/research-watchlist.md": "9ea43e6f3f1d434e514e451ccc8974faa469f4533d0ffe924c8db3d80755e592",
     "docs/source-evidence.md": "f308cbd0dc83a5cf34ceb5a010c354cda9acbc690e9b2c93a80cdb519ec07976",
     "requirements-arabic-glm-ocr.txt": "b4c950c1ef221bfe6e2deda1a93605377619059eb02019931dbfe1fe7bd49e10",
     "requirements.txt": "59d736ba33b31a828a5987f3477bec3f7ed6f60ceafaf730ef027a0dbbcd0def",
     "scripts/arabic_glm_ocr_extract.py": "f56578018b81ac8cd7928baa9576878791214e5659f972520e8817370a9d39ad",
     "scripts/arabic_qwen_ocr_extract.py": "485e9f3cdf2ced92c666b2f483d4aa37a65cb34052a4967beac7183d02c9ddcc",
+    "scripts/audit_goal_readiness.py": "4fe8f36c4ef9b8e3c492dcef894cabc7afe98b5396e1c4bd15bdcfef3da733d7",
     "scripts/baseer_ocr_extract.py": "056ca9cc33591db804639030a16d9635931b720d0d499b444ed6e7d0a653605a",
     "scripts/benchmark_ocr.py": "b5ffb17845a7945b2a5c52e38bfabb6d82f3a8fbc8f2cdd5528843e09ad4deb4",
     "scripts/benchmark_voices.py": "705bdfb6260fe90a4a68d9d2455953ea7221d282bbf0cc1cc4fa32cd5ed10205",
+    "scripts/check_deployment_readiness.py": "c371706cf94f807354a1a08f274dc17b1c02d68347b98f70b177b4c14f73bf17",
+    "scripts/check_research_sources.py": "49bc5a15cddf040f134d21e042d064d64fce2235f2ff1dd01f6b9c69cdf0c3e0",
     "scripts/check_test_environment.py": "7150b13aabad03a9b7ae2527f7cdd942511658eeafb76b41128eab7e0a6dc1ea",
     "scripts/cleanup_outputs.py": "de8beacd9b8511dc3775d9c232a2c86dc5cbe91c532cb3c130c304117f0d6bfd",
     "scripts/configure_vercel_worker.py": "76051d6853a60df2ff614b5aa629bac241a32c85baeaf6234f734bac1f49a61f",
     "scripts/deploy_hf_space.py": "173be92b31c6bcb854eaf23004b0cf4029c79498bd1eff27e6b54c32370e2e22",
+    "scripts/deployment_handoff.py": "f11b974c9bd9661f6f2fb1f893385515676c4248fce11b1177ca2bac87ce9f71",
     "scripts/deployment_status.py": "0437afcb47147b3825978d63ff36d38157135b2faf1ee658c203ae77735a3418",
     "scripts/dry_run_pdf.py": "f162b566fb51d824d484a479a1337d4ee7e9a6762c0f2ccb5acd3173c1cf1bf8",
     "scripts/easyocr_extract.py": "5a728a80bac7d49281113410316b5004cb6538ad50d1bc7c431eaea9c006ada9",
+    "scripts/export_hf_space.py": "5d6cd097cd7e251f6ced6c2198a9bb0de64000004e60d16b820a87824fe7c223",
     "scripts/export_ocr_sample_images.py": "eaf2ed2dca63f649317d283a1339ddae64d79b8d79eb42fe601d3b4a92ce8f45",
     "scripts/export_tts_sample.py": "477ae98e81c60bc3336012167355b0b8724cd1988047e2af41f443b23de7e9f3",
     "scripts/finish_live_deployment.py": "e3e7e2600071be49b747cf61f9788427339da2d2017825bda12048285e5232f8",
     "scripts/prove_local_readiness.py": "7ccb00fc2d4aa086f8cec5ac9886f87fe044eb46c6f9a7fa7c5eb095d44095ad",
     "scripts/qari_ocr_extract.py": "82ac22dae63e415c9795f4f597c000beba32af028b0a5afc749ba11bfebe2b22",
     "scripts/refresh_research_evidence.py": "52209edf8485ff459bda6ada6cb1b978f12b22cdacb03413d293dab9245303fc",
+    "scripts/research_watchlist.py": "9d50b16d7aeb7838e983e441032ea45e7ecb56015556c88a0fdf9ef1aa273649",
     "scripts/score_external_ocr.py": "e4b36187949dd38eaa9395979a97ccef88f7aff24a1404e3bf0793776eea136f",
     "scripts/score_tts_preprocessor.py": "7b9afce94bbc914b884a09bb83bd4d267770f6b712ffb5721cfef7c34e2718a2",
     "scripts/score_voice_listening.py": "22287145ab5677c4e4383a01dc9cb2090f3f22e20a37e0902bcfed8df7c7e5f6",
     "scripts/setup_habibi.sh": "a737e7a8266fa47eb1eba3deeed52ff2bc91042646fff3b721afe7edefaf41d8",
     "scripts/setup_katib_ocr.ps1": "10b3e2a0781bcccec8a344f79b2639d13abcf26904025bf91ed1646fec34115e",
     "scripts/setup_katib_ocr.sh": "ee59fccd22a76c1d773c40d1e734b6f33e12d04a3e65294c635892d019c9f673",
+    "scripts/setup_paddleocr.ps1": "1bc345d3d0f6bc0614a1b2d50fd6c3325b725a807d98c25d6b9b57c0f363ac49",
+    "scripts/setup_paddleocr.sh": "9ee6d8aa3107bd040a16e84d2b5a62e2084546d847e5b827dcf0483fd464476a",
     "scripts/setup_paddleocr_vl.ps1": "0840fc9f181b246bff754bff6ad6c28a2aaf80fc0771a745f0577737dea3a806",
     "scripts/setup_paddleocr_vl.sh": "38d048a154d8c55e9ab4c068a7177e33618d921951c753745ab54312ec24e1ea",
     "scripts/setup_qari_ocr.ps1": "d8dc9e30df59476dfc737d2538d84523a694c97122196690d175f13e5b5e9e6b",
     "scripts/supertonic_synthesize.py": "8223e3982de99e06091cff419d9b4584a56823b67c94b1493ce7143dd8c7f4f4",
     "scripts/surya_extract.py": "7361a8a667779c46aed71fa67b7f869d16f8067b55591d202fa968b8fc7628d7",
     "scripts/tawkeed_ocr_extract.py": "da554d5620237b70e234032b5525fcee8e9bebe9a924e5750746530c41972318",
+    "scripts/validate_deployment_env.py": "d42531933369e541cc451dabdb2542b9a8cc9b8739a1be292252a8b054613f37",
     "scripts/verify_pipeline.py": "57359e0e4399352976100f633ce780d9a022e96885e18d29d7f5bd4c4a43a857",
+    "scripts/verify_site.py": "7a09c02f0063f913ac76f0793dcf359684cb6d210c3c851e86934527b277295d",
     "scripts/verify_voice.py": "d8fb7e473e47060b2d2f957c5c230807a205e95b1469eef9c32b76d2bc8585b2",
+    "scripts/verify_worker.py": "73329f87852ce805ab7144df6faaab4e081099f7ebc9a2e66e93735ee7fa82cc",
+    "static/app.js": "735d2ba288d8f96b7e99d4009d0ad5ef2db845562ea5defb5a6725b3c4dc6993",
+    "static/index.html": "0877f04c78afa4078c92fea23a93ff2f97851a8c3d17dd005e3c5a56b8508288",
     "static/styles.css": "a45485cf99eaae8a46e57437a736ce1ebad2528dbf219c5bc79f124ec3c47164"
   },
   "source": "ArabicTranslator",

Dockerfile CHANGED Viewed

@@ -6,7 +6,9 @@ ENV PYTHONUNBUFFERED=1 \
     DATABASE_PATH=/data/arabic-translator/data/arabic_reader.sqlite3 \
     TESSDATA_DIR=/usr/share/tesseract-ocr/5/tessdata \
     ESPEAK_NG_EXE=/usr/bin/espeak-ng \
-    OCR_ENGINE=arabic \
     DEFAULT_VOICE_ID=silma-local \
     MAX_UPLOAD_MB=512 \
     OUTPUT_RETENTION_DAYS=7 \

     DATABASE_PATH=/data/arabic-translator/data/arabic_reader.sqlite3 \
     TESSDATA_DIR=/usr/share/tesseract-ocr/5/tessdata \
     ESPEAK_NG_EXE=/usr/bin/espeak-ng \
+    OCR_ENGINE=tesseract \
+    OCR_RENDER_ZOOM=2 \
+    TESSERACT_PSM=4 \
     DEFAULT_VOICE_ID=silma-local \
     MAX_UPLOAD_MB=512 \
     OUTPUT_RETENTION_DAYS=7 \

README.md CHANGED Viewed

@@ -28,7 +28,9 @@ SECRET_KEY=<generated by outputs\deployment-handoff.md>
 CORS_ORIGINS=https://your-vercel-app.vercel.app
 COOKIE_SAMESITE=none
 COOKIE_SECURE=1
-OCR_ENGINE=arabic
 DEFAULT_VOICE_ID=silma-local
 OUTPUT_RETENTION_DAYS=7
 OUTPUT_MAX_FILES=25
@@ -44,7 +46,7 @@ python scripts\deployment_handoff.py https://your-space.hf.space --origin https:
 Keep `outputs\deployment-handoff.md` private because it contains deployment secrets.
-The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=arabic` for balanced scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.
 Optional stronger-worker build args:

 CORS_ORIGINS=https://your-vercel-app.vercel.app
 COOKIE_SAMESITE=none
 COOKIE_SECURE=1
+OCR_ENGINE=tesseract
+OCR_RENDER_ZOOM=2
+TESSERACT_PSM=4
 DEFAULT_VOICE_ID=silma-local
 OUTPUT_RETENTION_DAYS=7
 OUTPUT_MAX_FILES=25
 Keep `outputs\deployment-handoff.md` private because it contains deployment secrets.
+The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable scanned-Arabic OCR output in the 5-page benchmark, SILMA TTS for the first clean voice, and downloadable worker audio.
 Optional stronger-worker build args:

app/main.py CHANGED Viewed

@@ -104,7 +104,7 @@ PIPER_MODEL = os.getenv("PIPER_MODEL")
 ESPEAK_NG_EXE = os.getenv("ESPEAK_NG_EXE")
 TESSERACT_EXE = os.getenv("TESSERACT_EXE")
 TESSDATA_DIR = Path(os.getenv("TESSDATA_DIR", str(DATA_DIR / "tessdata")))
-OCR_ENGINE = os.getenv("OCR_ENGINE", "arabic").lower()
 OCR_ENGINE_CHOICES = {
     "arabic",
     "arabic-max",
@@ -1203,7 +1203,7 @@ def get_engine_status() -> dict[str, object]:
                     or easyocr_ready
                     or tesseract_path
                 ),
-                "label": "Arabic OCR - Recommended balance",
                 "trainedFor": "Arabic printed text",
                 "models": [
                     "QARI-OCR Arabic book VLM",
@@ -1251,10 +1251,10 @@ def get_engine_status() -> dict[str, object]:
             "easyocr": {"available": easyocr_ready, "label": "General Arabic OCR"},
             "paddleocr": {
                 "available": paddleocr_ready,
-                "label": "PaddleOCR Arabic - Recommended balance",
                 "trainedFor": "Arabic printed text",
                 "model": "arabic_PP-OCRv5_mobile_rec",
-                "recommendedFor": "Best quality/speed balance on the current free worker",
             },
             "paddleocrVl": {
                 "available": paddleocr_vl_ready,
@@ -1311,7 +1311,12 @@ def get_engine_status() -> dict[str, object]:
                 "model": "Surya OCR 2",
                 "recommendedFor": "Hard scans on a real worker, not Vercel serverless",
             },
-            "tesseract": {"available": bool(tesseract_path), "label": "Tesseract Arabic fallback"},
             "language": os.getenv("OCR_LANGUAGE", "ara"),
         },
         "readyForArabic": bool(
@@ -1327,7 +1332,8 @@ def get_engine_status() -> dict[str, object]:
         },
         "recommendedStack": {
             "pdf": "PyMuPDF embedded text first",
-            "ocrEngine": "arabic",
             "voiceId": "silma-local",
             "audioStorage": "worker-local retained downloads",
             "benchmarkRule": "Run a representative 5-page Arabic sample before full-book audio.",
@@ -2627,7 +2633,7 @@ def ocr_pdf_text_with_tesseract(pdf_path: Path, job: Job, render_zoom: float | N
         )
     variant = render_zoom is not None or psm is not None
     render_zoom = render_zoom or float(os.getenv("OCR_RENDER_ZOOM", "2.0"))
-    psm = psm or int(os.getenv("TESSERACT_PSM", "6"))
     temp_dir = UPLOAD_DIR / f"ocr_{uuid.uuid4().hex}"
     temp_dir.mkdir(parents=True, exist_ok=True)
     pieces: list[str] = []

 ESPEAK_NG_EXE = os.getenv("ESPEAK_NG_EXE")
 TESSERACT_EXE = os.getenv("TESSERACT_EXE")
 TESSDATA_DIR = Path(os.getenv("TESSDATA_DIR", str(DATA_DIR / "tessdata")))
+OCR_ENGINE = os.getenv("OCR_ENGINE", "tesseract").lower()
 OCR_ENGINE_CHOICES = {
     "arabic",
     "arabic-max",
                     or easyocr_ready
                     or tesseract_path
                 ),
+                "label": "Arabic OCR comparison - slower",
                 "trainedFor": "Arabic printed text",
                 "models": [
                     "QARI-OCR Arabic book VLM",
             "easyocr": {"available": easyocr_ready, "label": "General Arabic OCR"},
             "paddleocr": {
                 "available": paddleocr_ready,
+                "label": "PaddleOCR Arabic - faster, less readable",
                 "trainedFor": "Arabic printed text",
                 "model": "arabic_PP-OCRv5_mobile_rec",
+                "recommendedFor": "Usable fallback, but the 5-page benchmark produced more fragmented text than Tesseract",
             },
             "paddleocrVl": {
                 "available": paddleocr_vl_ready,
                 "model": "Surya OCR 2",
                 "recommendedFor": "Hard scans on a real worker, not Vercel serverless",
             },
+            "tesseract": {
+                "available": bool(tesseract_path),
+                "label": "Tesseract Arabic - Recommended readable",
+                "trainedFor": "Arabic printed text",
+                "recommendedFor": "Best readable output on the 5-page Arabic benchmark; uses OCR_RENDER_ZOOM=2 and TESSERACT_PSM=4 by default",
+            },
             "language": os.getenv("OCR_LANGUAGE", "ara"),
         },
         "readyForArabic": bool(
         },
         "recommendedStack": {
             "pdf": "PyMuPDF embedded text first",
+            "ocrEngine": "tesseract",
+            "ocrSettings": "OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
             "voiceId": "silma-local",
             "audioStorage": "worker-local retained downloads",
             "benchmarkRule": "Run a representative 5-page Arabic sample before full-book audio.",
         )
     variant = render_zoom is not None or psm is not None
     render_zoom = render_zoom or float(os.getenv("OCR_RENDER_ZOOM", "2.0"))
+    psm = psm or int(os.getenv("TESSERACT_PSM", "4"))
     temp_dir = UPLOAD_DIR / f"ocr_{uuid.uuid4().hex}"
     temp_dir.mkdir(parents=True, exist_ok=True)
     pieces: list[str] = []

docs/best-free-arabic-pdf-audio-stack.md CHANGED Viewed

@@ -9,7 +9,7 @@ The source evidence is summarized in `docs/source-evidence.md`; verify the resea
 For this project, the best practical free local stack is:
 1. PyMuPDF for embedded PDF text.
-2. `OCR_ENGINE=arabic-max` for scanned pages by default; it compares the strongest available Arabic OCR outputs and chooses the cleanest text. When QARI-OCR, Tawkeed, KATIB, Arabic-Qwen, or Baseer is installed, this includes Arabic-trained VLM OCR candidates.
 3. EasyOCR Arabic as a strong alternate for older scans and difficult layouts.
 4. `OCR_ENGINE=best` for short quality tests, which compares the free local OCR engines and picks the best-looking Arabic text.
 5. QARI-OCR as the optional Arabic-native heavy OCR path for strong workers.

 For this project, the best practical free local stack is:
 1. PyMuPDF for embedded PDF text.
+2. `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for scanned pages by default; it produced the most readable text on the 5-page Arabic benchmark. Use `OCR_ENGINE=arabic-max` only on short samples when the default reads a specific book badly.
 3. EasyOCR Arabic as a strong alternate for older scans and difficult layouts.
 4. `OCR_ENGINE=best` for short quality tests, which compares the free local OCR engines and picks the best-looking Arabic text.
 5. QARI-OCR as the optional Arabic-native heavy OCR path for strong workers.

docs/father-user-guide.md CHANGED Viewed

@@ -8,7 +8,7 @@ This guide is for the person using the website, not for setup.
 2. Enter the access code.
 3. Choose the Arabic PDF.
 4. Leave **Voice** on the best Arabic voice unless someone tells you to change it.
-5. Leave **Text quality** on **Arabic OCR - Recommended balance** for a new scanned book.
 6. Leave **Pages** on **Quick test** first.
 7. Press **Create Audio**.
 8. Wait until the status says the audio is ready.
@@ -18,9 +18,9 @@ This guide is for the person using the website, not for setup.
 ## Which Text Quality To Choose
-Use **Arabic OCR - Recommended balance** first. It is the best normal choice for scanned Arabic books because it keeps quality high without running the slowest checks.
-Use **Maximum Arabic OCR - slower** on a short sample when the recommended option reads badly. It compares more OCR results and keeps the cleanest text, but it can take much longer.
 Use **QARI Arabic books** for a difficult scanned book when the normal option reads badly and the worker is strong enough. It is trained for Arabic books and manuscripts, but it can be much slower.
@@ -28,13 +28,13 @@ Use **KATIB Arabic OCR** when QARI is too slow or too heavy. It is also trained
 Use **Best scan test** only on a short sample. It is useful for deciding which OCR engine works best for one book, but it is too slow for most full books.
-Use **PaddleOCR Arabic - Recommended balance** when the test sounds good and you want the full book to run faster.
 Use **Tesseract Arabic fallback** when the other options are broken or when a benchmark says Tesseract worked best for that book.
 ## If Something Fails
-If the app says the text quality is poor, do not make full-book audio yet. Try **Arabic OCR - Recommended balance**, then **Maximum Arabic OCR - slower** on a short test, then **Best scan test**.
 If the first pages are title pages or blank pages, make a 5-page test PDF from better pages and test that before the full book.

 2. Enter the access code.
 3. Choose the Arabic PDF.
 4. Leave **Voice** on the best Arabic voice unless someone tells you to change it.
+5. Leave **Text quality** on **Tesseract Arabic - Recommended readable** for a new scanned book.
 6. Leave **Pages** on **Quick test** first.
 7. Press **Create Audio**.
 8. Wait until the status says the audio is ready.
 ## Which Text Quality To Choose
+Use **Tesseract Arabic - Recommended readable** first. It produced the most readable text in the 5-page Arabic OCR benchmark and is much faster than the comparison modes.
+Use **Arabic OCR comparison - slower** or **Maximum Arabic OCR - slower** on a short sample when the recommended option reads badly. They compare more OCR results and keep the cleanest text, but they can take much longer.
 Use **QARI Arabic books** for a difficult scanned book when the normal option reads badly and the worker is strong enough. It is trained for Arabic books and manuscripts, but it can be much slower.
 Use **Best scan test** only on a short sample. It is useful for deciding which OCR engine works best for one book, but it is too slow for most full books.
+Use **PaddleOCR Arabic - faster, less readable** only when Tesseract is unavailable or a short test sounds better for that book.
 Return to **Tesseract Arabic - Recommended readable** when the other options are broken or when a benchmark says Tesseract worked best for that book.
 ## If Something Fails
+If the app says the text quality is poor, do not make full-book audio yet. Try **Tesseract Arabic - Recommended readable**, then **Arabic OCR comparison - slower**, then **Best scan test** on a short sample.
 If the first pages are title pages or blank pages, make a 5-page test PDF from better pages and test that before the full book.

docs/live-deployment-checklist.md CHANGED Viewed

@@ -157,7 +157,9 @@ SECRET_KEY=<generated by outputs\deployment-handoff.md>
 CORS_ORIGINS=https://your-vercel-app.vercel.app
 COOKIE_SAMESITE=none
 COOKIE_SECURE=1
-OCR_ENGINE=arabic-max
 DEFAULT_VOICE_ID=silma-local
 OUTPUT_RETENTION_DAYS=7
 OUTPUT_MAX_FILES=25
@@ -168,7 +170,7 @@ SILMA_FORCE_TASHKEEL=0
 SILMA_NORMALIZE_NUMBERS=0
 ```
-Keep `OCR_ENGINE=arabic-max` for the first real deployment. It compares the strongest installed Arabic OCR outputs instead of trusting one engine.
 ## 3. Vercel Website

 CORS_ORIGINS=https://your-vercel-app.vercel.app
 COOKIE_SAMESITE=none
 COOKIE_SECURE=1
+OCR_ENGINE=tesseract
+OCR_RENDER_ZOOM=2
+TESSERACT_PSM=4
 DEFAULT_VOICE_ID=silma-local
 OUTPUT_RETENTION_DAYS=7
 OUTPUT_MAX_FILES=25
 SILMA_NORMALIZE_NUMBERS=0
 ```
+Keep `OCR_ENGINE=tesseract`, `OCR_RENDER_ZOOM=2`, and `TESSERACT_PSM=4` for the first real deployment. This combination produced the most readable text on the 5-page Arabic benchmark. Use `arabic-max` only on short samples when these settings read a specific book badly.
 ## 3. Vercel Website

docs/ocr-readability-benchmark.md ADDED Viewed

	@@ -0,0 +1,42 @@

+# Arabic OCR Readability Benchmark
+Last run: June 8, 2026.
+Benchmark file: `test_pdfs/arabic-reader-5-page-test.pdf`
+Scoring uses the app's `assess_text_quality` and speech-readiness metrics: Arabic word count, common Arabic word hits, one-letter fragment ratio, low-information line ratio, placeholder ratio, and total quality score. Higher score is better; `good` is preferred over `warning`.
+## Result
+Recommended OCR:
+```text
+OCR_ENGINE=tesseract
+OCR_RENDER_ZOOM=2
+TESSERACT_PSM=4
+```
+This setting produced the most readable 5-page output while staying practical for full-book jobs.
+| OCR setting | Pages | Seconds | Quality | Score | Arabic words | Fragment line ratio | Extraction |
+| --- | ---: | ---: | --- | ---: | ---: | ---: | --- |
+| Tesseract 2x PSM 4 | 5 | 37.30 | good | 11919.05 | 3120 | 0.0433 | `tesseract@2x-psm4` |
+| Tesseract 1.5x PSM 6 (previous default PSM) | 5 | 28.88 | good | 11510.50 | 3284 | 0.0166 | `tesseract@1.5x-psm6` |
+| PaddleOCR Arabic | 5 | 106.91 | warning | 8105.80 | 2251 | 0.3133 | `paddleocr` |
+| Auto fallback | 5 | 104.47 | warning | 8105.80 | 2251 | 0.3133 | `paddleocr` |
+| EasyOCR mode | 5 | 102.39 | warning | 8105.80 | 2251 | 0.3133 | `paddleocr` |
+The slower comparison modes were tested on the 1-page sample because the full 5-page comparison exceeded the 10-minute run window. Both selected the same underlying winner, `tesseract@2x-psm4`, but took about 4.5 minutes for one page:
+| OCR setting | Pages | Seconds | Quality | Score | Arabic words | Extraction |
+| --- | ---: | ---: | --- | ---: | ---: | --- |
+| Arabic OCR comparison | 1 | 280.76 | good | 3565.85 | 719 | `arabic:tesseract@2x-psm4` |
+| Maximum Arabic OCR | 1 | 268.47 | good | 3565.85 | 719 | `arabic-max:tesseract@2x-psm4` |
+## Interpretation
+`arabic` and `arabic-max` are useful short-sample diagnostics because they can compare installed OCR engines and pick the cleanest text. They are not the right default for long PDFs on the current free worker because they spend minutes per page and selected Tesseract anyway.
+PaddleOCR is available and works, but on this book sample it returned many low-information lines and more fragmented Arabic text. It remains a fallback, not the recommendation.
+The live/default website setting should therefore be `Tesseract Arabic - Recommended readable`.

docs/production-worker-architecture.md CHANGED Viewed

@@ -93,7 +93,7 @@ The worker bundle also includes setup scripts for optional heavy paths, but they
 `Dockerfile.worker` exposes `INSTALL_QARI_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_BASEER_OCR=1`, `INSTALL_PADDLEOCR_VL=1`, and `INSTALL_SUPERTONIC=1` build args so stronger workers can install QARI-OCR, KATIB, Arabic-Qwen, Baseer, PaddleOCR-VL, and the optional Supertonic CPU voice without editing the Dockerfile. This keeps the free CPU image practical while making the higher-quality free OCR paths and fast voice comparison path deployable.
-QARI-OCR 0.4 is the strongest Arabic-native OCR upgrade to test for a stronger worker. It is a 4B VLM fine-tuned for Islamic books and Arabic manuscripts, so keep it out of the default free CPU family-site worker unless a short Arabic-book benchmark proves it improves the actual pages and the worker has enough RAM/GPU. KATIB 0.8B and Arabic-Qwen3.5-OCR-v4 are the smaller Arabic-trained OCR upgrades to try when QARI is too heavy. If the worker is too small for QARI, set `QARI_OCR_MODEL=NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct` to test the lighter older QARI path. PaddleOCR-VL-1.6 remains the main general document-parser upgrade to watch. In the website, start with `Arabic OCR - Recommended balance` or `PaddleOCR Arabic - Recommended balance`; use `Maximum Arabic OCR - slower` only for short tests or difficult pages.
 The repo includes optional KATIB, Arabic-Qwen, QARI-OCR, and PaddleOCR-VL sidecar scripts for this evaluation path:
@@ -131,7 +131,7 @@ Create a new Space:
    - `CORS_ORIGINS=https://your-vercel-app.vercel.app`
    - `COOKIE_SAMESITE=none`
    - `COOKIE_SECURE=1`
-5. Keep `OCR_ENGINE=arabic-max` for quality, or switch to `OCR_ENGINE=paddleocr` only when you need a faster single-engine run.
 Then set this on Vercel:

 `Dockerfile.worker` exposes `INSTALL_QARI_OCR=1`, `INSTALL_KATIB_OCR=1`, `INSTALL_ARABIC_QWEN_OCR=1`, `INSTALL_BASEER_OCR=1`, `INSTALL_PADDLEOCR_VL=1`, and `INSTALL_SUPERTONIC=1` build args so stronger workers can install QARI-OCR, KATIB, Arabic-Qwen, Baseer, PaddleOCR-VL, and the optional Supertonic CPU voice without editing the Dockerfile. This keeps the free CPU image practical while making the higher-quality free OCR paths and fast voice comparison path deployable.
+QARI-OCR 0.4 is the strongest Arabic-native OCR upgrade to test for a stronger worker. It is a 4B VLM fine-tuned for Islamic books and Arabic manuscripts, so keep it out of the default free CPU family-site worker unless a short Arabic-book benchmark proves it improves the actual pages and the worker has enough RAM/GPU. KATIB 0.8B and Arabic-Qwen3.5-OCR-v4 are the smaller Arabic-trained OCR upgrades to try when QARI is too heavy. If the worker is too small for QARI, set `QARI_OCR_MODEL=NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct` to test the lighter older QARI path. PaddleOCR-VL-1.6 remains the main general document-parser upgrade to watch. In the website, start with `Tesseract Arabic - Recommended readable`; use `Arabic OCR comparison - slower` or `Maximum Arabic OCR - slower` only for short tests or difficult pages.
 The repo includes optional KATIB, Arabic-Qwen, QARI-OCR, and PaddleOCR-VL sidecar scripts for this evaluation path:
    - `CORS_ORIGINS=https://your-vercel-app.vercel.app`
    - `COOKIE_SAMESITE=none`
    - `COOKIE_SECURE=1`
+5. Keep `OCR_ENGINE=tesseract`, `OCR_RENDER_ZOOM=2`, and `TESSERACT_PSM=4` (three separate variables) for readable full-book runs, or switch to a slower comparison mode only when a short sample proves it is better for that book.
 Then set this on Vercel:

docs/recommended-decision-card.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "title": "Recommended Free Arabic PDF To Audio Decision Card",
   "currentDefault": {
     "pdf": "PyMuPDF embedded text first",
-    "scannedPdfOcr": "OCR_ENGINE=arabic",
     "voice": "SILMA TTS (silma-local)",
     "audioStorage": "worker-local retained downloads",
     "hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL"

   "title": "Recommended Free Arabic PDF To Audio Decision Card",
   "currentDefault": {
     "pdf": "PyMuPDF embedded text first",
+    "scannedPdfOcr": "OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
     "voice": "SILMA TTS (silma-local)",
     "audioStorage": "worker-local retained downloads",
     "hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL"

docs/recommended-decision-card.md CHANGED Viewed

@@ -5,7 +5,7 @@
 | Layer | Choice |
 | --- | --- |
 | pdf | PyMuPDF embedded text first |
-| scannedPdfOcr | OCR_ENGINE=arabic |
 | voice | SILMA TTS (silma-local) |
 | audioStorage | worker-local retained downloads |
 | hosting | Vercel shell plus Docker worker via WORKER_BASE_URL |

 | Layer | Choice |
 | --- | --- |
 | pdf | PyMuPDF embedded text first |
+| scannedPdfOcr | OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4 |
 | voice | SILMA TTS (silma-local) |
 | audioStorage | worker-local retained downloads |
 | hosting | Vercel shell plus Docker worker via WORKER_BASE_URL |

docs/recommended-free-stack.md CHANGED Viewed

@@ -7,7 +7,7 @@ This is the compact decision report generated from the current research watchlis
 | Layer | Recommendation | Why |
 | --- | --- | --- |
 | Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |
-| Scanned PDFs | `OCR_ENGINE=arabic` | It uses the best installed Arabic OCR path without the slowest heavy tests, keeping quality high while staying practical for full books. |
 | Default voice | SILMA TTS | Arabic-focused Fusha/MSA voice with normalization and tashkeel options. |
 | Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |
 | Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |
@@ -83,4 +83,4 @@ Promote a model only when all of these are true:
 5. Its runtime is acceptable for the target worker.
 6. The generated JSON score passes `scripts\model_promotion_gate.py` after human review.
-Current practical default: PyMuPDF -> `arabic` OCR -> SILMA TTS -> downloadable worker audio.

 | Layer | Recommendation | Why |
 | --- | --- | --- |
 | Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |
+| Scanned PDFs | `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` | It produced the most readable text on the 5-page Arabic OCR benchmark while staying much faster than the comparison modes. |
 | Default voice | SILMA TTS | Arabic-focused Fusha/MSA voice with normalization and tashkeel options. |
 | Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |
 | Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |
 5. Its runtime is acceptable for the target worker.
 6. The generated JSON score passes `scripts\model_promotion_gate.py` after human review.
+Current practical default: PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS -> downloadable worker audio.

scripts/audit_goal_readiness.py CHANGED Viewed

@@ -259,7 +259,7 @@ def collect_checks(
         "PASS"
         if has_all(
             readme + production + deployment_checklist + dockerfile,
-            ["WORKER_BASE_URL", "Docker", "OCR_ENGINE=arabic", "AUDIO_FORMAT=mp3", "worker-verification.json"],
         )
         and has_all(deployment_handoff, ["WORKER_BASE_URL", "prove_live_deployment.py", "worker-verification.json"])
         and has_all(
@@ -395,12 +395,12 @@ def collect_checks(
             [
                 "Recommended Free Arabic PDF To Audio Stack",
                 "PyMuPDF text extraction first",
-                "`OCR_ENGINE=arabic`",
                 "SILMA TTS",
                 "4.5 MB function payload limit",
                 "50 GB but non-persistent",
                 "Benchmark Before Promoting",
-                "PyMuPDF -> `arabic` OCR -> SILMA TTS",
             ],
         )
         and has_all(

         "PASS"
         if has_all(
             readme + production + deployment_checklist + dockerfile,
+            ["WORKER_BASE_URL", "Docker", "OCR_ENGINE=tesseract", "OCR_RENDER_ZOOM=2", "TESSERACT_PSM=4", "AUDIO_FORMAT=mp3", "worker-verification.json"],
         )
         and has_all(deployment_handoff, ["WORKER_BASE_URL", "prove_live_deployment.py", "worker-verification.json"])
         and has_all(
             [
                 "Recommended Free Arabic PDF To Audio Stack",
                 "PyMuPDF text extraction first",
+                "`OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4`",
                 "SILMA TTS",
                 "4.5 MB function payload limit",
                 "50 GB but non-persistent",
                 "Benchmark Before Promoting",
+                "PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS",
             ],
         )
         and has_all(

scripts/check_deployment_readiness.py CHANGED Viewed

@@ -216,7 +216,7 @@ def check_worker(root: Path = ROOT_DIR) -> list[Check]:
     add(checks, "Worker", "base image", "PASS" if "python:3.10" in dockerfile else "WARN", "Python 3.10 is expected")
     for package in ["tesseract-ocr-ara", "espeak-ng", "ffmpeg"]:
         add(checks, "Worker", f"apt package {package}", "PASS" if package in dockerfile else "FAIL", package)
-    for env_key in ["WORK_DIR", "DATABASE_PATH", "OCR_ENGINE=arabic", "AUDIO_FORMAT=mp3"]:
         add(checks, "Worker", f"env {env_key}", "PASS" if env_key in dockerfile else "WARN", env_key)
     for arg in [
         "ARG INSTALL_QARI_OCR=0",

     add(checks, "Worker", "base image", "PASS" if "python:3.10" in dockerfile else "WARN", "Python 3.10 is expected")
     for package in ["tesseract-ocr-ara", "espeak-ng", "ffmpeg"]:
         add(checks, "Worker", f"apt package {package}", "PASS" if package in dockerfile else "FAIL", package)
+    for env_key in ["WORK_DIR", "DATABASE_PATH", "OCR_ENGINE=tesseract", "OCR_RENDER_ZOOM=2", "TESSERACT_PSM=4", "AUDIO_FORMAT=mp3"]:
         add(checks, "Worker", f"env {env_key}", "PASS" if env_key in dockerfile else "WARN", env_key)
     for arg in [
         "ARG INSTALL_QARI_OCR=0",

scripts/check_research_sources.py CHANGED Viewed

@@ -265,17 +265,17 @@ REQUIRED_METADATA_MARKERS = [
 REQUIRED_RECOMMENDATION_MARKERS = [
     "Recommended Free Arabic PDF To Audio Stack",
     "PyMuPDF text extraction first",
-    "`OCR_ENGINE=arabic`",
     "SILMA TTS",
     "Vercel shell plus Docker worker",
     "Benchmark Before Promoting",
     "model_promotion_gate.py",
-    "PyMuPDF -> `arabic` OCR -> SILMA TTS",
 ]
 REQUIRED_DECISION_CARD_MARKERS = [
     "Recommended Free Arabic PDF To Audio Decision Card",
     "PyMuPDF embedded text first",
-    "OCR_ENGINE=arabic",
     "SILMA TTS",
     "worker-local retained downloads",
     "Vercel shell plus Docker worker",

 REQUIRED_RECOMMENDATION_MARKERS = [
     "Recommended Free Arabic PDF To Audio Stack",
     "PyMuPDF text extraction first",
+    "`OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4`",
     "SILMA TTS",
     "Vercel shell plus Docker worker",
     "Benchmark Before Promoting",
     "model_promotion_gate.py",
+    "PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS",
 ]
 REQUIRED_DECISION_CARD_MARKERS = [
     "Recommended Free Arabic PDF To Audio Decision Card",
     "PyMuPDF embedded text first",
+    "OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
     "SILMA TTS",
     "worker-local retained downloads",
     "Vercel shell plus Docker worker",

scripts/deployment_handoff.py CHANGED Viewed

@@ -72,7 +72,9 @@ def build_handoff(
         "CORS_ORIGINS": vercel_origin,
         "COOKIE_SAMESITE": "none",
         "COOKIE_SECURE": "1",
-        "OCR_ENGINE": "arabic",
         "DEFAULT_VOICE_ID": "silma-local",
         "OUTPUT_RETENTION_DAYS": "7",
         "OUTPUT_MAX_FILES": "25",
@@ -223,7 +225,7 @@ def write_markdown(path: Path, handoff: DeploymentHandoff) -> None:
                 "- Set Vercel `WORKER_BASE_URL` to the exact Hugging Face worker URL shown above.",
                 "- After both deployments finish, run the Vercel worker diagnostic command below before uploading a large PDF. It must show `site worker reachable from vercel` and `site worker CORS ready`.",
                 "- Remove Vercel's temporary direct Hugging Face TTS fallback variables for production: `ENABLE_DIRECT_CLOUD_TTS`, `HF_API_TOKEN`, `HF_TTS_MODEL`, and `DEFAULT_VOICE_ID`.",
-                "- Keep `OCR_ENGINE=arabic` for normal scanned Arabic books; use `arabic-max` only when a short sample needs the slower maximum comparison.",
                 "- Do not commit this handoff; it contains the deployment `SECRET_KEY`.",
                 "",
                 "## Hugging Face Docker Build Args",

         "CORS_ORIGINS": vercel_origin,
         "COOKIE_SAMESITE": "none",
         "COOKIE_SECURE": "1",
+        "OCR_ENGINE": "tesseract",
+        "OCR_RENDER_ZOOM": "2",
+        "TESSERACT_PSM": "4",
         "DEFAULT_VOICE_ID": "silma-local",
         "OUTPUT_RETENTION_DAYS": "7",
         "OUTPUT_MAX_FILES": "25",
                 "- Set Vercel `WORKER_BASE_URL` to the exact Hugging Face worker URL shown above.",
                 "- After both deployments finish, run the Vercel worker diagnostic command below before uploading a large PDF. It must show `site worker reachable from vercel` and `site worker CORS ready`.",
                 "- Remove Vercel's temporary direct Hugging Face TTS fallback variables for production: `ENABLE_DIRECT_CLOUD_TTS`, `HF_API_TOKEN`, `HF_TTS_MODEL`, and `DEFAULT_VOICE_ID`.",
+                "- Keep `OCR_ENGINE=tesseract`, `OCR_RENDER_ZOOM=2`, and `TESSERACT_PSM=4` for normal scanned Arabic books; use `arabic-max` only when a short sample needs the slower maximum comparison.",
                 "- Do not commit this handoff; it contains the deployment `SECRET_KEY`.",
                 "",
                 "## Hugging Face Docker Build Args",

scripts/export_hf_space.py CHANGED Viewed

@@ -216,7 +216,9 @@ SECRET_KEY=<generated by outputs\\deployment-handoff.md>
 CORS_ORIGINS=https://your-vercel-app.vercel.app
 COOKIE_SAMESITE=none
 COOKIE_SECURE=1
-OCR_ENGINE=arabic
 DEFAULT_VOICE_ID=silma-local
 OUTPUT_RETENTION_DAYS=7
 OUTPUT_MAX_FILES=25
@@ -232,7 +234,7 @@ python scripts\\deployment_handoff.py https://your-space.hf.space --origin https
 Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.
-The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=arabic` for balanced scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.
 Optional stronger-worker build args:

 CORS_ORIGINS=https://your-vercel-app.vercel.app
 COOKIE_SAMESITE=none
 COOKIE_SECURE=1
+OCR_ENGINE=tesseract
+OCR_RENDER_ZOOM=2
+TESSERACT_PSM=4
 DEFAULT_VOICE_ID=silma-local
 OUTPUT_RETENTION_DAYS=7
 OUTPUT_MAX_FILES=25
 Keep `outputs\\deployment-handoff.md` private because it contains deployment secrets.
+The compact process recommendation is included at `docs/recommended-free-stack.md`, with the machine-readable deployment decision card at `docs/recommended-decision-card.json` and its readable companion at `docs/recommended-decision-card.md`. The current practical default is PyMuPDF embedded text first, `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` for the most readable tested scanned Arabic OCR, SILMA TTS for the first clean voice, and downloadable worker audio.
 Optional stronger-worker build args:

scripts/research_watchlist.py CHANGED Viewed

@@ -1488,7 +1488,7 @@ def build_recommendation_report(candidates: list[Candidate]) -> str:
         "| Layer | Recommendation | Why |",
         "| --- | --- | --- |",
         "| Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |",
-        "| Scanned PDFs | `OCR_ENGINE=arabic` | It uses the best installed Arabic OCR path without the slowest heavy tests, keeping quality high while staying practical for full books. |",
         f"| Default voice | {default_voice.name} | {default_voice.why} |",
         "| Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |",
         "| Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |",
@@ -1544,7 +1544,7 @@ def build_recommendation_report(candidates: list[Candidate]) -> str:
             "5. Its runtime is acceptable for the target worker.",
             "6. The generated JSON score passes `scripts\\model_promotion_gate.py` after human review.",
             "",
-            "Current practical default: PyMuPDF -> `arabic` OCR -> SILMA TTS -> downloadable worker audio.",
             "",
         ]
     )
@@ -1563,7 +1563,7 @@ def build_decision_card(candidates: list[Candidate]) -> dict[str, object]:
         "title": "Recommended Free Arabic PDF To Audio Decision Card",
         "currentDefault": {
             "pdf": "PyMuPDF embedded text first",
-            "scannedPdfOcr": "OCR_ENGINE=arabic",
             "voice": "SILMA TTS (silma-local)",
             "audioStorage": "worker-local retained downloads",
             "hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL",

         "| Layer | Recommendation | Why |",
         "| --- | --- | --- |",
         "| Embedded PDFs | PyMuPDF text extraction first | It is free, fast, and avoids OCR errors when the PDF already contains usable Arabic text. |",
+        "| Scanned PDFs | `OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4` | It produced the most readable text on the 5-page Arabic OCR benchmark while staying much faster than the comparison modes. |",
         f"| Default voice | {default_voice.name} | {default_voice.why} |",
         "| Download/storage | Worker-local retained audio files | Free by default and avoids Vercel's 4.5 MB function payload limit; Hugging Face free CPU disk is 50 GB but non-persistent, so downloads are short-lived. |",
         "| Hosted shape | Vercel shell plus Docker worker via `WORKER_BASE_URL` | Vercel serves the easy website while the worker handles large PDFs, OCR, and TTS on free CPU Space hardware when the job size is reasonable. |",
             "5. Its runtime is acceptable for the target worker.",
             "6. The generated JSON score passes `scripts\\model_promotion_gate.py` after human review.",
             "",
+            "Current practical default: PyMuPDF -> `tesseract@2x-psm4` OCR -> SILMA TTS -> downloadable worker audio.",
             "",
         ]
     )
         "title": "Recommended Free Arabic PDF To Audio Decision Card",
         "currentDefault": {
             "pdf": "PyMuPDF embedded text first",
+            "scannedPdfOcr": "OCR_ENGINE=tesseract OCR_RENDER_ZOOM=2 TESSERACT_PSM=4",
             "voice": "SILMA TTS (silma-local)",
             "audioStorage": "worker-local retained downloads",
             "hosting": "Vercel shell plus Docker worker via WORKER_BASE_URL",

scripts/setup_paddleocr.ps1 CHANGED Viewed

@@ -15,4 +15,4 @@ $pythonExe = Join-Path $venv "Scripts\python.exe"
 & $pythonExe -m pip install -r (Join-Path $root "requirements-paddleocr.txt")
 Write-Host "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $venv"
-Write-Host "Use OCR_ENGINE=arabic-max or choose Maximum Arabic-trained OCR in the website."

 & $pythonExe -m pip install -r (Join-Path $root "requirements-paddleocr.txt")
 Write-Host "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $venv"
+Write-Host "Use OCR_ENGINE=paddleocr only as a fallback; the website's readable default is Tesseract Arabic."

scripts/setup_paddleocr.sh CHANGED Viewed

@@ -13,4 +13,4 @@ fi
 "$VENV/bin/python" -m pip install -r "$ROOT/requirements-paddleocr.txt"
 echo "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $VENV"
-echo "Use OCR_ENGINE=arabic-max or choose Maximum Arabic-trained OCR in the website."

 "$VENV/bin/python" -m pip install -r "$ROOT/requirements-paddleocr.txt"
 echo "PaddleOCR Arabic PP-OCRv5 sidecar is ready at $VENV"
+echo "Use OCR_ENGINE=paddleocr only as a fallback; the website's readable default is Tesseract Arabic."

scripts/validate_deployment_env.py CHANGED Viewed

@@ -228,8 +228,22 @@ def validate_worker_env(
         checks,
         "Worker",
         "OCR_ENGINE",
-        "PASS" if env.get("OCR_ENGINE", "arabic") == "arabic" else "WARN",
-        env.get("OCR_ENGINE", "arabic"),
     )
     add(
         checks,

         checks,
         "Worker",
         "OCR_ENGINE",
+        "PASS" if env.get("OCR_ENGINE", "tesseract") == "tesseract" else "WARN",
+        env.get("OCR_ENGINE", "tesseract"),
+    )
+    add(
+        checks,
+        "Worker",
+        "OCR_RENDER_ZOOM",
+        "PASS" if env.get("OCR_RENDER_ZOOM", "2") == "2" else "WARN",
+        env.get("OCR_RENDER_ZOOM", "2"),
+    )
+    add(
+        checks,
+        "Worker",
+        "TESSERACT_PSM",
+        "PASS" if env.get("TESSERACT_PSM", "4") == "4" else "WARN",
+        env.get("TESSERACT_PSM", "4"),
     )
     add(
         checks,

scripts/verify_site.py CHANGED Viewed

@@ -75,7 +75,8 @@ def verify_site(
             checks,
             "site recommended stack documented",
             recommended_stack.get("pdf") == "PyMuPDF embedded text first"
-            and recommended_stack.get("ocrEngine") == "arabic"
             and recommended_stack.get("voiceId") == "silma-local"
             and recommended_stack.get("audioStorage") == "worker-local retained downloads",
             json.dumps(recommended_stack),

             checks,
             "site recommended stack documented",
             recommended_stack.get("pdf") == "PyMuPDF embedded text first"
+            and recommended_stack.get("ocrEngine") == "tesseract"
+            and recommended_stack.get("ocrSettings") == "OCR_RENDER_ZOOM=2 TESSERACT_PSM=4"
             and recommended_stack.get("voiceId") == "silma-local"
             and recommended_stack.get("audioStorage") == "worker-local retained downloads",
             json.dumps(recommended_stack),

scripts/verify_worker.py CHANGED Viewed

@@ -113,7 +113,8 @@ def has_recommended_stack(engines: dict[str, Any]) -> bool:
     stack = recommended_stack_summary(engines)
     return bool(
         stack.get("pdf") == "PyMuPDF embedded text first"
-        and stack.get("ocrEngine") == "arabic"
         and stack.get("voiceId") == "silma-local"
         and stack.get("audioStorage") == "worker-local retained downloads"
     )

     stack = recommended_stack_summary(engines)
     return bool(
         stack.get("pdf") == "PyMuPDF embedded text first"
+        and stack.get("ocrEngine") == "tesseract"
+        and stack.get("ocrSettings") == "OCR_RENDER_ZOOM=2 TESSERACT_PSM=4"
         and stack.get("voiceId") == "silma-local"
         and stack.get("audioStorage") == "worker-local retained downloads"
     )

static/app.js CHANGED Viewed

@@ -84,19 +84,19 @@ let browserSpeechSourceName = "";
 const ocrModeLabels = {
   "arabic-max": "Maximum Arabic OCR - slower",
-  arabic: "Arabic OCR - Recommended balance",
   "qari-ocr": "QARI Arabic books (best)",
   "tawkeed-ocr": "Tawkeed Arabic OCR",
   "katib-ocr": "KATIB Arabic OCR (lighter)",
   "arabic-qwen-ocr": "Arabic-Qwen OCR",
   "arabic-glm-ocr": "Arabic-GLM OCR v2",
   "baseer-ocr": "Baseer Arabic OCR",
-  paddleocr: "PaddleOCR Arabic - Recommended balance",
   "paddleocr-vl": "PaddleOCR-VL heavy",
   best: "Best scan test",
   surya: "Surya heavy OCR",
   easyocr: "General Arabic OCR",
-  tesseract: "Tesseract Arabic fallback",
   auto: "Auto fallback",
 };
@@ -316,7 +316,7 @@ async function loadHealth() {
         engines.ocr?.preferred === "arabic-max"
           ? "Maximum Arabic OCR is ready, but slower"
         : engines.ocr?.preferred === "arabic"
-          ? "Recommended balanced Arabic OCR is ready"
           : engines.ocr?.preferred === "qari-ocr"
           ? "QARI Arabic book OCR is ready"
           : engines.ocr?.preferred === "tawkeed-ocr"
@@ -330,11 +330,13 @@ async function loadHealth() {
           : engines.ocr?.preferred === "baseer-ocr"
           ? "Baseer Arabic OCR is ready"
         : engines.ocr?.preferred === "paddleocr"
-          ? "Recommended fast PaddleOCR Arabic is ready"
           : engines.ocr?.preferred === "paddleocr-vl"
           ? "PaddleOCR-VL heavy OCR is ready"
           : engines.ocr?.preferred === "surya"
             ? "Surya heavy OCR is ready"
           : engines.ocr?.preferred === "best"
             ? "Best Arabic OCR test mode is ready"
             : engines.ocr?.preferred
@@ -1389,7 +1391,7 @@ function describeOcrMode() {
     engineNotice.textContent = `Maximum Arabic OCR selected. It tries the most engines and keeps the cleanest text, but it is slower. Use Quick test first.${installedText}`;
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "arabic") {
-    engineNotice.textContent = `Recommended balance selected. It uses the best installed Arabic OCR path without the slowest heavy tests, so it is the best starting choice for full books.${installedText}`;
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "qari-ocr") {
     engineNotice.textContent = "QARI Arabic books selected. Use this on a short sample or strong worker; it is trained for Arabic books, Islamic texts, manuscripts, and layout-aware Arabic transcription.";
@@ -1413,7 +1415,10 @@ function describeOcrMode() {
     engineNotice.textContent = "Best scan test selected. Use this on a short sample, then run the winning engine for the full book.";
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "paddleocr") {
-    engineNotice.textContent = "PaddleOCR Arabic selected. This is the fastest recommended balance on the current worker for scanned Arabic text.";
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "paddleocr-vl") {
     engineNotice.textContent = "PaddleOCR-VL selected. Use this only on a short sample or strong worker; it is much heavier than normal Arabic OCR.";
@@ -1644,8 +1649,8 @@ function showQualityHint(quality) {
   }
   const reasons = quality.reasons?.length ? ` ${quality.reasons.join("; ")}.` : "";
   const action = quality.quality === "poor"
-    ? "Try Arabic OCR - Recommended balance, Best scan test, or another OCR mode before creating audio."
-    : "Listen to a short sample before running the full book. If it sounds wrong, try Arabic OCR - Recommended balance, Best scan test, or another OCR mode.";
   qualityHint.textContent = `Text needs checking.${reasons} ${action}`;
   qualityHint.classList.remove("hidden");
   qualityHint.classList.toggle("poor", quality.quality === "poor");

 const ocrModeLabels = {
   "arabic-max": "Maximum Arabic OCR - slower",
+  arabic: "Arabic OCR comparison - slower",
   "qari-ocr": "QARI Arabic books (best)",
   "tawkeed-ocr": "Tawkeed Arabic OCR",
   "katib-ocr": "KATIB Arabic OCR (lighter)",
   "arabic-qwen-ocr": "Arabic-Qwen OCR",
   "arabic-glm-ocr": "Arabic-GLM OCR v2",
   "baseer-ocr": "Baseer Arabic OCR",
+  paddleocr: "PaddleOCR Arabic - faster, less readable",
   "paddleocr-vl": "PaddleOCR-VL heavy",
   best: "Best scan test",
   surya: "Surya heavy OCR",
   easyocr: "General Arabic OCR",
+  tesseract: "Tesseract Arabic - Recommended readable",
   auto: "Auto fallback",
 };
         engines.ocr?.preferred === "arabic-max"
           ? "Maximum Arabic OCR is ready, but slower"
         : engines.ocr?.preferred === "arabic"
+          ? "Arabic OCR comparison is ready, but slower"
           : engines.ocr?.preferred === "qari-ocr"
           ? "QARI Arabic book OCR is ready"
           : engines.ocr?.preferred === "tawkeed-ocr"
           : engines.ocr?.preferred === "baseer-ocr"
           ? "Baseer Arabic OCR is ready"
         : engines.ocr?.preferred === "paddleocr"
+          ? "PaddleOCR Arabic is ready, but less readable"
           : engines.ocr?.preferred === "paddleocr-vl"
           ? "PaddleOCR-VL heavy OCR is ready"
           : engines.ocr?.preferred === "surya"
             ? "Surya heavy OCR is ready"
+          : engines.ocr?.preferred === "tesseract"
+            ? "Recommended readable Tesseract Arabic OCR is ready"
           : engines.ocr?.preferred === "best"
             ? "Best Arabic OCR test mode is ready"
             : engines.ocr?.preferred
     engineNotice.textContent = `Maximum Arabic OCR selected. It tries the most engines and keeps the cleanest text, but it is slower. Use Quick test first.${installedText}`;
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "arabic") {
+    engineNotice.textContent = `Arabic OCR comparison selected. It compares installed OCR paths and can be much slower than the recommended Tesseract setting.${installedText}`;
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "qari-ocr") {
     engineNotice.textContent = "QARI Arabic books selected. Use this on a short sample or strong worker; it is trained for Arabic books, Islamic texts, manuscripts, and layout-aware Arabic transcription.";
     engineNotice.textContent = "Best scan test selected. Use this on a short sample, then run the winning engine for the full book.";
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "paddleocr") {
+    engineNotice.textContent = "PaddleOCR Arabic selected. It works, but the 5-page benchmark produced more fragmented text than Tesseract.";
+    engineNotice.classList.remove("warning");
+  } else if (ocrModeSelect.value === "tesseract") {
+    engineNotice.textContent = "Tesseract Arabic selected. This is the recommended readable option from the 5-page OCR benchmark.";
     engineNotice.classList.remove("warning");
   } else if (ocrModeSelect.value === "paddleocr-vl") {
     engineNotice.textContent = "PaddleOCR-VL selected. Use this only on a short sample or strong worker; it is much heavier than normal Arabic OCR.";
   }
   const reasons = quality.reasons?.length ? ` ${quality.reasons.join("; ")}.` : "";
   const action = quality.quality === "poor"
+    ? "Try Tesseract Arabic - Recommended readable, Best scan test, or another OCR mode before creating audio."
+    : "Listen to a short sample before running the full book. If it sounds wrong, try Tesseract Arabic - Recommended readable, Best scan test, or another OCR mode.";
   qualityHint.textContent = `Text needs checking.${reasons} ${action}`;
   qualityHint.classList.remove("hidden");
   qualityHint.classList.toggle("poor", quality.quality === "poor");

static/index.html CHANGED Viewed

@@ -67,7 +67,8 @@
           <div class="field-group">
             <label for="ocrModeSelect">Text quality</label>
             <select id="ocrModeSelect" name="ocrMode">
-              <option value="arabic">Arabic OCR - Recommended balance</option>
               <option value="arabic-max">Maximum Arabic OCR - slower</option>
               <option value="qari-ocr">QARI Arabic books (best)</option>
               <option value="tawkeed-ocr">Tawkeed Arabic OCR</option>
@@ -76,11 +77,10 @@
               <option value="arabic-glm-ocr">Arabic-GLM OCR v2</option>
               <option value="baseer-ocr">Baseer Arabic OCR</option>
               <option value="best">Best scan test</option>
-              <option value="paddleocr">PaddleOCR Arabic - Recommended balance</option>
               <option value="paddleocr-vl">PaddleOCR-VL heavy</option>
               <option value="surya">Surya heavy OCR</option>
               <option value="easyocr">General Arabic OCR</option>
-              <option value="tesseract">Tesseract Arabic fallback</option>
               <option value="auto">Auto fallback</option>
             </select>
           </div>

           <div class="field-group">
             <label for="ocrModeSelect">Text quality</label>
             <select id="ocrModeSelect" name="ocrMode">
+              <option value="tesseract">Tesseract Arabic - Recommended readable</option>
+              <option value="arabic">Arabic OCR comparison - slower</option>
               <option value="arabic-max">Maximum Arabic OCR - slower</option>
               <option value="qari-ocr">QARI Arabic books (best)</option>
               <option value="tawkeed-ocr">Tawkeed Arabic OCR</option>
               <option value="arabic-glm-ocr">Arabic-GLM OCR v2</option>
               <option value="baseer-ocr">Baseer Arabic OCR</option>
               <option value="best">Best scan test</option>
+              <option value="paddleocr">PaddleOCR Arabic - faster, less readable</option>
               <option value="paddleocr-vl">PaddleOCR-VL heavy</option>
               <option value="surya">Surya heavy OCR</option>
               <option value="easyocr">General Arabic OCR</option>
               <option value="auto">Auto fallback</option>
             </select>
           </div>