"""Extract Arabic text from page images with a QARI-OCR vision-language model.

Every ``*.png`` file directly inside ``--image-dir`` is processed in sorted
path order. The cleaned text of each page is collected and written to
``--out`` with a blank line between pages. Progress is reported on stdout as
``ARABIC_READER_PROGRESS <done> <total>`` lines.
"""

from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path
from typing import Any

TAG_RE = re.compile(r"<[^>]+>")
DEFAULT_QARI_OCR_MODEL = "NAMAA-Space/Qari-OCR-0.4.0-VL-4B-Instruct"

# Matches a Markdown code fence with an optional html/markdown/text language
# tag. Because the tag is optional, bare ``` fences are matched as well.
_FENCE_RE = re.compile(r"```(?:html|markdown|text)?", flags=re.IGNORECASE)

_OCR_PROMPT = (
    "Extract only the readable Arabic text from this scanned document page. "
    "Keep the natural reading order. Do not summarize or translate."
)


def clean_model_text(text: str) -> str:
    """Strip markup from raw model output and normalise its line structure.

    Anything that looks like an ``<...>`` tag is replaced by a line break,
    Markdown code fences (optionally tagged html/markdown/text) are removed,
    and every remaining line is stripped; lines that end up empty are dropped.

    Args:
        text: Decoded model output.

    Returns:
        The non-empty stripped lines joined with ``"\\n"``; an empty string
        when nothing is left.
    """
    without_tags = TAG_RE.sub("\n", text)
    without_fences = _FENCE_RE.sub("", without_tags)
    stripped = (line.strip() for line in without_fences.splitlines())
    return "\n".join(line for line in stripped if line)


def _configure_utf8_streams() -> None:
    """Switch stdout/stderr to UTF-8 (with replacement) where supported."""
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")


def _parse_args(argv: list[str] | None) -> argparse.Namespace:
    """Parse the command line; ``None`` makes argparse read ``sys.argv``."""
    parser = argparse.ArgumentParser(
        description="Extract Arabic text from page images with QARI-OCR."
    )
    parser.add_argument("--image-dir", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    parser.add_argument("--model", default=DEFAULT_QARI_OCR_MODEL)
    parser.add_argument("--max-new-tokens", type=int, default=2048)
    return parser.parse_args(argv)


def _report_progress(done: int, total: int) -> None:
    """Emit one flushed progress line on stdout."""
    print(f"ARABIC_READER_PROGRESS {done} {total}", flush=True)


def _write_output(out_path: Path, pieces: list[str]) -> None:
    """Write the page texts to *out_path*, creating parent directories."""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n\n".join(pieces), encoding="utf-8")


def _load_processor_and_model(model_name: str, uses_qwen3_vl: bool) -> tuple[Any, Any]:
    """Load the processor and model, on CUDA in float16 when available."""
    # Heavy dependencies are imported lazily so argument errors and the
    # "no images" case never pay for them.
    import torch
    from transformers import AutoProcessor

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    processor = AutoProcessor.from_pretrained(model_name)

    if uses_qwen3_vl:
        from transformers import AutoModelForVision2Seq as model_class
    else:
        from transformers import AutoModelForImageTextToText as model_class

    model = model_class.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto" if device == "cuda" else None,
    )
    if device == "cpu":
        model.to(device)
    return processor, model


def _ocr_page(
    processor: Any,
    model: Any,
    image_path: Path,
    *,
    max_new_tokens: int,
    uses_qwen3_vl: bool,
    process_vision_info: Any = None,
) -> str:
    """Run the model on one page image and return its cleaned text.

    ``process_vision_info`` is only called when ``uses_qwen3_vl`` is false.
    """
    resolved = image_path.resolve()
    if uses_qwen3_vl:
        image_reference = str(resolved)
    else:
        image_reference = f"file://{resolved.as_posix()}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_reference},
                {"type": "text", "text": _OCR_PROMPT},
            ],
        }
    ]

    if uses_qwen3_vl:
        # The chat template tokenizes and builds the model inputs in one step.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
    else:
        # The template only renders the prompt text here; the vision inputs
        # are extracted from the messages separately.
        chat_text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[chat_text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    # Drop the leading prompt-length positions so only new tokens are decoded.
    new_token_ids = generated_ids[:, inputs.input_ids.shape[1] :]
    decoded = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return clean_model_text(decoded)


def main(argv: list[str] | None = None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list without the program name. ``None`` (the default)
            makes argparse read ``sys.argv``.

    The reported total is never below 1, even when no images are found.
    Without images the output file is still written (empty), but the model
    and its dependencies are not loaded.
    """
    _configure_utf8_streams()
    args = _parse_args(argv)

    image_paths = sorted(args.image_dir.glob("*.png"))
    total = max(len(image_paths), 1)
    _report_progress(0, total)

    if not image_paths:
        # Nothing to recognise: skip importing and loading the model.
        _write_output(args.out, [])
        return

    model_name = args.model.lower()
    uses_qwen3_vl = "qari-ocr-0.4" in model_name or "qwen3" in model_name

    process_vision_info = None
    if not uses_qwen3_vl:
        # Imported before the model is loaded so a missing package fails fast.
        from qwen_vl_utils import process_vision_info

    processor, model = _load_processor_and_model(args.model, uses_qwen3_vl)

    pieces: list[str] = []
    for index, image_path in enumerate(image_paths, start=1):
        page_text = _ocr_page(
            processor,
            model,
            image_path,
            max_new_tokens=args.max_new_tokens,
            uses_qwen3_vl=uses_qwen3_vl,
            process_vision_info=process_vision_info,
        )
        if page_text:
            pieces.append(page_text)
        _report_progress(index, total)

    _write_output(args.out, pieces)


if __name__ == "__main__":
    main()