"""Run the DeepSeek-OCR-2 model over a directory of page images on CPU.

Each ``.png`` in the input directory is OCR'd individually; the per-page
text is written next to the combined Markdown document in the output
directory.
"""

import os
import re
import time

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Force CPU for stability on Mac
device = "cpu"
print(f"Using device: {device}")

# Patch to avoid CUDA calls in custom code: any ``.cuda()`` call on a tensor
# or module is redirected to ``.to(device)``.
torch.Tensor.cuda = lambda self, *args, **kwargs: self.to(device)
torch.nn.Module.cuda = lambda self, *args, **kwargs: self.to(device)

model_name = 'deepseek-ai/DeepSeek-OCR-2'

# Compiled once; used both for sorting and for page labels.
_PAGE_RE = re.compile(r'page_(\d+)')

# NOTE(review): the prompt literal was damaged in the source this was
# recovered from (a physical line break inside the string and no image
# placeholder). It is restored here to the "<image>\n..." form used in the
# model card's examples -- confirm against the original script.
_PROMPT = "<image>\nFree OCR. "


def _page_number(filename):
    """Return the integer following ``page_`` in *filename*, or 0 if absent."""
    match = _PAGE_RE.search(filename)
    return int(match.group(1)) if match else 0


def _list_page_images(image_dir):
    """Return the ``.png`` file names in *image_dir* ordered by page number.

    The file name is used as a tie-breaker so that files sharing a page
    number (or lacking one) come out in a deterministic order instead of the
    arbitrary order ``os.listdir`` yields.
    """
    names = [name for name in os.listdir(image_dir) if name.endswith(".png")]
    return sorted(names, key=lambda name: (_page_number(name), name))


def _load_model():
    """Load the tokenizer and the model (in eval mode) for ``model_name``."""
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    print("Loading model...")
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        use_safetensors=True,
    )
    return tokenizer, model.eval()


def _ocr_page(model, tokenizer, img_path, output_dir):
    """Run inference on one image and return the recognized text.

    Raises:
        TypeError: if ``model.infer`` does not return a string. The result is
            written to disk as text, so anything else is a page failure.
    """
    with torch.no_grad():
        res = model.infer(
            tokenizer,
            prompt=_PROMPT,
            image_file=img_path,
            output_path=output_dir,
            base_size=1024,
            image_size=768,
            crop_mode=False,
            eval_mode=True,
        )
    # Validate before any file is opened so a bad result does not leave an
    # empty per-page file behind.
    if not isinstance(res, str):
        raise TypeError(
            f"model.infer returned {type(res).__name__}, expected str"
        )
    return res


def ocr_full_document():
    """OCR every page image and write per-page and combined Markdown files.

    Reads ``.png`` files from ``doc_images_full`` and writes
    ``<image name>.md`` for each page plus ``full_document.md`` into
    ``ocr_results_pdf12`` (created if missing). A page whose OCR fails is
    reported on stdout and recorded as ``[OCR FAILED]`` in the combined
    document; processing continues with the next page.
    """
    tokenizer, model = _load_model()

    # Overwrite bfloat16 to float32 for CPU compatibility
    torch.bfloat16 = torch.float32

    image_dir = "doc_images_full"
    output_dir = "ocr_results_pdf12"
    os.makedirs(output_dir, exist_ok=True)

    images = _list_page_images(image_dir)
    total = len(images)

    full_markdown = []
    for index, img_name in enumerate(images, start=1):
        img_path = os.path.join(image_dir, img_name)
        page_num = _page_number(img_name)
        print(f"\n[{index}/{total}] Processing page {page_num}...")

        start_time = time.time()
        try:
            res = _ocr_page(model, tokenizer, img_path, output_dir)
            elapsed = time.time() - start_time
            print(f" Done in {elapsed:.2f}s")

            # Save individual page result. OCR text may contain non-ASCII
            # characters, so the encoding is pinned rather than left to the
            # platform default.
            page_file = os.path.join(output_dir, f"{img_name}.md")
            with open(page_file, "w", encoding="utf-8") as f:
                f.write(res)

            full_markdown.append(f"## Page {page_num}\n\n{res}\n\n---\n")
        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the run.
            print(f" Failed: {e}")
            full_markdown.append(f"## Page {page_num}\n\n[OCR FAILED]\n\n---\n")

    # Save combined result
    combined_file = os.path.join(output_dir, "full_document.md")
    with open(combined_file, "w", encoding="utf-8") as f:
        f.write("# OCR Result for pdf12_un.pdf\n\n")
        f.write("".join(full_markdown))

    print(f"\nCompleted! Full result saved to: {combined_file}")


if __name__ == "__main__":
    ocr_full_document()