Spaces:
Sleeping
Sleeping
| from transformers import AutoModel, AutoTokenizer | |
| import torch | |
| import os | |
| from PIL import Image | |
| import time | |
# All computation is pinned to the CPU (chosen for stability on Mac).
device = "cpu"
print(f"Using device: {device}")

# Hugging Face model identifier used by the OCR routine below.
model_name = 'deepseek-ai/DeepSeek-OCR-2'

# The model's custom code may call ``.cuda()``; redirect those calls so that
# tensors and modules are moved to ``device`` instead. Any arguments given to
# ``.cuda()`` are accepted and ignored. ``device`` is looked up at call time.
setattr(torch.Tensor, "cuda", lambda obj, *_args, **_kwargs: obj.to(device))
setattr(torch.nn.Module, "cuda", lambda obj, *_args, **_kwargs: obj.to(device))
def ocr_full_document(image_dir="doc_images_full", output_dir="ocr_results_pdf12"):
    """OCR every ``.png`` page image in a directory and save Markdown results.

    Pages are processed in ascending page-number order, where the page number
    is taken from a ``page_<digits>`` fragment of the file name (0 when the
    fragment is absent); ties are broken by file name so the order is
    deterministic. For each page a ``<image name>.md`` file is written to
    ``output_dir``, and all pages are concatenated into ``full_document.md``.
    A page whose OCR or file write raises is reported on stdout and recorded
    as ``[OCR FAILED]`` in the combined document; remaining pages still run.

    Args:
        image_dir: Directory containing the page images.
        output_dir: Directory that receives the Markdown files; created if
            it does not exist.

    Returns:
        Path of the combined ``full_document.md`` file.

    Raises:
        FileNotFoundError: If ``image_dir`` does not exist. This is raised
            before the model is loaded.
    """
    import re

    page_pattern = re.compile(r'page_(\d+)')

    def get_page_num(filename):
        """Return the number following ``page_`` in *filename*, or 0."""
        match = page_pattern.search(filename)
        return int(match.group(1)) if match else 0

    # List the inputs first: a missing directory then fails immediately
    # instead of after the (slow) model load.
    images = sorted(
        (f for f in os.listdir(image_dir) if f.endswith(".png")),
        key=lambda name: (get_page_num(name), name),
    )
    os.makedirs(output_dir, exist_ok=True)

    if images:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        print("Loading model...")
        model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_safetensors=True
        )
        model = model.eval()
        # HACK: process-wide override so later references to torch.bfloat16
        # resolve to float32 (CPU compatibility). Kept after the model load,
        # exactly where the original applied it.
        torch.bfloat16 = torch.float32
    else:
        # Nothing to OCR: skip the expensive model load but still emit the
        # combined file below.
        print(f"No .png page images found in {image_dir!r}; skipping model load.")

    full_markdown = []
    total = len(images)
    prompt = "<image>\nFree OCR. "
    for index, img_name in enumerate(images, start=1):
        page_num = get_page_num(img_name)
        img_path = os.path.join(image_dir, img_name)
        print(f"\n[{index}/{total}] Processing page {page_num}...")
        start_time = time.time()
        # Deliberately broad: one bad page must not abort a long CPU run.
        try:
            with torch.no_grad():
                # ``infer`` comes from the model's remote code; with
                # eval_mode=True it is expected to return the recognized
                # text -- TODO confirm against the model repository.
                res = model.infer(
                    tokenizer,
                    prompt=prompt,
                    image_file=img_path,
                    output_path=output_dir,
                    base_size=1024,
                    image_size=768,
                    crop_mode=False,
                    eval_mode=True
                )
            elapsed = time.time() - start_time
            print(f" Done in {elapsed:.2f}s")
            # Save individual page result. Explicit UTF-8 so non-ASCII OCR
            # text does not depend on the platform's default encoding.
            page_file = os.path.join(output_dir, f"{img_name}.md")
            with open(page_file, "w", encoding="utf-8") as f:
                f.write(res)
            full_markdown.append(f"## Page {page_num}\n\n{res}\n\n---\n")
        except Exception as e:
            print(f" Failed: {e}")
            full_markdown.append(f"## Page {page_num}\n\n[OCR FAILED]\n\n---\n")

    # Save combined result
    combined_file = os.path.join(output_dir, "full_document.md")
    with open(combined_file, "w", encoding="utf-8") as f:
        f.write("# OCR Result for pdf12_un.pdf\n\n")
        f.write("".join(full_markdown))
    print(f"\nCompleted! Full result saved to: {combined_file}")
    return combined_file
# Script entry point: run the OCR pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    ocr_full_document()