# Local_OCR_Demo / ocr_full_pdf12.py
# DocUA's picture
# Initial commit: DeepSeek-OCR-2 & MedGemma-1.5 multimodal analysis app with ZeroGPU support
# b752d16
from transformers import AutoModel, AutoTokenizer
import torch
import os
from PIL import Image
import time
# Checkpoint identifier handed to the Hugging Face loaders.
model_name = 'deepseek-ai/DeepSeek-OCR-2'

# Everything runs on the CPU (chosen for stability on Mac).
device = "cpu"
print(f"Using device: {device}")


def _cuda_redirect(self, *args, **kwargs):
    """Stand-in for ``.cuda()``: ignore the arguments and move ``self`` to ``device``."""
    return self.to(device)


# Custom model code may call ``.cuda()`` directly; route those calls to
# ``device`` so no CUDA call is made.
torch.Tensor.cuda = _cuda_redirect
torch.nn.Module.cuda = _cuda_redirect
def ocr_full_document(image_dir="doc_images_full", output_dir="ocr_results_pdf12"):
    """OCR every page image in a directory and write the results as Markdown.

    Loads the tokenizer and model named by the module-level ``model_name``,
    runs ``model.infer`` on each ``.png`` file in ``image_dir`` (ordered by the
    number following ``page_`` in the file name), writes one
    ``<image name>.md`` file per page into ``output_dir`` and finally writes
    ``full_document.md`` with all pages concatenated.

    A page whose OCR or save step raises is reported on stdout and recorded
    as ``[OCR FAILED]`` in the combined file; the remaining pages are still
    processed.

    Args:
        image_dir: Directory that holds the page images.
        output_dir: Directory that receives the Markdown files; created if
            it does not exist.

    Raises:
        OSError: If ``image_dir`` cannot be listed or the combined file
            cannot be written.

    Side effects:
        Rebinds ``torch.bfloat16`` to ``torch.float32`` for the rest of the
        process.
    """
    import re  # kept function-local, as in the original

    page_pattern = re.compile(r'page_(\d+)')

    def get_page_num(filename):
        """Return the number following ``page_`` in *filename*, or 0 if absent."""
        match = page_pattern.search(filename)
        return int(match.group(1)) if match else 0

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    print("Loading model...")
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        use_safetensors=True,
    )
    model = model.eval()

    # Overwrite bfloat16 to float32 for CPU compatibility.
    # NOTE(review): this is a process-wide rebinding; presumably the model's
    # remote code refers to torch.bfloat16 during inference -- confirm.
    torch.bfloat16 = torch.float32

    os.makedirs(output_dir, exist_ok=True)

    # Sort by page number; the file name breaks ties so that files without a
    # page number (all mapped to 0) come out in a deterministic order.
    images = sorted(
        (name for name in os.listdir(image_dir) if name.endswith(".png")),
        key=lambda name: (get_page_num(name), name),
    )

    prompt = "<image>\nFree OCR. "
    total = len(images)
    full_markdown = []

    for index, img_name in enumerate(images, start=1):
        img_path = os.path.join(image_dir, img_name)
        page_num = get_page_num(img_name)
        print(f"\n[{index}/{total}] Processing page {page_num}...")

        start_time = time.perf_counter()
        try:
            with torch.no_grad():
                res = model.infer(
                    tokenizer,
                    prompt=prompt,
                    image_file=img_path,
                    output_path=output_dir,
                    base_size=1024,
                    image_size=768,
                    crop_mode=False,
                    eval_mode=True,
                )

            # Reject a non-text result before opening the page file, so a
            # failed page does not leave an empty .md file behind.
            if not isinstance(res, str):
                raise TypeError(
                    f"model.infer returned {type(res).__name__}, expected str"
                )

            elapsed = time.perf_counter() - start_time
            print(f" Done in {elapsed:.2f}s")

            # Save individual page result. UTF-8 is explicit because OCR text
            # may contain characters the locale's default encoding cannot
            # represent.
            page_file = os.path.join(output_dir, f"{img_name}.md")
            with open(page_file, "w", encoding="utf-8") as f:
                f.write(res)

            full_markdown.append(f"## Page {page_num}\n\n{res}\n\n---\n")
        except Exception as e:
            # Per-page boundary: report the failure and keep going so one bad
            # page does not abort the whole document.
            print(f" Failed: {e}")
            full_markdown.append(f"## Page {page_num}\n\n[OCR FAILED]\n\n---\n")

    # Save combined result
    combined_file = os.path.join(output_dir, "full_document.md")
    with open(combined_file, "w", encoding="utf-8") as f:
        f.write("# OCR Result for pdf12_un.pdf\n\n")
        f.write("".join(full_markdown))

    print(f"\nCompleted! Full result saved to: {combined_file}")
# Run the OCR pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    ocr_full_document()