# Local_OCR_Demo / test_inference.py
# Author: DocUA — initial commit (b752d16):
# DeepSeek-OCR-2 & MedGemma-1.5 multimodal analysis app with ZeroGPU support
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
import os
from PIL import Image, ImageOps
import math
# Force everything onto the CPU in single precision. The model's remote code
# assumes CUDA + bfloat16 is available, so we patch torch so those code paths
# degrade gracefully to CPU/float32 instead of erroring out.
device = "cpu"
dtype = torch.float32
print(f"Forcing device: {device} with dtype: {dtype}")

# Alias bfloat16 to float32 so any dtype casts in the remote code stay fp32.
torch.bfloat16 = torch.float32


def _stay_on_cpu(self, *args, **kwargs):
    """Replacement for .cuda() that keeps tensors/modules on the CPU."""
    return self.to("cpu")


# Redirect .cuda() calls on both tensors and modules to CPU moves.
torch.Tensor.cuda = _stay_on_cpu
torch.nn.Module.cuda = _stay_on_cpu

model_name = 'deepseek-ai/DeepSeek-OCR-2'
def test_inference():
    """Smoke-test DeepSeek-OCR-2 OCR on the CPU using a local sample image.

    Loads the tokenizer and model (trusting the repo's remote code, forced
    to float32), then runs the model's ``infer`` helper on
    ``sample_test.png``, writing artifacts to ``outputs/`` and printing the
    recognized text. Failures are reported with a full traceback rather
    than crashing the script.
    """
    print(f"Loading tokenizer for {model_name}...")
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    print(f"Loading model for {model_name}...")
    ocr_model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        use_safetensors=True,
        torch_dtype=torch.float32,  # Explicitly float32
    ).eval()  # Already on CPU by default if no device_map

    out_dir = 'outputs'
    os.makedirs(out_dir, exist_ok=True)

    img_path = 'sample_test.png'
    # Bail out early when the sample image is missing.
    if not os.path.exists(img_path):
        print(f"Error: {img_path} not found.")
        return

    # Inference settings expected by the model's custom `infer` helper.
    infer_kwargs = dict(
        prompt="<image>\nFree OCR. ",
        image_file=img_path,
        output_path=out_dir,
        base_size=512,
        image_size=384,
        crop_mode=False,
        eval_mode=True,
    )

    print("Running inference on CPU...")
    try:
        with torch.no_grad():
            res = ocr_model.infer(tok, **infer_kwargs)
        print("\n--- OCR Result ---")
        print(res)
        print("------------------")
    except Exception as e:
        # Best-effort smoke test: report the failure and keep going.
        print(f"Inference failed: {e}")
        import traceback
        traceback.print_exc()
# Script entry point: run the CPU OCR smoke test when executed directly.
if __name__ == "__main__":
    test_inference()