"""Single-image OCR prediction from the command line.

Loads a processor and a model from a Hugging Face repo id (or a local
directory), runs one forward pass on a single image and prints the decoded
text to stdout.
"""

from __future__ import annotations

import argparse

import torch
from transformers import AutoModel, AutoProcessor
from transformers.image_utils import load_image


def main() -> None:
    """Parse CLI arguments, run the model on one image and print the text.

    Command-line arguments:
        --model-id: HF repo id or local model directory (required).
        --image: Path to the input image (required).
        --device: Torch device string the model and inputs are moved to
            (defaults to ``"cpu"``).

    Exits through ``parser.error`` (status 2) when a CUDA device is
    requested but CUDA is not available. Errors raised while loading the
    model or the image propagate unchanged.
    """
    parser = argparse.ArgumentParser(description="Single-image OCR prediction.")
    parser.add_argument(
        "--model-id",
        required=True,
        help="HF repo id or local model directory.",
    )
    parser.add_argument("--image", required=True, help="Path to input image.")
    parser.add_argument("--device", default="cpu", help="cpu or cuda")
    args = parser.parse_args()

    # Fail fast with a readable message instead of a low-level torch error
    # that would only surface after the model has already been loaded.
    if args.device.startswith("cuda") and not torch.cuda.is_available():
        parser.error("--device cuda requested but CUDA is not available.")

    # NOTE(security): trust_remote_code=True executes Python code shipped
    # with the model repository; only point --model-id at sources you trust.
    processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
    model = AutoModel.from_pretrained(args.model_id, trust_remote_code=True).to(
        args.device
    )
    model.eval()

    # Image processors expect decoded image data (PIL image / array / tensor),
    # not a filesystem path, so the file is decoded before preprocessing.
    image = load_image(args.image)
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(args.device)

    # Gradients are not needed for a single inference pass.
    with torch.no_grad():
        logits = model(pixel_values=pixel_values).logits

    # NOTE(review): the raw logits are handed to batch_decode as-is; this
    # presumably relies on the repo's processor decoding logits rather than
    # token ids -- confirm against the model's processor implementation.
    text = processor.batch_decode(logits)[0]
    print(text)


if __name__ == "__main__":
    main()