Spaces:

arghyaxcodes
/

ocr-test

Runtime error

File size: 2,003 Bytes

44144f8
06cff9f
c00f569
44144f8
c00f569
44144f8
2534372
 
 
c00f569
44144f8
 
c00f569
 
 
 
859384c
ec9c6ea
c00f569
 
 
06cff9f
 
c00f569
 
 
06cff9f
44144f8
 
 
 
 
 
 
06cff9f
44144f8
 
06cff9f
44144f8
c00f569
 
44144f8
c00f569
 
 
 
 
44144f8
c00f569
44144f8
c00f569

import io

import torch
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# FastAPI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI()

# Model identifier handed to both from_pretrained() calls below, so the
# processor and the model always come from the same checkpoint.
checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct"
# Lower / upper pixel-count limits forwarded to the processor.
# NOTE(review): presumably these bound the per-image resize budget (the 28 * 28
# factor looks like a patch area) -- confirm against the Qwen2.5-VL docs.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
# Both objects are built at module level, i.e. when this module is imported,
# and are then shared by the request handlers.
processor = AutoProcessor.from_pretrained(
    checkpoint,
    min_pixels=min_pixels,
    max_pixels=max_pixels
)
# The model is requested in bfloat16 and device placement is delegated to
# device_map="auto".
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

@app.get("/")
def read_root():
    """Liveness route: confirm the API is up and point callers at /predict."""
    payload = dict(message="API is live. Use the /predict endpoint.")
    return payload

@app.post("/predict")
async def predict(file: UploadFile = File(...), prompt: str = Form(...)):
    """Run the vision-language model on an uploaded image and return its text.

    Args:
        file: Image uploaded as the multipart form field ``file``.
        prompt: Instruction text placed next to the image in the user turn.

    Returns:
        A dict ``{"response": <text>}`` holding the decoded generation for the
        single image/prompt pair (at most 128 newly generated tokens).

    Raises:
        HTTPException: status 400 when the upload is empty or cannot be decoded
            as an image.
    """
    image_bytes = await file.read()
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    # Decoding untrusted bytes can fail in several ways (unknown format,
    # truncated data, oversized image). Report those as a client error instead
    # of letting them surface as an opaque 500.
    try:
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    except (OSError, ValueError, Image.DecompressionBombError) as exc:
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a valid image.",
        ) from exc

    # Compose vision-text messages: a fixed system instruction plus one user
    # turn carrying the image and the caller's prompt.
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant with vision abilities. "
                "You are the best OCR reader your task is to do OCR analysis "
                "of the given image and return the OCR data"
            ),
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # NOTE: generate() is a blocking call inside an async handler, so requests
    # are served one at a time. That is kept on purpose here; offloading to a
    # thread pool would allow concurrent generations on the shared model.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # generate() returns prompt + continuation; drop the prompt tokens so only
    # the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return {"response": output_texts[0]}