"""FastAPI service that extracts handwritten text from an image fetched by URL.

The Llama 3.2 Vision checkpoint is loaded onto the GPU once at import time
and reused by every request to ``POST /extract_text``.
"""

from io import BytesIO

import requests
import torch
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from PIL import Image
from pydantic import BaseModel
from transformers import AutoProcessor, MllamaForConditionalGeneration

app = FastAPI()

# Initialize model and processor
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16
).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

# Instruction sent to the model together with the image.
EXTRACTION_PROMPT = (
    "Extract handwritten text from the image and output only the extracted "
    "text without any additional description or commentary in output"
)

# `requests` applies no timeout unless one is given, so a stalled remote host
# would otherwise hang the request forever.
DOWNLOAD_TIMEOUT_SECONDS = 15

# Upper bound on the number of tokens generated per image.
MAX_NEW_TOKENS = 250


class ImageRequest(BaseModel):
    """Request body for ``/extract_text``."""

    # Despite the name, this value is passed to `requests.get`, i.e. it is
    # treated as a URL.
    image_path: str


def _fetch_image(url: str) -> Image.Image:
    """Download *url* and return its content as an RGB PIL image.

    Raises:
        HTTPException: status 400 when the download fails, the server does
            not answer with HTTP 200, or the payload is not a readable image.
    """
    # NOTE(security): the URL is caller-supplied, so this will fetch any
    # address reachable from the server. Consider an allow-list if the
    # service is exposed to untrusted clients.
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=400, detail="Failed to fetch image from URL"
        ) from exc

    if response.status_code != 200:
        raise HTTPException(status_code=400, detail="Failed to fetch image from URL")

    try:
        return Image.open(BytesIO(response.content)).convert("RGB")
    except OSError as exc:
        # Pillow reports unidentifiable or truncated image data as OSError.
        raise HTTPException(
            status_code=400, detail="URL did not return a readable image"
        ) from exc


def _generate_text(image: Image.Image) -> str:
    """Run the vision model on *image* and return only the generated text."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": EXTRACTION_PROMPT},
                {"type": "image"},
            ],
        }
    ]

    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # `generate` returns the prompt tokens followed by the new tokens. Slice
    # the prompt off before decoding instead of stripping "user"/"assistant"
    # from the decoded string, which would also delete those words from the
    # text that was actually extracted.
    prompt_length = inputs["input_ids"].shape[-1]
    return processor.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()


@app.post("/extract_text")
async def extract_text(request: ImageRequest):
    """Extract handwritten text from the image at ``request.image_path``.

    Returns:
        ``{"text": "\\n<extracted text>\\n"}``.

    Raises:
        HTTPException: status 400 when the image cannot be fetched or
            decoded; status 500 for any other failure, with the exception
            message as the detail.
    """
    try:
        # The download is blocking I/O; run it in the threadpool so the
        # event loop keeps serving other requests meanwhile.
        image = await run_in_threadpool(_fetch_image, request.image_path)
        # Inference is intentionally left inline: it keeps generations
        # serialized on the single GPU model, as in the original handler.
        result = _generate_text(image)
    except HTTPException:
        # Keep deliberate 4xx responses instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"text": f"\n{result}\n"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)