import os from fastapi import Request from starlette.responses import JSONResponse import gradio as gr from transformers import AutoProcessor, AutoModelForVision2Seq from PIL import Image # Get the secret key from Hugging Face secrets API_KEY = os.environ.get("API_KEY") # Load model and processor processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") model = AutoModelForVision2Seq.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", device_map="auto") # Prompt for spine recognition prompt = "What is the album title and artist on this CD spine?" # Auth wrapper def with_auth(fn): async def wrapper(image: Image.Image, request: Request): if request.headers.get("x-api-key") != API_KEY: return JSONResponse({"error": "unauthorized"}, status_code=401) return fn(image) return wrapper # Inference function @with_auth def extract_text(image: Image.Image): inputs = processor(prompt=prompt, images=image, return_tensors="pt").to(model.device) generated_ids = model.generate(**inputs, max_new_tokens=128) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] return {"text": generated_text} # Gradio app iface = gr.Interface(fn=extract_text, inputs="image", outputs="json") iface.launch( server_name="0.0.0.0", server_port=7860, share=True, api_name="extract_text" )