Spaces:

Omnia-cy
/

Hackathon_GDG_Image_Model

Sleeping

App Files Files Community

Omnia-cy committed 6 days ago

Commit

2ed8a2b

verified ·

1 Parent(s): 1634971

Create app.py

Browse files

Files changed (1) hide show

app.py +85 -0

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+import requests
+from io import BytesIO
+from PIL import Image
+import numpy as np
+import cv2
+import pytesseract
+import torch
+from transformers import LlavaProcessor, LlavaForConditionalGeneration
+# ASGI application object; the /predict route below is registered on it.
+app = FastAPI()
+# =====================
+# LOAD MODEL
+# =====================
+# Checkpoint identifier handed to both from_pretrained() calls below.
+model_id = "llava-hf/llava-1.5-7b-hf"
+# The processor and model are created once, at import time, and are used as
+# module-level globals by get_caption().
+processor = LlavaProcessor.from_pretrained(model_id)
+model = LlavaForConditionalGeneration.from_pretrained(
+    model_id,
+    # Request half-precision (float16) weights.
+    torch_dtype=torch.float16,
+    # "auto" leaves the choice of device(s) for the weights to the library.
+    device_map="auto"
+)
+# =====================
+# REQUEST FORMAT
+# =====================
+# Request body model for the /predict route: a single string field carrying
+# the address of the image to analyse.
+class ImageRequest(BaseModel):
+    url: str  # handed unchanged to process_image() by the route handler
+# =====================
+# LOAD IMAGE
+# =====================
+def load_image_from_url(url, timeout=10):
+    """Download the image at *url* and return it as an RGB ``PIL.Image``.
+
+    Args:
+        url: Address of the image to fetch.
+        timeout: Seconds to wait for the remote server before giving up.
+            Without a timeout ``requests`` can block indefinitely on a
+            stalled connection.
+
+    Returns:
+        The downloaded image, converted to RGB mode.
+
+    Raises:
+        requests.RequestException: The URL is invalid, the connection
+            failed or timed out, or the server answered with a 4xx/5xx
+            status.
+        PIL.UnidentifiedImageError: The response body is not an image
+            Pillow can decode.
+    """
+    # NOTE(review): url comes straight from the request body, so this is a
+    # server-side fetch of a caller-chosen address (SSRF surface). Consider
+    # restricting hosts/schemes before exposing the service publicly.
+    response = requests.get(url, timeout=timeout)
+    # Fail on HTTP errors here instead of passing an error page to Pillow,
+    # which would otherwise surface as a misleading decode failure.
+    response.raise_for_status()
+    image = Image.open(BytesIO(response.content))
+    return image.convert("RGB")
+# =====================
+# OCR
+# =====================
+def preprocess(image):
+    """Return a grayscale ``PIL.Image`` version of *image* for OCR.
+
+    The image is turned into a NumPy array, converted with OpenCV's
+    RGB-to-gray colour conversion, and wrapped back into a PIL image.
+    """
+    grayscale = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
+    return Image.fromarray(grayscale)
+def extract_text_ocr(image):
+    """Run Tesseract on *image* and return the recognised text.
+
+    The image is first passed through ``preprocess`` (grayscale), and
+    leading/trailing whitespace is stripped from the OCR result.
+    """
+    grayscale = preprocess(image)
+    # Tesseract CLI flags: OCR engine mode 3, page segmentation mode 6.
+    raw_text = pytesseract.image_to_string(grayscale, config=r'--oem 3 --psm 6')
+    return raw_text.strip()
+# =====================
+# LLaVA
+# =====================
+def get_caption(image):
+    """Ask the LLaVA model to describe *image* and return only its answer.
+
+    Args:
+        image: The image to caption (a PIL image in this file's pipeline).
+
+    Returns:
+        The text generated by the model, at most 200 new tokens, with
+        special tokens removed and surrounding whitespace stripped. The
+        prompt itself is not part of the returned string.
+    """
+    prompt = "USER: <image>\nDescribe the image in detail and extract any visible text.\nASSISTANT:"
+    # Move the inputs to the model's device and cast the floating-point
+    # tensors (pixel values) to the model's dtype; the model is loaded in
+    # float16, while the processor produces float32 pixel values.
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, model.dtype)
+    prompt_length = inputs["input_ids"].shape[-1]
+    with torch.no_grad():
+        output = model.generate(**inputs, max_new_tokens=200)
+    # generate() returns the prompt tokens followed by the new tokens, so
+    # drop the prompt prefix; otherwise the caption would start with the
+    # "USER: ... ASSISTANT:" text.
+    generated_tokens = output[0][prompt_length:]
+    return processor.decode(generated_tokens, skip_special_tokens=True).strip()
+# =====================
+# MAIN PIPELINE
+# =====================
+def process_image(url):
+    """Fetch the image at *url*, run OCR and captioning, and merge the results.
+
+    Returns:
+        A dict with ``"type"`` set to ``"image"`` and ``"processed_text"``
+        holding the caption followed by a space and the OCR text.
+    """
+    source = load_image_from_url(url)
+    # Evaluated left to right: OCR first, then the caption.
+    ocr_text, caption = extract_text_ocr(source), get_caption(source)
+    combined = f"{caption} {ocr_text}"
+    return {"type": "image", "processed_text": combined}
+# =====================
+# API ROUTE
+# =====================
+@app.post("/predict")
+def predict(req: ImageRequest):
+    """Handle POST /predict: run the image pipeline on ``req.url``.
+
+    Returns:
+        The dict produced by ``process_image``.
+
+    Raises:
+        HTTPException: 400 when the image cannot be downloaded, 422 when
+            the downloaded payload is not a decodable image. Without this
+            mapping both cases reach the client as an opaque 500.
+    """
+    # Imported locally so the handler is self-contained and does not rely
+    # on changes to the file's import block.
+    from fastapi import HTTPException
+    from PIL import UnidentifiedImageError
+    try:
+        return process_image(req.url)
+    except requests.RequestException as err:
+        # Bad URL, network failure, timeout, or HTTP error status.
+        raise HTTPException(
+            status_code=400,
+            detail="Could not fetch an image from the supplied URL",
+        ) from err
+    except UnidentifiedImageError as err:
+        raise HTTPException(
+            status_code=422,
+            detail="The supplied URL did not return a decodable image",
+        ) from err