import gradio as gr
import easyocr
import google.generativeai as genai
import os, json, re
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
# =========================
# Load API Key
# =========================
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# =========================
# Initialize Models
# =========================
reader = easyocr.Reader(['en'])
model = genai.GenerativeModel("gemini-3-flash-preview")
# =========================
# Image Preprocessing
# =========================
def preprocess_image(image_path):
    """Load an image and prepare it for OCR.

    The image is converted to grayscale (mode ``"L"``), its contrast is
    enhanced by a factor of 2, and a sharpening filter is applied.

    Args:
        image_path: Location of the image file, passed to ``Image.open``.

    Returns:
        The processed PIL image.
    """
    # Open inside a context manager so the underlying file handle is released
    # as soon as the pixel data has been copied out by ``convert``.
    with Image.open(image_path) as source:
        grayscale = source.convert("L")
    boosted = ImageEnhance.Contrast(grayscale).enhance(2)
    return boosted.filter(ImageFilter.SHARPEN)
# =========================
# OCR Extraction
# =========================
def extract_text(image, max_chars=4000):
    """Run OCR on an image and return the recognised text as one cleaned string.

    Args:
        image: A NumPy array, or any object ``np.array`` can convert into one
            (for example a PIL image).
        max_chars: Maximum number of characters to return. The text is
            truncated to keep the downstream prompt small (cost/speed).

    Returns:
        The OCR fragments joined together, with runs of two or more dots
        replaced by a space and all whitespace runs collapsed to a single
        space, truncated to ``max_chars`` characters.
    """
    if not isinstance(image, np.ndarray):
        # The OCR reader is given an array, so convert other image types first.
        image = np.array(image)
    fragments = reader.readtext(image, detail=0)
    text = "\n".join(fragments)
    # Clean text: dot leaders ("Soup ..... 120") become a single space.
    text = re.sub(r"\.{2,}", " ", text)
    # Collapse newlines/tabs/multiple spaces into one space.
    text = re.sub(r"\s+", " ", text)
    return text[:max_chars]
# =========================
# Prompt Builder
# =========================
def build_prompt(text):
    """Build the instruction prompt that asks the model for menu JSON.

    Args:
        text: The menu text to embed at the end of the prompt.

    Returns:
        A prompt string containing the expected JSON layout, the conversion
        rules, and ``text`` under the ``MENU TEXT:`` heading. Doubled braces
        in the template render as literal braces in the result.
    """
    return f"""
You are a strict JSON generator.
Convert the menu text into VALID JSON.
FORMAT:
{{
"categories": [
{{
"name": "Category Name",
"items": [
{{
"name": "Item Name",
"sizes": {{
"Regular": 100
}}
}}
]
}}
]
}}
RULES:
- Output ONLY JSON (no explanation)
- Detect categories properly
- Each item MUST have "name" and "sizes"
- If 1 price → "Regular"
- If 2 prices → infer labels (Half/Full, Plain/Butter, Gravy/Dry)
- Fix OCR mistakes intelligently
MENU TEXT:
{text}
"""
# =========================
# Extract JSON safely
# =========================
def extract_json(response_text):
    """Parse the JSON object embedded in a model response.

    The span from the first ``{`` to the last ``}`` is decoded, so
    surrounding prose or Markdown code fences are ignored.

    Args:
        response_text: Raw response text; may be ``None`` or a non-string.

    Returns:
        The decoded JSON value, or ``None`` when the input is not a string,
        contains no brace-delimited span, or the span is not valid JSON.
    """
    if not isinstance(response_text, str):
        return None
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        return json.loads(response_text[start:end + 1])
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input. Anything else is a real bug and
        # should propagate rather than be silently swallowed.
        return None
# =========================
# Validate Output
# =========================
def validate_schema(data):
    """Check that parsed menu data has the expected category/item structure.

    Args:
        data: The decoded JSON value to check (any type, including ``None``).

    Returns:
        ``True`` when ``data`` is a dict whose ``"categories"`` is a list of
        dicts, each with a ``"name"`` key and an ``"items"`` list of dicts
        that each have ``"name"`` and ``"sizes"`` keys. ``False`` otherwise;
        malformed containers never raise.
    """
    if not isinstance(data, dict):
        return False
    categories = data.get("categories")
    if not isinstance(categories, list):
        return False
    for category in categories:
        # Require real mappings: ``"name" in some_string`` would be a
        # substring test and could accept malformed entries.
        if not isinstance(category, dict) or "name" not in category:
            return False
        items = category.get("items")
        if not isinstance(items, list):
            return False
        for item in items:
            if not isinstance(item, dict):
                return False
            if "name" not in item or "sizes" not in item:
                return False
    return True
# =========================
# Gemini Call with Retry
# =========================
def generate_json(text, max_attempts=3):
    """Ask the model to convert menu text to JSON, retrying on failure.

    Args:
        text: Menu text to embed in the prompt.
        max_attempts: How many times to call the model before giving up.

    Returns:
        The first parsed response that passes ``validate_schema``, or
        ``{"error": "Failed to generate valid JSON"}`` when every attempt
        either raised or produced output that did not validate.
    """
    # The prompt does not change between attempts, so build it once.
    prompt = build_prompt(text)
    for _ in range(max_attempts):
        try:
            response = model.generate_content(prompt)
            parsed = extract_json(response.text)
            if validate_schema(parsed):
                return parsed
        except Exception as e:
            # Best-effort retry loop: report the failure and try again.
            print("Gemini Error:", e)
    return {"error": "Failed to generate valid JSON"}
# =========================
# Main Pipeline
# =========================
def process(image_path):
    """Run the full image -> OCR -> model pipeline and return JSON text.

    Args:
        image_path: Path of the uploaded image.

    Returns:
        A JSON string indented by 2 spaces: the result of ``generate_json``
        on success, or ``{"error": <message>}`` if any step raises.
    """
    try:
        ocr_text = extract_text(preprocess_image(image_path))
        # Debug log: show only the start of the OCR output.
        print("OCR TEXT:", ocr_text[:300])
        return json.dumps(generate_json(ocr_text), indent=2)
    except Exception as exc:
        failure = {"error": str(exc)}
        return json.dumps(failure, indent=2)
# =========================
# Gradio UI
# =========================
# Web UI: the uploaded image reaches `process` as a file path, and the JSON
# string it returns is rendered in a JSON code viewer.
app = gr.Interface(
    fn=process,
    inputs=gr.Image(type="filepath"),
    outputs=gr.Code(language="json"),
    title="🍽️ Menu Image → JSON (Production Ready)",
    description="Upload a restaurant menu image and get structured JSON using OCR + Gemini",
)
# Queue for stability
app.queue(max_size=10)
app.launch()