Menu_Extraction / app.py
neerajkalyank's picture
Update app.py
c42bcca verified
import io
import re
import tempfile

import gradio as gr
import pandas as pd
import pytesseract
from PIL import Image
# Uncomment and edit this path if you’re on Windows
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Column layout of the IBS Café menu sheet. The list order is the column order
# of the DataFrame (and therefore of the Excel sheet) built from the parsed rows.
# NOTE(review): "Priortiy" is spelled the same way in DEFAULTS; presumably it
# mirrors the target schema — confirm before correcting the spelling.
COLUMNS = [
    "Parent Category",
    "Category",
    "Store Item Name",
    "Item Code",
    "Master Item Name",
    "EAN Code",
    "price",
    "Active",
    "Priortiy",
    "Image",
    "food type",
    "NoOfMains",
    "onlineName",
    "Menu/MRP",
    "itemTaxInclusive",
    "taxPct",
    "brandName",
    "classificationCode",
    "HSN CODE",
]
# Baseline values for the columns that are not read from the menu text.
# build_row() copies this mapping and then fills in the per-item fields.
DEFAULTS = {
    # Left blank in every generated row.
    "Item Code": "",
    "Master Item Name": "",
    "EAN Code": "",
    # Fixed flag applied to every generated row.
    "Active": "Y",
    # Left blank in every generated row.
    "Priortiy": "",
    "Image": "",
    "food type": "",
    "NoOfMains": "",
    # Fixed tax / brand values applied to every generated row.
    "itemTaxInclusive": "Y",
    "taxPct": "5",
    "brandName": "Nescafe",
    # Left blank in every generated row.
    "classificationCode": "",
    "HSN CODE": "",
}
# Matches one price token: an optional "₹" or "Rs"/"Rs." prefix followed by a
# 2-4 digit number with an optional 1-2 digit decimal part. The lookarounds
# keep a match from starting right after a digit or ending right before one.
# Single-digit numbers never match. The pattern has no capturing groups, so
# findall() returns the full matched text, currency prefix included.
price_pattern = re.compile(r'(?<!\d)(?:₹|Rs\.?\s*)?\d{2,4}(?:\.\d{1,2})?(?!\d)')
# Labels appended to the item name when one line lists several "/"-separated
# prices; prices beyond these fall back to "Option N".
_SIZE_LABELS = ("Regular", "Large", "XL")

# Everything in front of the first digit of a matched price (currency prefix).
_NON_DIGIT_PREFIX = re.compile(r"^\D+")


def parse_menu_text(text):
    """Convert raw OCR text of a menu into a list of row dicts.

    Each non-empty line is classified as follows:

    * A line with no price that is ALL CAPS and at most four words long is a
      heading. The first heading seen becomes the parent category; every later
      heading replaces the current category.
    * A line with at least one price is an item. If the line contains "/" and
      more than one price, one row is produced per price and the item name is
      suffixed with "(Regular)", "(Large)", "(XL)", then "(Option N)".
      Otherwise a single row is produced from the first price.
    * Any other line is ignored.

    Args:
        text: OCR output. ``None`` or an empty string yields an empty list.

    Returns:
        A list of dicts as produced by ``build_row``. Prices are stored as the
        bare number text (currency prefix removed).
    """
    if not text:
        return []

    rows = []
    parent_category, category = "", ""

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        matches = price_pattern.findall(line)

        # Headings are only considered for lines WITHOUT a price:
        # str.isupper() ignores digits, so an upper-case item such as
        # "ESPRESSO 120" would otherwise be consumed as a category.
        if not matches:
            if line.isupper() and len(line.split()) <= 4:
                if not parent_category:
                    parent_category = line.title()
                else:
                    category = line.title()
            continue

        # Keep only the numeric part of each price ("Rs. 120" -> "120") so the
        # price columns do not carry a currency prefix.
        prices = [_NON_DIGIT_PREFIX.sub("", match) for match in matches]

        # Remove the prices from the line, collapse the gaps they leave, and
        # trim separators left at the ends (dashes, colons, the "/" between
        # prices, dot leaders).
        name = " ".join(price_pattern.sub("", line).split()).strip(" -:–/.")

        if "/" in line and len(prices) > 1:
            for index, price in enumerate(prices):
                if index < len(_SIZE_LABELS):
                    label = _SIZE_LABELS[index]
                else:
                    label = f"Option {index + 1}"
                rows.append(
                    build_row(parent_category, category, f"{name} ({label})", price)
                )
        else:
            rows.append(build_row(parent_category, category, name, prices[0]))

    return rows
def build_row(parent, category, name, price):
    """Assemble one output row for a menu item.

    Starts from a shallow copy of ``DEFAULTS`` and fills in the item-specific
    fields. The name is written to both "Store Item Name" and "onlineName",
    and the price to both "price" and "Menu/MRP".

    Returns:
        A new dict; ``DEFAULTS`` itself is not modified.
    """
    row = dict(DEFAULTS)
    row["Parent Category"] = parent
    row["Category"] = category
    row["Store Item Name"] = name
    row["price"] = price
    row["onlineName"] = name
    row["Menu/MRP"] = price
    return row
def ocr_and_extract(image):
    """Run OCR on an uploaded menu image and build the Excel and JSON outputs.

    Args:
        image: Path of the uploaded image file, or ``None`` when nothing has
            been uploaded.

    Returns:
        A 3-tuple ``(ocr_text, excel_path, json_payload)``:

        * ``ocr_text`` - the raw text returned by Tesseract, or a prompt
          message when ``image`` is ``None``.
        * ``excel_path`` - path of a temporary ``.xlsx`` file holding the
          parsed rows in a sheet named "Menu" (``None`` without an image).
        * ``json_payload`` - ``{"rows": [...], "needs_review": []}``
          (``None`` without an image).
    """
    if image is None:
        return "Please upload an image.", None, None

    # Release the image file handle as soon as OCR has finished.
    with Image.open(image) as img:
        text = pytesseract.image_to_string(img)

    rows = parse_menu_text(text)
    df = pd.DataFrame(rows, columns=COLUMNS)

    # The file output component is given a path rather than an in-memory
    # buffer, so the workbook is written to a named temporary file. The file
    # is created with delete=False so it still exists after this function
    # returns; it is closed before pandas reopens it by name.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="Menu")

    payload = {"rows": rows, "needs_review": []}
    return text, excel_path, payload
# Gradio UI: an image upload and an "Extract" button; the results are shown in
# three tabs (raw OCR text, downloadable workbook, structured JSON).
# NOTE(review): the nesting below is reconstructed from flattened source —
# confirm that the button is meant to sit in the same row as the upload.
with gr.Blocks(title="Menu → IBS Schema Extractor") as demo:
    gr.Markdown("## 🧾 Menu OCR → IBS Café Excel Generator")
    gr.Markdown("Upload a menu image and extract structured data in Excel (.xlsx) + JSON formats.")

    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Menu Image")
        extract_btn = gr.Button("Extract")

    with gr.Tab("Extracted Text"):
        text_output = gr.Textbox(label="OCR Text", lines=10)
    with gr.Tab("Excel Output"):
        excel_file = gr.File(label="Download Excel (.xlsx)")
    with gr.Tab("JSON Output"):
        json_output = gr.JSON(label="Structured JSON")

    # One click runs OCR + parsing and fills all three tabs.
    extract_btn.click(
        ocr_and_extract,
        inputs=[image_input],
        outputs=[text_output, excel_file, json_output],
    )

# Start the server only when this file is executed as a script, so importing
# the module (e.g. to reuse or test the parsing functions) has no side effects.
if __name__ == "__main__":
    demo.launch()