Spaces:
Sleeping
Sleeping
| import re | |
| import pandas as pd | |
| import pytesseract | |
| from PIL import Image | |
| import gradio as gr | |
| import io | |
| # Uncomment and edit this path if you’re on Windows | |
| # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" | |
# Column order of the IBS Café sheet; the DataFrame built from the parsed
# rows is laid out in exactly this order.
# NOTE(review): "Priortiy" looks misspelled; presumably it mirrors the header
# of the target template -- confirm before renaming.
COLUMNS = [
    "Parent Category",
    "Category",
    "Store Item Name",
    "Item Code",
    "Master Item Name",
    "EAN Code",
    "price",
    "Active",
    "Priortiy",
    "Image",
    "food type",
    "NoOfMains",
    "onlineName",
    "Menu/MRP",
    "itemTaxInclusive",
    "taxPct",
    "brandName",
    "classificationCode",
    "HSN CODE",
]

# Values applied to every row for the columns the parser does not fill in.
DEFAULTS = {
    "Item Code": "",
    "Master Item Name": "",
    "EAN Code": "",
    "Active": "Y",
    "Priortiy": "",
    "Image": "",
    "food type": "",
    "NoOfMains": "",
    "itemTaxInclusive": "Y",
    "taxPct": "5",
    "brandName": "Nescafe",
    "classificationCode": "",
    "HSN CODE": "",
}

# Matches a price token: an optional "₹" / "Rs" / "Rs." marker, then 2-4
# digits with an optional 1-2 digit decimal part. The look-arounds reject
# tokens that are a slice of a longer digit run.
price_pattern = re.compile(
    r'(?<!\d)(?:₹|Rs\.?\s*)?\d{2,4}(?:\.\d{1,2})?(?!\d)'
)
def parse_menu_text(text):
    """Turn raw OCR text of a menu into a list of schema row dicts.

    The text is processed line by line (blank lines are ignored):

    * A short ALL-CAPS line (at most four words) that contains no price is a
      heading. The first heading seen becomes the parent category; every
      later heading replaces the current category.
    * A line containing at least one price becomes one row, or one row per
      price when the line contains "/" and several prices (sizes are
      labelled Regular, Large, XL, then "Option N").
    * Any other line is ignored.

    Args:
        text: Multi-line string, typically the OCR output for a menu image.

    Returns:
        A list of dicts produced by ``build_row``, in the order the items
        appear in ``text``. Prices are numeric strings without any currency
        marker.
    """
    size_labels = ("Regular", "Large", "XL")
    rows = []
    parent_category, category = "", ""

    for line in (raw.strip() for raw in text.splitlines()):
        if not line:
            continue

        matches = price_pattern.findall(line)

        # Only price-less lines may be headings; otherwise an all-caps item
        # such as "COLD COFFEE 120" would be swallowed as a category and its
        # row silently dropped.
        if not matches:
            if line.isupper() and len(line.split()) <= 4:
                if not parent_category:
                    parent_category = line.title()
                else:
                    category = line.title()
            continue

        # findall() yields the whole match, currency marker included
        # ("Rs. 120", "₹120"); keep only the numeric part for the sheet.
        prices = [re.sub(r"^\D+", "", match) for match in matches]

        # "/" is part of the strip set so the separator left behind by
        # "Latte 120/150" does not end up in the item name.
        name = price_pattern.sub("", line).strip(" -:–/")

        if "/" in line and len(prices) > 1:
            for i, price in enumerate(prices):
                label = size_labels[i] if i < len(size_labels) else f"Option {i + 1}"
                rows.append(
                    build_row(parent_category, category, f"{name} ({label})", price)
                )
        else:
            rows.append(build_row(parent_category, category, name, prices[0]))

    return rows
def build_row(parent, category, name, price):
    """Assemble one sheet row as a dict.

    Starts from a copy of ``DEFAULTS`` and adds the item-specific fields.
    The item name is written to both "Store Item Name" and "onlineName";
    the price is written to both "price" and "Menu/MRP".

    Args:
        parent: Value for the "Parent Category" column.
        category: Value for the "Category" column.
        name: Item name.
        price: Item price.

    Returns:
        A new dict; ``DEFAULTS`` itself is not modified.
    """
    return {
        **DEFAULTS,
        "Parent Category": parent,
        "Category": category,
        "Store Item Name": name,
        "price": price,
        "onlineName": name,
        "Menu/MRP": price,
    }
def ocr_and_extract(image):
    """Run OCR on a menu image and produce the text, Excel and JSON outputs.

    Args:
        image: Path of the image file to read, or ``None`` when nothing was
            uploaded.

    Returns:
        A 3-tuple ``(text, excel_path, json_output)``:

        * ``text`` -- the raw OCR text, or a prompt message when ``image``
          is ``None``.
        * ``excel_path`` -- path of a temporary ``.xlsx`` file holding the
          parsed rows on a sheet named "Menu" (``None`` when no image).
        * ``json_output`` -- ``{"rows": [...], "needs_review": []}``
          (``None`` when no image).
    """
    # Local import keeps this function self-contained.
    import tempfile

    if image is None:
        return "Please upload an image.", None, None

    # Context manager closes the underlying image file once OCR is done.
    with Image.open(image) as img:
        text = pytesseract.image_to_string(img)

    rows = parse_menu_text(text)
    df = pd.DataFrame(rows, columns=COLUMNS)

    # A Gradio File output is populated from a file path, not an in-memory
    # buffer, so the workbook is written to a real temporary file. The handle
    # is closed before pandas reopens the path (required on Windows).
    with tempfile.NamedTemporaryFile(
        prefix="menu_", suffix=".xlsx", delete=False
    ) as tmp:
        excel_path = tmp.name
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name="Menu")

    json_output = {"rows": rows, "needs_review": []}
    return text, excel_path, json_output
# Gradio UI
# Declares the page: an image upload plus an "Extract" button, and three
# tabs showing the OCR text, the Excel download and the JSON payload.
# NOTE(review): indentation was lost in this copy of the file; the nesting
# below (which components sit inside the Row, and that the click handler is
# registered inside the Blocks context) is reconstructed -- confirm against
# the deployed app.
with gr.Blocks(title="Menu → IBS Schema Extractor") as demo:
    gr.Markdown("## 🧾 Menu OCR → IBS Café Excel Generator")
    gr.Markdown("Upload a menu image and extract structured data in Excel (.xlsx) + JSON formats.")
    with gr.Row():
        # type="filepath" means the handler receives a path string.
        image_input = gr.Image(type="filepath", label="Upload Menu Image")
        extract_btn = gr.Button("Extract")
    with gr.Tab("Extracted Text"):
        text_output = gr.Textbox(label="OCR Text", lines=10)
    with gr.Tab("Excel Output"):
        excel_file = gr.File(label="Download Excel (.xlsx)")
    with gr.Tab("JSON Output"):
        json_output = gr.JSON(label="Structured JSON")
    # The three outputs receive, in order, the three values returned by
    # ocr_and_extract.
    extract_btn.click(
        ocr_and_extract,
        inputs=[image_input],
        outputs=[text_output, excel_file, json_output]
    )
# Starts the app as soon as this module is executed.
demo.launch()