Spaces:
Sleeping
Sleeping
File size: 3,834 Bytes
f372bee c42bcca f372bee c42bcca f372bee c42bcca f372bee c42bcca f372bee c42bcca f372bee c42bcca f372bee c42bcca f372bee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import io
import re
import tempfile

import gradio as gr
import pandas as pd
import pytesseract
from PIL import Image
# Uncomment and edit this path if you’re on Windows
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# IBS Café schema columns, listed in the order used for the output sheet.
COLUMNS = [
    "Parent Category",
    "Category",
    "Store Item Name",
    "Item Code",
    "Master Item Name",
    "EAN Code",
    "price",
    "Active",
    "Priortiy",
    "Image",
    "food type",
    "NoOfMains",
    "onlineName",
    "Menu/MRP",
    "itemTaxInclusive",
    "taxPct",
    "brandName",
    "classificationCode",
    "HSN CODE",
]

# Fixed values copied into every row; the per-item fields are layered on top.
DEFAULTS = {
    "Item Code": "",
    "Master Item Name": "",
    "EAN Code": "",
    "Active": "Y",
    "Priortiy": "",
    "Image": "",
    "food type": "",
    "NoOfMains": "",
    "itemTaxInclusive": "Y",
    "taxPct": "5",
    "brandName": "Nescafe",
    "classificationCode": "",
    "HSN CODE": "",
}

# A price is a 2-4 digit number with optional 1-2 decimals and an optional
# "₹" / "Rs" / "Rs." prefix. The lookarounds keep it from matching inside a
# longer run of digits.
price_pattern = re.compile(
    r'(?<!\d)(?:₹|Rs\.?\s*)?\d{2,4}(?:\.\d{1,2})?(?!\d)'
)
# Labels given, in order, to the prices of a "120/150/180"-style item line.
_SIZE_LABELS = ("Regular", "Large", "XL")


def parse_menu_text(text):
    """Convert raw OCR menu text into a list of schema row dicts.

    Blank lines are ignored. Each remaining line is handled as follows:

    * A line with no price that is all upper-case and at most four words
      long is a heading. The first heading seen becomes the parent
      category; every later heading replaces the current category.
    * A line containing at least one price is an item. If the line
      contains a "/" and more than one price, one row is produced per
      price, suffixed "(Regular)", "(Large)", "(XL)" and then
      "(Option N)". Otherwise a single row is produced from the first
      price.
    * Any other line is skipped.

    Args:
        text: Multi-line string, typically the OCR output for a menu.

    Returns:
        A list of dicts as produced by ``build_row``, in input order.
    """
    rows = []
    parent_category, category = "", ""

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        prices = price_pattern.findall(line)

        if not prices:
            # Only price-less lines can be headings; an all-caps item such
            # as "ESPRESSO 90" must stay an item rather than be swallowed
            # as a category.
            if line.isupper() and len(line.split()) <= 4:
                if not parent_category:
                    parent_category = line.title()
                else:
                    category = line.title()
            continue

        # The pattern's match includes any "₹" / "Rs." marker; the price
        # columns should carry only the numeric amount.
        amounts = [found.lstrip("₹Rs.").strip() for found in prices]

        # Remove the prices, then trim separators left behind. "/" is
        # included so "Latte 120/150" yields "Latte", not "Latte /".
        name = price_pattern.sub("", line).strip(" -:–/")

        if "/" in line and len(amounts) > 1:
            for index, amount in enumerate(amounts):
                if index < len(_SIZE_LABELS):
                    label = _SIZE_LABELS[index]
                else:
                    label = f"Option {index + 1}"
                rows.append(
                    build_row(parent_category, category, f"{name} ({label})", amount)
                )
        else:
            rows.append(build_row(parent_category, category, name, amounts[0]))

    return rows
def build_row(parent, category, name, price):
    """Assemble one schema row for a single menu item.

    Starts from a copy of ``DEFAULTS`` and adds the item-specific fields.
    ``name`` is written to both "Store Item Name" and "onlineName";
    ``price`` is written to both "price" and "Menu/MRP".

    Args:
        parent: Value for the "Parent Category" column.
        category: Value for the "Category" column.
        name: Item name.
        price: Item price.

    Returns:
        A new dict; ``DEFAULTS`` itself is not modified.
    """
    return {
        **DEFAULTS,
        "Parent Category": parent,
        "Category": category,
        "Store Item Name": name,
        "price": price,
        "onlineName": name,
        "Menu/MRP": price,
    }
def ocr_and_extract(image):
    """Run OCR on a menu image and produce text, Excel and JSON outputs.

    Args:
        image: Path of the uploaded image file, or ``None`` when nothing
            has been uploaded.

    Returns:
        A 3-tuple ``(text, excel_path, json_output)``:

        * ``text`` - the raw OCR text, or a prompt message when ``image``
          is ``None``.
        * ``excel_path`` - path of a temporary ``.xlsx`` file holding the
          parsed rows in ``COLUMNS`` order on a sheet named "Menu"
          (``None`` when ``image`` is ``None``).
        * ``json_output`` - ``{"rows": [...], "needs_review": []}``
          (``None`` when ``image`` is ``None``).
    """
    if image is None:
        return "Please upload an image.", None, None

    # Use the image as a context manager so its file handle is released
    # as soon as OCR has finished.
    with Image.open(image) as img:
        text = pytesseract.image_to_string(img)

    rows = parse_menu_text(text)
    df = pd.DataFrame(rows, columns=COLUMNS)

    # The file output component is given a filesystem path, not an
    # in-memory buffer, so the workbook is written to a temporary file.
    # The handle is closed before writing so the path can be reopened on
    # every platform; delete=False keeps the file for download.
    with tempfile.NamedTemporaryFile(
        prefix="menu_", suffix=".xlsx", delete=False
    ) as tmp:
        excel_path = tmp.name
    df.to_excel(excel_path, index=False, sheet_name="Menu", engine="openpyxl")

    json_output = {"rows": rows, "needs_review": []}
    return text, excel_path, json_output
# Gradio UI: one image input and an "Extract" button feeding three output
# tabs (OCR text, Excel download, JSON).
# NOTE(review): the original indentation was lost; the button is assumed to
# sit in the same row as the image input - confirm the intended layout.
with gr.Blocks(title="Menu → IBS Schema Extractor") as demo:
    gr.Markdown("## 🧾 Menu OCR → IBS Café Excel Generator")
    gr.Markdown("Upload a menu image and extract structured data in Excel (.xlsx) + JSON formats.")

    with gr.Row():
        # type="filepath" means the callback receives the upload as a path.
        image_input = gr.Image(type="filepath", label="Upload Menu Image")
        extract_btn = gr.Button("Extract")

    with gr.Tab("Extracted Text"):
        text_output = gr.Textbox(label="OCR Text", lines=10)
    with gr.Tab("Excel Output"):
        excel_file = gr.File(label="Download Excel (.xlsx)")
    with gr.Tab("JSON Output"):
        json_output = gr.JSON(label="Structured JSON")

    # The three outputs line up with the 3-tuple returned by ocr_and_extract.
    extract_btn.click(
        ocr_and_extract,
        inputs=[image_input],
        outputs=[text_output, excel_file, json_output],
    )

# Launch only when executed as a script so that importing this module
# (e.g. to reuse the parser) does not start a web server.
if __name__ == "__main__":
    demo.launch()
|