Spaces:

neerajkalyank
/

Menu_Extraction

Sleeping

App Files Files Community

Menu_Extraction / app.py

neerajkalyank

Update app.py

c42bcca verified 3 months ago

raw

history blame contribute delete

3.83 kB

	import io
	import re
	import tempfile

	import gradio as gr
	import pandas as pd
	import pytesseract
	from PIL import Image

	# Uncomment and edit this path if you’re on Windows
	# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

	# IBS Café schema: the column headers, listed in output order.
	# The header spellings are runtime strings and are kept exactly as given.
	COLUMNS = [
	    "Parent Category",
	    "Category",
	    "Store Item Name",
	    "Item Code",
	    "Master Item Name",
	    "EAN Code",
	    "price",
	    "Active",
	    "Priortiy",
	    "Image",
	    "food type",
	    "NoOfMains",
	    "onlineName",
	    "Menu/MRP",
	    "itemTaxInclusive",
	    "taxPct",
	    "brandName",
	    "classificationCode",
	    "HSN CODE",
	]

	# Values applied to every row for the columns that are not read from the
	# menu text. Blank strings leave the cell empty; key order is significant
	# because it becomes the key order of each generated row dict.
	DEFAULTS = {
	    "Item Code": "",
	    "Master Item Name": "",
	    "EAN Code": "",
	    "Active": "Y",  # preset flag
	    "Priortiy": "",
	    "Image": "",
	    "food type": "",
	    "NoOfMains": "",
	    "itemTaxInclusive": "Y",  # preset flag
	    "taxPct": "5",  # stored as text, not a number
	    "brandName": "Nescafe",
	    "classificationCode": "",
	    "HSN CODE": "",
	}

	# Matches one price token: an optional currency marker ("₹", "Rs" or "Rs.")
	# followed by a 2-4 digit amount with an optional 1-2 digit decimal part.
	# The look-arounds keep the amount from being cut out of a longer digit run.
	#
	# The amount is the only capturing group, so findall() yields bare numbers
	# while sub() removes the currency marker together with the amount.
	# The "|" between the markers must stay unescaped: written as "\|" it is a
	# literal pipe, and real "₹" / "Rs." markers are then never matched and get
	# left behind in the item name.
	price_pattern = re.compile(
	    r'(?<!\d)(?:(?:₹|Rs\.?)\s*)?(\d{2,4}(?:\.\d{1,2})?)(?!\d)'
	)

	def parse_menu_text(text):
	    """Convert raw menu text into a list of row dicts.

	    Each non-blank line is handled as follows:

	    * A line with no price that is entirely upper-case and at most four
	      words long is a heading. The first heading seen becomes the parent
	      category; every later heading replaces the current category.
	    * A line with at least one price becomes one row, or one row per price
	      when the line contains "/" and several prices (labelled Regular,
	      Large, XL, then "Option N").
	    * Any other line is ignored.

	    Args:
	        text: Multi-line string to parse.

	    Returns:
	        A list of dicts built by ``build_row``, in the order encountered.
	    """
	    size_labels = ("Regular", "Large", "XL")
	    rows = []
	    parent_category, category = "", ""

	    for raw_line in text.splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue

	        prices = price_pattern.findall(line)

	        # Only price-less lines can be headings: an all-caps item such as
	        # "ESPRESSO 120" must produce a row instead of being swallowed as
	        # a category.
	        if not prices:
	            if line.isupper() and len(line.split()) <= 4:
	                if not parent_category:
	                    parent_category = line.title()
	                else:
	                    category = line.title()
	            continue

	        # "/" is stripped along with the other separators so that
	        # "Latte 120/150" yields "Latte" rather than "Latte /".
	        name = price_pattern.sub('', line).strip(" -:–/")

	        if '/' in line and len(prices) > 1:
	            for i, price in enumerate(prices):
	                label = size_labels[i] if i < len(size_labels) else f"Option {i+1}"
	                rows.append(
	                    build_row(parent_category, category, f"{name} ({label})", price)
	                )
	        else:
	            # Single-price line (or several prices without "/"): use the first.
	            rows.append(build_row(parent_category, category, name, prices[0]))

	    return rows

	def build_row(parent, category, name, price):
	    """Assemble the dict for a single menu item.

	    The result is a fresh copy of ``DEFAULTS`` extended with the
	    item-specific fields. ``name`` fills both "Store Item Name" and
	    "onlineName"; ``price`` fills both "price" and "Menu/MRP".
	    """
	    return {
	        **DEFAULTS,
	        "Parent Category": parent,
	        "Category": category,
	        "Store Item Name": name,
	        "price": price,
	        "onlineName": name,
	        "Menu/MRP": price,
	    }

	def ocr_and_extract(image):
	    """Run OCR on an uploaded menu image and build the three UI outputs.

	    Args:
	        image: Path of the image file to read, or ``None`` when nothing
	            was uploaded.

	    Returns:
	        A ``(text, excel_path, json_output)`` tuple: the raw OCR text, the
	        path of a temporary ``.xlsx`` file containing the parsed rows, and
	        a dict with the rows plus a ``needs_review`` list (always empty
	        here). When ``image`` is ``None`` the tuple is
	        ``("Please upload an image.", None, None)``.
	    """
	    if image is None:
	        return "Please upload an image.", None, None

	    # Release the image file handle as soon as OCR has finished.
	    with Image.open(image) as img:
	        text = pytesseract.image_to_string(img)

	    rows = parse_menu_text(text)
	    df = pd.DataFrame(rows, columns=COLUMNS)

	    # The file-download component takes a path on disk, not an in-memory
	    # buffer, so the workbook is written to a named temporary file.
	    # The handle is closed before pandas reopens the path (needed on
	    # Windows) and delete=False keeps the file available for download.
	    with tempfile.NamedTemporaryFile(
	        prefix="menu_", suffix=".xlsx", delete=False
	    ) as tmp:
	        excel_path = tmp.name
	    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
	        df.to_excel(writer, index=False, sheet_name="Menu")

	    json_output = {"rows": rows, "needs_review": []}
	    return text, excel_path, json_output

	# Gradio UI
	# NOTE(review): the nesting below was reconstructed from a copy of the file
	# that had lost its indentation; confirm the Row/Tab grouping is as intended.
	with gr.Blocks(title="Menu → IBS Schema Extractor") as demo:
	    # Page heading and a one-line usage hint.
	    gr.Markdown("## 🧾 Menu OCR → IBS Café Excel Generator")
	    gr.Markdown("Upload a menu image and extract structured data in Excel (.xlsx) + JSON formats.")

	    # Input: the image is handed to the callback as a file path.
	    with gr.Row():
	        image_input = gr.Image(type="filepath", label="Upload Menu Image")

	    extract_btn = gr.Button("Extract")

	    # Outputs: one tab per value returned by ocr_and_extract.
	    with gr.Tab("Extracted Text"):
	        text_output = gr.Textbox(label="OCR Text", lines=10)

	    with gr.Tab("Excel Output"):
	        excel_file = gr.File(label="Download Excel (.xlsx)")

	    with gr.Tab("JSON Output"):
	        json_output = gr.JSON(label="Structured JSON")

	    # Wire the button: one input, three outputs in return-tuple order.
	    extract_btn.click(
	        fn=ocr_and_extract,
	        inputs=[image_input],
	        outputs=[text_output, excel_file, json_output],
	    )

	demo.launch()