Spaces:

viswanani
/

Zaravya

Sleeping

App Files Files Community

viswanani committed on Oct 12, 2025

Commit

a1fd711

verified ·

1 Parent(s): 325160c

Update app.py

Browse files

Files changed (1) hide show

app.py +361 -69

app.py CHANGED Viewed

@@ -1,80 +1,372 @@
 import gradio as gr
-import pytesseract
-from PIL import Image, ImageOps, ImageFilter
 import pandas as pd
 import re
-import os
-import zipfile
 import tempfile
-import uuid
-PRICE_PATTERN = re.compile(r'(?<!\d)(?:₹\s*|Rs\.?\s*|INR\s*)?\d+(?:\.\d{1,2})?(?!\d)')
-CLEAN_PRICE = re.compile(r'[^0-9.]')
-def preprocess_image(img: Image.Image) -> Image.Image:
-    gray = ImageOps.grayscale(img)
-    enhanced = ImageOps.autocontrast(gray)
-    denoised = enhanced.filter(ImageFilter.MedianFilter(size=3))
-    sharpened = denoised.filter(ImageFilter.UnsharpMask(radius=1.5, percent=150, threshold=3))
-    return sharpened
-def simple_parse_lines(text: str):
-    rows = []
-    current_category = None
-    lines = [l.strip() for l in text.splitlines() if l.strip()]
-    for line in lines:
-        if (line.isupper() and len(line.split()) <= 6) or line.endswith(':'):
-            current_category = line.rstrip(':').strip()
             continue
-        price_match = PRICE_PATTERN.search(line)
-        if price_match:
-            price_text = price_match.group(0)
-            price_value = CLEAN_PRICE.sub('', price_text)
-            item = line[:price_match.start()].strip(" -:•\t")
-            item = re.sub(r'\s{2,}', ' ', item)
-            if item:
-                rows.append({
-                    "Item": item,
-                    "Price": price_value,
-                    "Category": current_category if current_category else ""
-                })
-    return rows
-def process_images_to_zip(files):
-    work_dir = tempfile.mkdtemp(prefix="menu_excel_")
-    output_files = []
-    for idx, file_path in enumerate(files, start=1):
-        image = Image.open(file_path).convert("RGB")
-        image = preprocess_image(image)
-        text = pytesseract.image_to_string(image, lang="eng")
-        rows = simple_parse_lines(text)
-        if not rows:
-            df = pd.DataFrame([{"Extracted Text": text}])
         else:
-            df = pd.DataFrame(rows, columns=["Item", "Price", "Category"])
-        excel_name = f"menu_{idx:03d}.xlsx"
-        excel_path = os.path.join(work_dir, excel_name)
-        df.to_excel(excel_path, index=False)
-        output_files.append(excel_path)
-    zip_name = f"menus_output_{uuid.uuid4().hex[:8]}.zip"
-    zip_path = os.path.join(work_dir, zip_name)
-    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
-        for path in output_files:
-            zipf.write(path, arcname=os.path.basename(path))
-    return zip_path
-with gr.Blocks(title="Menu to Excel (one file per image)") as demo:
-    gr.Markdown("## Menu to Excel converter\nUpload menu images to get a ZIP containing separate Excel files (one per image).")
     with gr.Row():
-        input_files = gr.File(
-            label="Upload menu images",
-            file_count="multiple",
-            type="filepath",   # ✅ correct
-            file_types=[".png", ".jpg", ".jpeg"]
-        )
-    run_btn = gr.Button("Process")
-    output_zip = gr.File(label="Download ZIP")
-    run_btn.click(fn=process_images_to_zip, inputs=[input_files], outputs=[output_zip])
-if __name__ == "__main__":
-    demo.launch()

+# app.py
+"""
+Menu OCR -> Excel (Batch) Hugging Face Space app (Gradio)
+Features:
+- Batch upload of menu images (expects filename format: <StoreName>_<StoreCode> <BranchName>.<ext>)
+- Parses filename to fill A1 (Store Name), B1 (Store Code), C1 (Branch Name)
+- OCR with Tesseract via pytesseract
+- Shows raw OCR text, line confidences, editable table for user validation
+- Saves one Excel per image (copy of uploaded template with rows starting at row 3)
+- Returns a ZIP of all processed Excel files
+IMPORTANT:
+- This app requires system Tesseract OCR to be installed on the host (Hugging Face Spaces often has it).
+  If you see errors about "Tesseract not found", install Tesseract or use a runtime that includes it.
+"""
 import gradio as gr
 import pandas as pd
+import pytesseract
+from pytesseract import Output
+import cv2
 import re
 import tempfile
+import shutil
+import os
+import numpy as np
+from PIL import Image
+from io import BytesIO
+from zipfile import ZipFile
+from openpyxl import load_workbook
+import logging
+logging.basicConfig(level=logging.INFO)
+# ---------- CONFIG ----------
+# Matches a price at the END of a line: an optional currency marker (₹, Rs, Rs., INR;
+# case-insensitive), 1-6 digits with an optional 1-2 digit decimal part, and an
+# optional "/-" suffix. Group 1 captures only the numeric part.
+PRICE_REGEX = re.compile(r"(?:₹|Rs\.?|INR)?\s*([0-9]{1,6}(?:\.[0-9]{1,2})?)(?:\s*/-)?\s*$", flags=re.IGNORECASE)
+# Lowercase keywords; looks_like_category() treats any line containing one of them
+# as a category heading.
+CATEGORY_HINTS = ["maggi", "noodles", "pizza", "burger", "rice", "continental", "beverages", "coffee", "tea"]
+# Fixed values copied into every parsed row for the template columns that are not
+# read from the OCR text.
+DEFAULTS = {
+    "Active": "1",
+    "Priority": "",
+    "Image": "",
+    "Food type": "",
+    "NoOfMains": "1",
+    "OnlineName": "",
+    "AlternateClassification": "",
+    "ItemTaxInclusive": "0",
+    "TaxPct": "",
+    "BrandName": "",
+    "ClassificationCode": "",
+    "HSN Code": ""
+}
+# ----------------------------
+def parse_filename(filename: str):
+    """Derive ``(store_name, store_code, branch_name)`` from an image file name.
+
+    Only the base name without its extension is inspected. Two layouts are
+    recognised:
+
+    * ``<StoreName>_<StoreCode> <BranchName>`` - split on the first underscore,
+      then on the first space of the remainder.
+    * ``<StoreName> (<BranchName>)`` - used when there is no underscore; the
+      store code is left empty.
+
+    Any other name is returned whole as the store name with empty code/branch.
+    """
+    stem, _ext = os.path.splitext(os.path.basename(filename))
+    if "_" not in stem:
+        paren = re.match(r"(.+?)\s*\((.+?)\)", stem)
+        if paren is None:
+            return stem, "", ""
+        return paren.group(1).strip(), "", paren.group(2).strip()
+    name_part, _, remainder = stem.partition("_")
+    code, _, branch = remainder.strip().partition(" ")
+    return name_part.strip(), code.strip(), branch.strip()
+def preprocess_image(np_img):
+    """Convert an RGB image array into a thresholded grayscale array for OCR.
+
+    Steps: grayscale conversion, upscaling when the shorter side is under
+    1000 px (by at least 1.5x), Gaussian adaptive thresholding, and a
+    morphological opening with a 1x1 kernel.
+    """
+    grayscale = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
+    height, width = grayscale.shape[:2]
+    shortest_side = min(height, width)
+    if shortest_side < 1000:
+        factor = max(1.5, 1000.0 / shortest_side)
+        grayscale = cv2.resize(
+            grayscale, None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC
+        )
+    binary = cv2.adaptiveThreshold(
+        grayscale, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 41, 11
+    )
+    return cv2.morphologyEx(binary, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))
+def ocr_with_confidence(pil_img):
+    """Run Tesseract on an image and group the recognised tokens into lines.
+
+    Tokens are grouped by their (block, paragraph, line) numbers as reported by
+    ``pytesseract.image_to_data`` and emitted in ascending order of that triple.
+
+    Returns:
+        ``(full_text, lines)`` where ``full_text`` is the lines joined with
+        newlines and ``lines`` is a list of ``{"line": str, "conf": float}``;
+        ``conf`` is the mean of the non-negative token confidences rounded to
+        two decimals, or ``0.0`` when the line has none.
+
+    Raises:
+        RuntimeError: if the Tesseract call fails; the original exception is
+            chained as the cause.
+    """
+    try:
+        data = pytesseract.image_to_data(pil_img, output_type=Output.DICT, lang='eng')
+    except Exception as e:
+        # Chain the cause so the underlying Tesseract error stays in the traceback.
+        raise RuntimeError(f"Tesseract OCR failed: {e}. Ensure Tesseract is installed on the host.") from e
+    texts = data.get('text', [])
+    confs = data.get('conf', [])
+    block_nums = data.get('block_num', [])
+    par_nums = data.get('par_num', [])
+    line_nums = data.get('line_num', [])
+    # Group tokens into lines using block/par/line
+    lines_map = {}
+    for t, c, b, p, ln_no in zip(texts, confs, block_nums, par_nums, line_nums):
+        if t is None or str(t).strip()=="":
             continue
+        # Integer tuple keys sort numerically without any string round-trip.
+        entry = lines_map.setdefault((int(b), int(p), int(ln_no)), {"tokens": [], "confs": []})
+        entry["tokens"].append(str(t))
+        try:
+            conf_val = float(c)
+        except (TypeError, ValueError):
+            # Unparseable confidence: treat it like a negative one and skip it below.
+            conf_val = -1.0
+        if conf_val >= 0:
+            entry["confs"].append(conf_val)
+    lines = []
+    for key in sorted(lines_map):
+        confs_line = lines_map[key]["confs"]
+        text_line = " ".join(lines_map[key]["tokens"]).strip()
+        avg_conf = round(sum(confs_line) / len(confs_line), 2) if confs_line else 0.0
+        lines.append({"line": text_line, "conf": avg_conf})
+    full_text = "\n".join(entry["line"] for entry in lines)
+    return full_text, lines
+def split_lines(text: str):
+    """Return the non-empty, stripped lines of *text*.
+
+    Bullet characters and tabs are replaced by a space and runs of spaces are
+    collapsed to one before the text is split.
+    """
+    without_bullets = re.sub(r"[•·●\t]", " ", text)
+    collapsed = re.sub(r"[ ]{2,}", " ", without_bullets)
+    result = []
+    for raw_line in collapsed.splitlines():
+        stripped = raw_line.strip()
+        if stripped:
+            result.append(stripped)
+    return result
+def looks_like_category(line: str):
+    """Return True when *line* looks like a heading rather than a menu item.
+
+    A line qualifies if it contains any CATEGORY_HINTS keyword
+    (case-insensitive), or if it has no digit and at most six
+    whitespace-separated words.
+    """
+    lowered = line.lower()
+    for hint in CATEGORY_HINTS:
+        if hint in lowered:
+            return True
+    has_digit = re.search(r"\d", line) is not None
+    return not has_digit and len(line.split()) <= 6
+def _build_menu_row(parent, category, name, price):
+    """Return one template row: the parsed fields followed by the DEFAULTS columns."""
+    row = {
+        "Parent Category": parent,
+        "Category": category,
+        "Name": name,
+        "Item Code": "",
+        "Master Item Name": name,
+        "EAN Code": "",
+        "Price": price,
+    }
+    # DEFAULTS is declared in template column order, so update() keeps the
+    # key order the DataFrame/Excel writers rely on.
+    row.update(DEFAULTS)
+    return row
+def parse_menu_lines(lines):
+    """Turn cleaned OCR lines into a list of menu-row dicts.
+
+    Each line is classified in this order:
+
+    1. Ends with a price (PRICE_REGEX) -> an item row; the name is the line
+       with the price removed and surrounding `` -:.`` stripped.
+    2. Looks like a heading (looks_like_category) -> updates the running
+       parent category (all-caps or hint keyword) or the running category.
+    3. Contains a digit but no trailing price -> an item row with an empty price.
+    4. Otherwise the line is ignored.
+
+    Returns:
+        A list of dicts carrying the parsed fields plus the DEFAULTS columns.
+    """
+    rows = []
+    current_parent = ""
+    current_category = ""
+    for ln in lines:
+        # The price test must run before the heading test: otherwise a priced
+        # item whose name contains a hint word ("Veg Burger 80", "Cold Coffee 60")
+        # is mistaken for a parent category and never emitted as a row.
+        m = PRICE_REGEX.search(ln)
+        if m:
+            name_part = PRICE_REGEX.sub("", ln).strip(" -:.")
+            rows.append(_build_menu_row(current_parent, current_category, name_part, m.group(1).strip()))
+            continue
+        if looks_like_category(ln):
+            heading = ln.strip(":- ")
+            if ln.isupper() or any(k in ln.lower() for k in CATEGORY_HINTS):
+                current_parent = heading
+            else:
+                current_category = heading
+            continue
+        if re.search(r"\d", ln):
+            # Digits but no recognisable trailing price: keep the line for manual review.
+            rows.append(_build_menu_row(current_parent, current_category, ln.strip(), ""))
+    return rows
+def fill_template_bytes(template_path, rows, store_name, store_code, branch_name):
+    """Fill the Excel template with store details and menu rows, in memory.
+
+    The store name, code and branch go into A1, B1 and C1 of the active sheet.
+    Each dict in *rows* is written to one sheet row starting at row 3, with
+    columns 1-19 taken from the keys listed below (missing keys become "").
+
+    Returns:
+        A ``BytesIO`` holding the saved workbook, positioned at the start.
+    """
+    column_keys = (
+        "Parent Category",
+        "Category",
+        "Name",
+        "Item Code",
+        "Master Item Name",
+        "EAN Code",
+        "Price",
+        "Active",
+        "Priority",
+        "Image",
+        "Food type",
+        "NoOfMains",
+        "OnlineName",
+        "AlternateClassification",
+        "ItemTaxInclusive",
+        "TaxPct",
+        "BrandName",
+        "ClassificationCode",
+        "HSN Code",
+    )
+    workbook = load_workbook(template_path)
+    sheet = workbook.active
+    sheet["A1"] = store_name
+    sheet["B1"] = store_code
+    sheet["C1"] = branch_name
+    for row_idx, item in enumerate(rows, start=3):
+        for col_idx, key in enumerate(column_keys, start=1):
+            sheet.cell(row=row_idx, column=col_idx, value=item.get(key, ""))
+    buffer = BytesIO()
+    workbook.save(buffer)
+    buffer.seek(0)
+    return buffer
+# Gradio UI
+# Layout only: the callbacks that drive these components are defined and wired
+# further down inside the same Blocks context.
+with gr.Blocks() as demo:
+    gr.Markdown("# 🍽️ Menu OCR → Excel (Batch + Validation)\nUpload multiple menu images (named like <StoreName>_<StoreCode> <BranchName>.jpg) and an Excel template. Parse → review/edit each file → download ZIP.")
+    # Inputs: the menu images to OCR and the .xlsx template the results are written into.
+    with gr.Row():
+        img_input = gr.File(label="Upload Menu Images (multiple)", file_count="multiple", file_types=["image"])
+        template_input = gr.File(label="Upload Excel Template (.xlsx)", file_count="single", file_types=[".xlsx"])
+    parse_btn = gr.Button("Parse all images")
+    # Holds the dict produced by parse_all, keyed by uploaded file name.
+    parsed_state = gr.State({})
+    status = gr.Textbox(label="Status", interactive=False)
+    # Picker for the parsed file to review.
+    with gr.Row():
+        file_select = gr.Dropdown(choices=[], label="Select parsed image to review")
+        refresh_btn = gr.Button("Refresh list")
+    # Read-only OCR output for the selected file: raw text and per-line confidence.
+    with gr.Row():
+        raw_text_area = gr.Textbox(label="Raw OCR Text", lines=10)
+        conf_area = gr.Dataframe(headers=["line","confidence"], interactive=False)
+    # Editable table of parsed rows; its headers are the template's 19 columns.
+    df_editor = gr.Dataframe(headers=["Parent Category","Category","Name","Item Code","Master Item Name","EAN Code","Price","Active","Priority","Image","Food type","NoOfMains","OnlineName","AlternateClassification","ItemTaxInclusive","TaxPct","BrandName","ClassificationCode","HSN Code"], interactive=True, datatype="str")
     with gr.Row():
+        save_btn = gr.Button("Save current edits (generate Excel for this file)")
+        save_status = gr.Textbox(label="Save status", interactive=False)
+    download_btn = gr.Button("Download ZIP of all (use after saving/edits)")
+    download_output = gr.File(label="Download ZIP")
+    def parse_all(images, template):
+        """OCR every uploaded image and collect the parsed result per file.
+
+        Args:
+            images: the uploaded image files; each entry may be a path string
+                or a file-like object exposing ``.name``, depending on the
+                Gradio version.
+            template: the uploaded template; only checked for presence here.
+
+        Returns:
+            ``(parsed, status_message, dropdown_update, raw_text)``. ``parsed``
+            maps each file path either to its parse result (store fields,
+            rows, raw text, line confidences) or to ``{"error": message}``
+            when that file failed.
+        """
+        if images is None or template is None:
+            return {}, "Please upload images and a template", gr.update(choices=[], value=None), ""
+        parsed = {}
+        for img in images:
+            # Resolve the path up front so the error handler below can never
+            # fail on a missing ``.name`` attribute.
+            img_path = img if isinstance(img, str) else img.name
+            try:
+                with open(img_path, "rb") as fh:
+                    raw = fh.read()
+                store_name, store_code, branch_name = parse_filename(img_path)
+                pil = Image.open(BytesIO(raw)).convert("RGB")
+                np_img = np.array(pil)
+                pre = preprocess_image(np_img)
+                pil_pre = Image.fromarray(pre)
+                full_text, lines_conf = ocr_with_confidence(pil_pre)
+                lines = split_lines(full_text)
+                rows = parse_menu_lines(lines)
+                parsed[img_path] = {
+                    "store_name": store_name,
+                    "store_code": store_code,
+                    "branch_name": branch_name,
+                    "rows": rows,
+                    "raw_text": full_text,
+                    "lines_conf": lines_conf
+                }
+            except Exception as e:
+                # Best effort per file: record the failure and keep processing the rest.
+                parsed[img_path] = {
+                    "error": str(e)
+                }
+        choices = list(parsed.keys())
+        # A plain list returned to a Dropdown is taken as its *value*; the
+        # option list has to be replaced through an explicit update.
+        return parsed, f"Parsed {len(choices)} images", gr.update(choices=choices, value=None), ""
+    parse_btn.click(fn=parse_all, inputs=[img_input, template_input], outputs=[parsed_state, status, file_select, raw_text_area])
+    def refresh_choices(parsed):
+        """Reload the dropdown options from the parsed-state keys and clear the status box."""
+        names = list(parsed.keys()) if parsed else []
+        # Update the option list explicitly; returning a bare list would be
+        # interpreted as the dropdown's selected value.
+        return gr.update(choices=names), ""
+    refresh_btn.click(fn=refresh_choices, inputs=[parsed_state], outputs=[file_select, status])
+    def show_file(selected, parsed):
+        """Return the review data for the selected file.
+
+        Returns:
+            ``(raw_text, rows_dataframe, confidence_table)``. When nothing is
+            selected, or the selection is not in *parsed*, empty values are
+            returned. When the file failed to parse, the error message is
+            returned in place of the raw text.
+        """
+        if not parsed or not selected:
+            return "", pd.DataFrame(), []
+        item = parsed.get(selected)
+        if item is None:
+            # Stale selection (e.g. left over from an earlier parse run).
+            return "", pd.DataFrame(), []
+        if "error" in item:
+            return f"Error parsing {selected}: {item['error']}", pd.DataFrame(), []
+        raw = item.get("raw_text","")
+        df = pd.DataFrame(item.get("rows",[]))
+        df_conf = pd.DataFrame(item.get("lines_conf",[]))
+        return raw, df, df_conf
+    file_select.change(fn=show_file, inputs=[file_select, parsed_state], outputs=[raw_text_area, df_editor, conf_area])
+    def save_current(selected, parsed, edited_df, template):
+        """Write the edited rows of the selected file into a filled template copy.
+
+        The workbook is saved to a temporary ``.xlsx`` file whose path is stored
+        under ``"generated_path"`` in the file's parsed entry, together with the
+        edited rows.
+
+        Returns:
+            A status message describing the outcome.
+        """
+        if not parsed or not selected:
+            return "Nothing to save"
+        item = parsed.get(selected)
+        if item is None:
+            # Stale selection that is no longer part of the parsed state.
+            return "Nothing to save"
+        if "error" in item:
+            return f"Cannot save: {item['error']}"
+        if template is None:
+            return "Cannot save: please upload an Excel template"
+        # The upload may be a path string or a file-like object exposing ``.name``.
+        template_path = template if isinstance(template, str) else template.name
+        if isinstance(edited_df, pd.DataFrame):
+            rows = edited_df.fillna("").to_dict(orient="records")
+        else:
+            rows = edited_df
+        out_buf = fill_template_bytes(template_path, rows, item["store_name"], item["store_code"], item["branch_name"])
+        # delete=False keeps the file on disk for the later ZIP step; the
+        # context manager still closes the handle even if the write fails.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
+            tmp.write(out_buf.read())
+        # Record the edits only after the workbook was written successfully.
+        # NOTE(review): the state dict is mutated in place and not returned as
+        # an output - confirm this persists on the Gradio version in use.
+        item["rows"] = rows
+        item["generated_path"] = tmp.name
+        parsed[selected] = item
+        return f"Saved {selected} -> {os.path.basename(tmp.name)}"
+    save_btn.click(fn=save_current, inputs=[file_select, parsed_state, df_editor, template_input], outputs=[save_status])
+    def download_all(parsed, template):
+        """Bundle one result per parsed file into ``Menu_Results.zip``.
+
+        For each entry of *parsed*:
+
+        * already saved by the user -> the saved workbook is added;
+        * failed to parse -> a ``*_PARSE_ERROR.txt`` note is added;
+        * otherwise -> a workbook is generated now from the stored rows.
+
+        A failure while adding or generating a workbook is recorded as an
+        ``*_ERROR.txt`` / ``*_SAVE_ERROR.txt`` note instead of aborting the
+        whole archive.
+
+        Returns:
+            The path of the ZIP file, or ``None`` when nothing was parsed.
+        """
+        if not parsed:
+            return None
+        # The upload may be a path string or a file-like object exposing ``.name``.
+        template_path = None
+        if template is not None:
+            template_path = template if isinstance(template, str) else template.name
+        tempdir = tempfile.mkdtemp()
+        zip_path = os.path.join(tempdir, "Menu_Results.zip")
+        with ZipFile(zip_path, "w") as zf:
+            for name, item in parsed.items():
+                stem = os.path.splitext(os.path.basename(name))[0]
+                if "generated_path" in item:
+                    try:
+                        zf.write(item["generated_path"], arcname=stem + ".xlsx")
+                    except Exception as e:
+                        zf.writestr(stem + "_ERROR.txt", str(e))
+                elif "error" in item:
+                    zf.writestr(stem + "_PARSE_ERROR.txt", item["error"])
+                else:
+                    # if not saved by user, auto-generate now
+                    try:
+                        if template_path is None:
+                            raise ValueError("No Excel template uploaded")
+                        out_buf = fill_template_bytes(template_path, item.get("rows",[]), item.get("store_name",""), item.get("store_code",""), item.get("branch_name",""))
+                        zf.writestr(stem + ".xlsx", out_buf.getvalue())
+                    except Exception as e:
+                        zf.writestr(stem + "_SAVE_ERROR.txt", str(e))
+        return zip_path
+    download_btn.click(fn=download_all, inputs=[parsed_state, template_input], outputs=[download_output])
+# Start the Gradio app. There is no ``__main__`` guard, so this also runs
+# whenever the module is imported.
+demo.launch()