viswanani committed on
Commit
325160c
·
verified ·
1 Parent(s): e2b566d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -31
app.py CHANGED
@@ -8,14 +8,10 @@ import zipfile
8
  import tempfile
9
  import uuid
10
 
11
- # -----------------------------
12
- # Basic parsing helpers
13
- # -----------------------------
14
  PRICE_PATTERN = re.compile(r'(?<!\d)(?:₹\s*|Rs\.?\s*|INR\s*)?\d+(?:\.\d{1,2})?(?!\d)')
15
  CLEAN_PRICE = re.compile(r'[^0-9.]')
16
 
17
  def preprocess_image(img: Image.Image) -> Image.Image:
18
- # Convert to grayscale, increase contrast, denoise lightly, sharpen
19
  gray = ImageOps.grayscale(img)
20
  enhanced = ImageOps.autocontrast(gray)
21
  denoised = enhanced.filter(ImageFilter.MedianFilter(size=3))
@@ -23,30 +19,18 @@ def preprocess_image(img: Image.Image) -> Image.Image:
23
  return sharpened
24
 
25
  def simple_parse_lines(text: str):
26
- """
27
- Heuristic parser:
28
- - Splits text into lines
29
- - Tries to extract Item and Price from each line
30
- - Category guessed from headings (lines in ALL CAPS or ending with ':')
31
- """
32
  rows = []
33
  current_category = None
34
-
35
  lines = [l.strip() for l in text.splitlines() if l.strip()]
36
  for line in lines:
37
- # Category guess
38
  if (line.isupper() and len(line.split()) <= 6) or line.endswith(':'):
39
  current_category = line.rstrip(':').strip()
40
  continue
41
-
42
- # Find price
43
  price_match = PRICE_PATTERN.search(line)
44
  if price_match:
45
  price_text = price_match.group(0)
46
  price_value = CLEAN_PRICE.sub('', price_text)
47
- # Item is everything before price
48
  item = line[:price_match.start()].strip(" -:•\t")
49
- # Cleanup item
50
  item = re.sub(r'\s{2,}', ' ', item)
51
  if item:
52
  rows.append({
@@ -57,39 +41,26 @@ def simple_parse_lines(text: str):
57
  return rows
58
 
59
  def process_images_to_zip(files):
60
- # Create temp workspace
61
  work_dir = tempfile.mkdtemp(prefix="menu_excel_")
62
  output_files = []
63
-
64
  for idx, file_path in enumerate(files, start=1):
65
- # Load image
66
  image = Image.open(file_path).convert("RGB")
67
  image = preprocess_image(image)
68
-
69
- # OCR
70
  text = pytesseract.image_to_string(image, lang="eng")
71
-
72
- # Parse
73
  rows = simple_parse_lines(text)
74
  if not rows:
75
- # Fallback: dump raw text if parsing failed
76
  df = pd.DataFrame([{"Extracted Text": text}])
77
  else:
78
  df = pd.DataFrame(rows, columns=["Item", "Price", "Category"])
79
-
80
- # Save Excel
81
  excel_name = f"menu_{idx:03d}.xlsx"
82
  excel_path = os.path.join(work_dir, excel_name)
83
  df.to_excel(excel_path, index=False)
84
  output_files.append(excel_path)
85
-
86
- # Bundle ZIP
87
  zip_name = f"menus_output_{uuid.uuid4().hex[:8]}.zip"
88
  zip_path = os.path.join(work_dir, zip_name)
89
  with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
90
  for path in output_files:
91
  zipf.write(path, arcname=os.path.basename(path))
92
-
93
  return zip_path
94
 
95
  with gr.Blocks(title="Menu to Excel (one file per image)") as demo:
@@ -98,12 +69,11 @@ with gr.Blocks(title="Menu to Excel (one file per image)") as demo:
98
  input_files = gr.File(
99
  label="Upload menu images",
100
  file_count="multiple",
101
- type="filepath", # ✅ fixed here
102
  file_types=[".png", ".jpg", ".jpeg"]
103
  )
104
  run_btn = gr.Button("Process")
105
  output_zip = gr.File(label="Download ZIP")
106
-
107
  run_btn.click(fn=process_images_to_zip, inputs=[input_files], outputs=[output_zip])
108
 
109
  if __name__ == "__main__":
 
8
  import tempfile
9
  import uuid
10
 
 
 
 
11
  PRICE_PATTERN = re.compile(r'(?<!\d)(?:₹\s*|Rs\.?\s*|INR\s*)?\d+(?:\.\d{1,2})?(?!\d)')
12
  CLEAN_PRICE = re.compile(r'[^0-9.]')
13
 
14
  def preprocess_image(img: Image.Image) -> Image.Image:
 
15
  gray = ImageOps.grayscale(img)
16
  enhanced = ImageOps.autocontrast(gray)
17
  denoised = enhanced.filter(ImageFilter.MedianFilter(size=3))
 
19
  return sharpened
20
 
21
  def simple_parse_lines(text: str):
 
 
 
 
 
 
22
  rows = []
23
  current_category = None
 
24
  lines = [l.strip() for l in text.splitlines() if l.strip()]
25
  for line in lines:
 
26
  if (line.isupper() and len(line.split()) <= 6) or line.endswith(':'):
27
  current_category = line.rstrip(':').strip()
28
  continue
 
 
29
  price_match = PRICE_PATTERN.search(line)
30
  if price_match:
31
  price_text = price_match.group(0)
32
  price_value = CLEAN_PRICE.sub('', price_text)
 
33
  item = line[:price_match.start()].strip(" -:•\t")
 
34
  item = re.sub(r'\s{2,}', ' ', item)
35
  if item:
36
  rows.append({
 
41
  return rows
42
 
43
  def process_images_to_zip(files):
 
44
  work_dir = tempfile.mkdtemp(prefix="menu_excel_")
45
  output_files = []
 
46
  for idx, file_path in enumerate(files, start=1):
 
47
  image = Image.open(file_path).convert("RGB")
48
  image = preprocess_image(image)
 
 
49
  text = pytesseract.image_to_string(image, lang="eng")
 
 
50
  rows = simple_parse_lines(text)
51
  if not rows:
 
52
  df = pd.DataFrame([{"Extracted Text": text}])
53
  else:
54
  df = pd.DataFrame(rows, columns=["Item", "Price", "Category"])
 
 
55
  excel_name = f"menu_{idx:03d}.xlsx"
56
  excel_path = os.path.join(work_dir, excel_name)
57
  df.to_excel(excel_path, index=False)
58
  output_files.append(excel_path)
 
 
59
  zip_name = f"menus_output_{uuid.uuid4().hex[:8]}.zip"
60
  zip_path = os.path.join(work_dir, zip_name)
61
  with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
62
  for path in output_files:
63
  zipf.write(path, arcname=os.path.basename(path))
 
64
  return zip_path
65
 
66
  with gr.Blocks(title="Menu to Excel (one file per image)") as demo:
 
69
  input_files = gr.File(
70
  label="Upload menu images",
71
  file_count="multiple",
72
+ type="filepath", # ✅ correct
73
  file_types=[".png", ".jpg", ".jpeg"]
74
  )
75
  run_btn = gr.Button("Process")
76
  output_zip = gr.File(label="Download ZIP")
 
77
  run_btn.click(fn=process_images_to_zip, inputs=[input_files], outputs=[output_zip])
78
 
79
  if __name__ == "__main__":