martinofumagalli committed on
Commit
7146dfb
·
verified ·
1 Parent(s): 04bae7d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -28
app.py CHANGED
@@ -1,21 +1,29 @@
1
  import io, os, re
2
- from typing import List, Dict, Tuple
3
  import streamlit as st
4
  import pandas as pd
5
 
6
- # PDF text
7
  import pdfplumber
8
  from pypdf import PdfReader
9
 
10
- # OCR
11
  from pdf2image import convert_from_bytes
12
  import pytesseract
13
  from PIL import Image
14
 
15
- SCHEMA = ["Piece","SKU","Title","Capacity","% Recycled","Weight","Color","Material / Resin","Class","Source File"]
16
-
17
- # ------------------ low-level extractors ------------------
 
 
 
 
 
18
 
 
 
 
19
  def extract_text_pages(pdf_bytes: bytes) -> List[str]:
20
  pages = []
21
  # 1) pdfplumber
@@ -46,13 +54,14 @@ def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) ->
46
  texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
47
  return texts
48
 
49
- # ------------------ domain parsing ------------------
50
-
51
- SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
52
- TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
53
- CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.I)
 
54
 
55
- def first(text: str, pattern: re.Pattern, group: int = 1) -> str:
56
  m = pattern.search(text or "")
57
  return m.group(group).strip() if m else ""
58
 
@@ -63,14 +72,13 @@ def capacity_from(text: str) -> str:
63
  return f"{m.group(1).replace(',', '.')} {unit}"
64
 
65
  def color_from(text: str) -> str:
66
- # preferisci "Part Color" / "Color" oppure parole in MAIUSCOLO vicino a GREEN/TRANSPARENT ecc.
67
  m = re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", text, re.I)
68
  if m: return m.group(1).strip()
69
  m = re.search(r"\b([A-Z ]{4,}(?:GREEN|TRANSPARENT|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b", text)
70
  return (m.group(1).strip() if m else "")
71
 
72
  def material_from(text: str) -> str:
73
- # cattura righe con "RESIN" o "SERIOPLAST ... RESIN"
74
  for line in (text or "").splitlines():
75
  if re.search(r"\bRESIN\b", line, re.I):
76
  return line.strip()
@@ -78,30 +86,100 @@ def material_from(text: str) -> str:
78
  return m.group(1).strip() if m else ""
79
 
80
  def piece_from(text: str, cls: str) -> str:
81
- # se trovi "Packaging Material Type Rigid- Bottle" → "Bottle"
82
  m = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text, re.I)
83
  if m:
84
  seg = m.group(1)
85
- m2 = re.search(r"\b(Bottle|Cap|Container|Lid|Carton|Case)\b", seg, re.I)
86
  if m2: return m2.group(1).capitalize()
87
- # dal Class "Bottles" → "Bottle", "Corrugated" → "Container"
88
  if cls:
89
  if "Bottle" in cls or "Bottles" in cls: return "Bottle"
90
  if "Cap" in cls or "Closures" in cls: return "Cap"
91
  if "Corrugated" in cls: return "Container"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  return ""
93
 
94
  def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
95
  full = "\n".join(pages or [""])
96
- sku = first(full, SKU_RE)
97
- title = first(full, TITLE_RE)
98
- cls = first(full, CLASS_RE)
99
- cap = capacity_from(title) or capacity_from(full)
100
  color = color_from(full)
101
  material = material_from(full)
102
  piece = piece_from(full, cls)
103
 
104
- rec = {
 
 
 
 
 
 
105
  "Piece": piece or "",
106
  "SKU": sku or "",
107
  "Title": title or "",
@@ -112,14 +190,19 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
112
  "Material / Resin": material or "",
113
  "Class": cls or "",
114
  "Source File": source_name,
 
 
 
 
 
115
  }
116
- return rec
117
-
118
- # ------------------ UI ------------------
119
 
120
- st.set_page_config(page_title="PDF → Table (OCR)", layout="wide")
 
 
 
121
  st.title("📄→📊 PDF → Table (OCR-ready)")
122
- st.caption("Carica PDF (anche scansioni). Per ogni file compilo: Piece, SKU, Title, Capacity, % Recycled, Weight, Color, Material / Resin, Class, Source File.")
123
 
124
  with st.sidebar:
125
  files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
@@ -150,7 +233,6 @@ for up in files:
150
  if ocr_fallback and not any((p or "").strip() for p in pages):
151
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
152
  rec = parse_record(pages, up.name)
153
- # se nel futuro ci saranno più SKU in un PDF, qui potremmo generare più rec (lista)
154
  rows.append(rec)
155
  except Exception as e:
156
  errors.append((up.name, str(e)))
 
1
  import io, os, re
2
+ from typing import List, Dict
3
  import streamlit as st
4
  import pandas as pd
5
 
6
+ # --- PDF text
7
  import pdfplumber
8
  from pypdf import PdfReader
9
 
10
+ # --- OCR
11
  from pdf2image import convert_from_bytes
12
  import pytesseract
13
  from PIL import Image
14
 
15
+ # ======================================================================
16
+ # SCHEMA TABELLA (colonne fisse)
17
+ # ======================================================================
18
+ SCHEMA = [
19
+ "Piece","SKU","Title","Capacity","% Recycled","Weight","Color","Material / Resin","Class","Source File",
20
+ # nuove colonne
21
+ "Component","Function","General description of the packaging","Material Ref GCAS","Material Family"
22
+ ]
23
 
24
+ # ======================================================================
25
+ # ESTRATTORI LOW-LEVEL
26
+ # ======================================================================
27
  def extract_text_pages(pdf_bytes: bytes) -> List[str]:
28
  pages = []
29
  # 1) pdfplumber
 
54
  texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
55
  return texts
56
 
57
+ # ======================================================================
58
+ # PARSING DOMINIO (euristiche/regex leggere)
59
+ # ======================================================================
60
+ SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
61
+ TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
62
+ CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.I)
63
 
64
def _first(text: str, pattern: re.Pattern, group: int = 1) -> str:
    """Return the stripped text of *group* from the first match of *pattern*.

    Args:
        text: Text to search; ``None`` or empty is treated as ``""``.
        pattern: Compiled regular expression to search with.
        group: Index of the capture group to return (default 1).

    Returns:
        The captured text with surrounding whitespace removed, or ``""``
        when there is no match or the group did not take part in the match.
    """
    match = pattern.search(text or "")
    if match is None:
        return ""
    # An optional group that did not participate yields None, not "".
    return (match.group(group) or "").strip()
67
 
 
72
  return f"{m.group(1).replace(',', '.')} {unit}"
73
 
74
def color_from(text: str) -> str:
    """Extract a color description from *text*.

    Two attempts are made, in order:

    1. A ``Part Color`` / ``Color`` label (case-insensitive), optionally
       followed by ``:`` or ``-``, then a run of at least three letters or
       spaces.
    2. An upper-case run of letters/spaces that contains one of the known
       color words (GREEN, TRANSPARENT, WHITE, BLACK, BLUE, RED, CLEAR).

    Args:
        text: Text to search; ``None`` or empty is treated as ``""``.

    Returns:
        The stripped color text, or ``""`` when neither attempt matches.
    """
    source = text or ""  # re.search raises TypeError on None
    labelled = re.search(
        r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", source, re.I
    )
    if labelled:
        return labelled.group(1).strip()
    # Deliberately case-sensitive: only upper-case runs are considered here.
    shouted = re.search(
        r"\b([A-Z ]{4,}(?:GREEN|TRANSPARENT|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b",
        source,
    )
    return shouted.group(1).strip() if shouted else ""
79
 
80
  def material_from(text: str) -> str:
81
+ # cattura righe con "RESIN" o frasi simili
82
  for line in (text or "").splitlines():
83
  if re.search(r"\bRESIN\b", line, re.I):
84
  return line.strip()
 
86
  return m.group(1).strip() if m else ""
87
 
88
def piece_from(text: str, cls: str) -> str:
    """Derive the packaging piece name from *text*, falling back to *cls*.

    The text is searched for a ``Packaging Material Type`` line; if the rest
    of that line contains one of the known piece words, that word is
    returned capitalized (e.g. ``"Packaging Material Type Rigid- Bottle"``
    gives ``"Bottle"``). Otherwise the class string is checked for a few
    known substrings (case-sensitive).

    Args:
        text: Text to search; ``None`` or empty is treated as ``""``.
        cls: Class string for the fallback; may be empty or ``None``.

    Returns:
        The piece name, or ``""`` when nothing matches.
    """
    type_line = re.search(
        r"Packaging\s*Material\s*Type\s*([^\n]+)", text or "", re.I
    )
    if type_line:
        word = re.search(
            r"\b(Bottle|Cap|Container|Lid|Carton|Case|Label|Tape)\b",
            type_line.group(1),
            re.I,
        )
        if word:
            return word.group(1).capitalize()
    if cls:
        # "Bottle" also covers "Bottles"; the checks are substring-based.
        if "Bottle" in cls:
            return "Bottle"
        if "Cap" in cls or "Closures" in cls:
            return "Cap"
        if "Corrugated" in cls:
            return "Container"
        if "Label" in cls:
            return "Label"
    return ""
100
+
101
+ # --- Nuove colonne: euristiche base (si possono migliorare con esempi reali)
102
+ FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
103
+
104
def component_from(text: str, piece: str, cls: str) -> str:
    """Guess the packaging component described by *text*.

    Explicit keywords in the lower-cased text take priority and are checked
    in a fixed order; the first hit wins. Checks are plain substring tests,
    so a keyword also matches inside a longer word. If no keyword is found,
    *piece* is returned when non-empty, otherwise a label derived from
    *cls*.

    Args:
        text: Text to scan; ``None`` or empty is treated as ``""``.
        piece: Already-detected piece name, used as the first fallback.
        cls: Class string, used as the second fallback; may be ``None``.

    Returns:
        The component label, or ``""`` when nothing applies.
    """
    txt = (text or "").lower()

    # Explicit keywords first, in priority order.
    if "ink" in txt and "cartridge" in txt:
        return "Ink cartridge"
    if "ink foil" in txt:
        return "Ink foil"
    if "tape" in txt:
        return "Tape"
    if "label" in txt and any(k in txt for k in ("psl", "wet glue", "iml", "htl")):
        return "Labels"
    if "adhesive" in txt or "hot melt" in txt:
        return "Adhesive"
    if "cartonboard" in txt or "sheet" in txt:
        return "Cartonboard / Sheet"
    # "case" already covers "outercase" as a substring.
    if "corrugated" in txt or "case" in txt:
        return "Corrugated box"
    if "bundle" in txt:
        return "Bundle"

    # Fallback: the piece name, then the class string.
    if piece:
        return piece
    cls_lower = (cls or "").lower()
    if "bottle" in cls_lower:
        return "Bottle"
    if "cap" in cls_lower:
        return "Closure"
    if "corrugated" in cls_lower:
        return "Corrugated box"
    if "label" in cls_lower:
        return "Labels"
    return ""
123
+
124
def function_from(text: str) -> str:
    """Return the packaging function keyword found in *text*.

    Uses the module-level ``FUNCTION_RE`` and returns its first match in
    title case.

    Args:
        text: Text to search; ``None`` or empty is treated as ``""``.

    Returns:
        The matched keyword in title case with internal whitespace
        collapsed to single spaces, or ``""`` when there is no match.
    """
    match = FUNCTION_RE.search(text or "")
    if match is None:
        return ""
    # The pattern allows arbitrary whitespace (including line breaks)
    # inside a multi-word match; collapse it so the value stays on one line.
    return " ".join(match.group(1).split()).title()
127
+
128
def material_ref_gcas_from(text: str) -> str:
    """Collect up to three distinct numeric reference codes from *text*.

    Two patterns are tried in order; the first that yields any match
    decides the result:

    1. Stand-alone runs of 7 to 9 digits (e.g. ``90082546``).
    2. A parenthesised run of 5 or more digits followed by ``kg pack``
       (case-insensitive), e.g. ``(12345 kg pack)``.
       NOTE(review): the original comment hinted at a slightly different
       shape for this pattern; confirm against real documents.

    Args:
        text: Text to search; ``None`` or empty is treated as ``""``.

    Returns:
        The distinct codes in order of first appearance, limited to three
        and joined with ``", "``; ``""`` when neither pattern matches.
    """
    source = text or ""
    patterns = (
        (r"\b(\d{7,9})\b", 0),
        (r"\((\d{5,})\s*kg\s*pack\)", re.I),
    )
    for pattern, flags in patterns:
        codes = re.findall(pattern, source, flags)
        if codes:
            # dict.fromkeys removes duplicates while keeping first-seen order.
            unique = list(dict.fromkeys(codes))
            return ", ".join(unique[:3])  # cap at three codes
    return ""
147
+
148
def material_family_from(text: str) -> str:
    """Map *text* to one of the known material family names.

    The text is first checked (case-insensitively, by substring) against
    the list of family names, in list order. When the first hit is itself
    contained in another family name that also occurs in the text (e.g.
    "Paper" inside "Coated paper"), the more specific name is returned;
    without this, the longer names could never be selected. If no family
    name occurs, a few common keywords are used as a fallback.

    Args:
        text: Text to search; ``None`` or empty is treated as ``""``.

    Returns:
        The family name, or ``""`` when nothing matches.
    """
    families = [
        "Monolayer HDPE", "Polypropylene (PP)", "Paper", "Flexible Film – Mono non Metallized",
        "Flexible - Label PSL WGL IML HTL", "Rigid Paper – Corrugated Case",
        "Inks and solvents", "Hot melt adhesive", "Wet Glue Label",
        "Coated paper", "Wood", "Ink foil", "Fasson PE 85 TOP White",
    ]
    raw = text or ""
    lowered = raw.lower()  # computed once instead of per family

    hits = [fam for fam in families if fam.lower() in lowered]
    if hits:
        best = hits[0]
        # Prefer a matching family whose name contains the current pick.
        for fam in hits[1:]:
            if best.lower() in fam.lower():
                best = fam
        return best

    # Fallback on common keywords ("HDPE" is matched case-sensitively).
    if re.search(r"\bHDPE\b", raw):
        return "Monolayer HDPE"
    if re.search(r"\bPP\b|\bPolypropylene\b", raw, re.I):
        return "Polypropylene (PP)"
    if "corrugated" in lowered:
        return "Rigid Paper – Corrugated Case"
    if "paper" in lowered:
        return "Paper"
    return ""
165
 
166
  def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
167
  full = "\n".join(pages or [""])
168
+ sku = _first(full, SKU_RE)
169
+ title = _first(full, TITLE_RE)
170
+ cls = _first(full, CLASS_RE)
171
+ cap = capacity_from(title) or capacity_from(full)
172
  color = color_from(full)
173
  material = material_from(full)
174
  piece = piece_from(full, cls)
175
 
176
+ # nuove colonne (euristiche leggere)
177
+ comp = component_from(full, piece, cls)
178
+ func = function_from(full)
179
+ gcas = material_ref_gcas_from(full)
180
+ mfam = material_family_from(full)
181
+
182
+ return {
183
  "Piece": piece or "",
184
  "SKU": sku or "",
185
  "Title": title or "",
 
190
  "Material / Resin": material or "",
191
  "Class": cls or "",
192
  "Source File": source_name,
193
+ "Component": comp or "",
194
+ "Function": func or "",
195
+ "General description of the packaging": "", # da riempire con regole quando ci dai esempi strutturati
196
+ "Material Ref GCAS": gcas or "",
197
+ "Material Family": mfam or ""
198
  }
 
 
 
199
 
200
+ # ======================================================================
201
+ # UI STREAMLIT
202
+ # ======================================================================
203
+ st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
204
  st.title("📄→📊 PDF → Table (OCR-ready)")
205
+ st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR come fallback.")
206
 
207
  with st.sidebar:
208
  files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
 
233
  if ocr_fallback and not any((p or "").strip() for p in pages):
234
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
235
  rec = parse_record(pages, up.name)
 
236
  rows.append(rec)
237
  except Exception as e:
238
  errors.append((up.name, str(e)))