martinofumagalli committed on
Commit
7e449bd
·
verified ·
1 Parent(s): f35ad91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +270 -125
app.py CHANGED
@@ -13,206 +13,351 @@ import pytesseract
13
  from PIL import Image
14
 
15
  # ======================================================================
16
- # SCHEMA TABELLA
17
  # ======================================================================
18
  SCHEMA = [
19
  "Piece","SKU","Title","Capacity","% Recycled","Weight","Color","Material / Resin","Class","Source File",
 
20
  "Component","Function","General description of the packaging","Material Ref GCAS","Material Family"
21
  ]
22
 
23
  # ======================================================================
24
- # LETTURA PDF E OCR
25
  # ======================================================================
26
  def extract_text_pages(pdf_bytes: bytes) -> List[str]:
 
 
27
  try:
28
  with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
29
- return [p.extract_text() or "" for p in pdf.pages]
 
30
  except Exception:
31
- pass
32
- try:
33
- reader = PdfReader(io.BytesIO(pdf_bytes))
34
- return [(p.extract_text() or "") for p in reader.pages]
35
- except Exception:
36
- return []
 
 
 
37
 
38
  def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) -> List[str]:
39
  if tesseract_cmd:
40
  pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
41
- imgs = convert_from_bytes(pdf_bytes, dpi=dpi)
 
42
  config = "--psm 6 -c preserve_interword_spaces=1"
43
- return [pytesseract.image_to_string(im, lang=lang, config=config) or "" for im in imgs]
 
 
 
 
44
 
45
  # ======================================================================
46
- # REGEX E PARSER
47
  # ======================================================================
48
- SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
49
- TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
50
- CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.I)
51
 
52
- def _first(text, pat):
53
- m = pat.search(text or ""); return m.group(1).strip() if m else ""
 
54
 
55
- def capacity_from(t):
56
- m=re.search(r"([0-9]+(?:[.,][0-9]+)?)\s*(L|Liter|ml|mL)\b",t or "",re.I)
57
  if not m: return ""
58
- return f"{m.group(1).replace(',','.')} {m.group(2).upper().replace('LITER','L').replace('ML','ml')}"
 
59
 
60
- def color_from(t):
61
- m=re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})",t,re.I)
62
  if m: return m.group(1).strip()
63
- m=re.search(r"\b([A-Z ]{4,}(?:GREEN|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b",t)
64
- return m.group(1).strip() if m else ""
65
 
66
- def material_from(t):
67
- for l in (t or "").splitlines():
68
- if re.search(r"\bRESIN\b",l,re.I): return l.strip()
69
- m=re.search(r"(SERIOPLAST.*?RESIN)",t,re.I)
 
 
70
  return m.group(1).strip() if m else ""
71
 
72
- # ======================================================================
73
- # WEIGHT PARSER → restituisce solo "94±3g"
74
- # ======================================================================
75
- WEIGHT_VALUE_RE = re.compile(r"\bweight\b[^\n\r]{0,80}?([0-9]+(?:[.,][0-9]+)?\s*(?:±|\+/?-|\+-)?\s*[0-9]*\s*(?:mg|g|kg))\b", re.I)
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- def weight_from(t: str) -> str:
78
- if not t: return ""
79
- m = WEIGHT_VALUE_RE.search(t)
80
- if m: return re.sub(r"\s+", "", m.group(1)).replace(",", ".")
81
- for line in t.splitlines():
 
 
 
 
 
 
 
 
 
 
 
82
  if "weight" in line.lower():
83
- m2 = re.search(r"([0-9]+(?:[.,][0-9]+)?\s*(?:±|\+/?-|\+-)?\s*[0-9]*\s*(?:mg|g|kg))", line, re.I)
84
- if m2: return re.sub(r"\s+", "", m2.group(1)).replace(",", ".")
 
 
 
 
85
  return ""
 
86
 
87
- # ======================================================================
88
- # ALTRE FUNZIONI (uguali a prima)
89
- # ======================================================================
90
- _ALLOWED_PIECES={"ribbon":"ribbon","bottle":"bottle","film bundle":"film bundle","container":"container",
91
- "label - adhesive":"LABEL - ADHESIVE","label adhesive":"LABEL - ADHESIVE","label-adhesive":"LABEL - ADHESIVE",
92
- "label - back":"LABEL - BACK","back label":"LABEL - BACK","label back":"LABEL - BACK","closure":"CLOSURE"}
93
- _PACK_COMP_TYPE_RE=re.compile(r"Packaging\s+Component\s+Type\s*[:\-]?\s*([^\n\r]+)",re.I)
94
-
95
- def _normalize_piece(s):
96
- s2=re.sub(r"\s+"," ",(s or "").strip().lower())
97
- for k,v in _ALLOWED_PIECES.items():
98
- if k in s2: return v
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  return ""
100
 
101
- def piece_from(t,cls):
102
- m=_PACK_COMP_TYPE_RE.search(t or "")
103
- if m: val=_normalize_piece(m.group(1));
104
- if m and val: return val
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  if cls:
106
- if "bottle" in cls.lower(): return "bottle"
107
- if "cap" in cls.lower(): return "CLOSURE"
108
- if "corrugated" in cls.lower(): return "container"
109
- if "label" in cls.lower(): return "LABEL - BACK"
 
 
 
 
 
 
 
 
110
  return ""
 
111
 
112
- FUNCTION_RE=re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b",re.I)
 
113
 
114
- def component_from(t,piece,cls):
115
- txt=t.lower()
116
- if "label" in txt: return "Labels"
 
 
 
 
 
 
 
 
 
117
  if piece: return piece
118
- if cls and "bottle" in cls.lower(): return "Bottle"
 
 
 
 
119
  return ""
120
 
121
- def function_from(t):
122
- m=FUNCTION_RE.search(t or ""); return m.group(1).title() if m else ""
 
123
 
124
- def material_ref_gcas_from(t):
125
- m=re.findall(r"\b(\d{7,9})\b",t or ""); return ", ".join(sorted(set(m))) if m else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- def material_family_from(t):
128
- fams=["Monolayer HDPE","Polypropylene (PP)","Paper","Rigid Paper – Corrugated Case"]
129
- for f in fams:
130
- if f.lower() in (t or "").lower(): return f
131
- if re.search(r"\bHDPE\b",t): return "Monolayer HDPE"
132
- if re.search(r"\bPP\b",t,re.I): return "Polypropylene (PP)"
 
 
 
 
 
 
 
 
 
133
  return ""
134
 
135
- # ======================================================================
136
- # PARSER PRINCIPALE
137
- # ======================================================================
138
- def parse_record(pages: List[str], source_name: str) -> Dict[str,str]:
139
- full="\n".join(pages or [""])
140
- sku=_first(full,SKU_RE)
141
- title=_first(full,TITLE_RE)
142
- cls=_first(full,CLASS_RE)
143
- cap=capacity_from(title) or capacity_from(full)
144
- color=color_from(full)
145
- material=material_from(full)
146
- piece=piece_from(full,cls)
147
- comp=component_from(full,piece,cls)
148
- func=function_from(full)
149
- gcas=material_ref_gcas_from(full)
150
- mfam=material_family_from(full)
151
- wght=weight_from(full)
 
 
152
  return {
153
- "Piece":piece or "","SKU":sku or "","Title":title or "","Capacity":cap or "",
154
- "% Recycled":"–","Weight":wght or "–","Color":color or "","Material / Resin":material or "",
155
- "Class":cls or "","Source File":source_name,"Component":comp or "","Function":func or "",
156
- "General description of the packaging":"","Material Ref GCAS":gcas or "","Material Family":mfam or ""
 
 
 
 
 
 
 
 
 
 
 
157
  }
158
 
159
  # ======================================================================
160
- # STREAMLIT UI
161
  # ======================================================================
162
  st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
163
  st.title("📄→📊 PDF → Table (OCR-ready)")
164
- st.caption("Estrae automaticamente i campi, incluso il peso dalle immagini OCR.")
165
 
166
  with st.sidebar:
167
- files=st.file_uploader("Seleziona PDF",type=["pdf"],accept_multiple_files=True)
168
  st.markdown("---")
169
  st.subheader("OCR")
170
- ocr_fallback=st.checkbox("Usa OCR se non c'è testo",value=True)
171
- ocr_lang=st.text_input("Lingue OCR (comma)",value="eng,ita")
172
- ocr_dpi=st.number_input("DPI OCR",200,600,300,50)
173
- tess_path=st.text_input("Percorso Tesseract (se non nel PATH)",value="")
174
- run_btn=st.button("▶️ Estrai")
175
 
176
  if not run_btn:
177
  st.info("Carica i PDF e premi **Estrai**.")
178
  st.stop()
 
179
  if not files:
180
  st.warning("Nessun PDF caricato.")
181
  st.stop()
182
 
183
- lang="+".join([p.strip() for p in ocr_lang.split(",") if p.strip()]) or "eng"
184
- tess_cmd=tess_path.strip() or None
185
- rows,errors=[],[]
186
 
 
187
  for up in files:
188
  try:
189
- raw=up.read()
190
- pages=extract_text_pages(raw)
191
  if ocr_fallback and not any((p or "").strip() for p in pages):
192
- pages=run_ocr(raw,lang=lang,dpi=int(ocr_dpi),tesseract_cmd=tess_cmd)
193
- rec=parse_record(pages,up.name)
194
- # se Weight vuoto, prova OCR
195
- if (not rec.get("Weight") or rec["Weight"]=="–") and ocr_fallback:
196
- ocr_pages=run_ocr(raw,lang=lang,dpi=int(ocr_dpi),tesseract_cmd=tess_cmd)
197
- w_ocr=weight_from("\n".join(ocr_pages))
198
- if w_ocr: rec["Weight"]=w_ocr
199
  rows.append(rec)
200
  except Exception as e:
201
- errors.append((up.name,str(e)))
202
 
203
  if errors:
204
  with st.expander("Errori"):
205
- for n,e in errors: st.error(f"{n}: {e}")
 
206
 
207
- df=pd.DataFrame(rows,columns=SCHEMA)
208
  st.success(f"Creat{ 'e' if len(df)!=1 else 'a' } {len(df)} riga/e.")
209
- st.dataframe(df,use_container_width=True)
210
 
211
- c1,c2=st.columns(2)
212
  with c1:
213
- st.download_button("⬇️ CSV",df.to_csv(index=False).encode("utf-8"),"table.csv","text/csv")
214
  with c2:
215
- bio=io.BytesIO()
216
- with pd.ExcelWriter(bio,engine="openpyxl") as xw:
217
- df.to_excel(xw,index=False,sheet_name="data")
218
- st.download_button("⬇️ Excel",bio.getvalue(),"table.xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
 
 
13
  from PIL import Image
14
 
15
  # ======================================================================
16
+ # SCHEMA TABELLA (colonne fisse)
17
  # ======================================================================
18
# Fixed, ordered column set of the output table; every parsed record is
# projected onto exactly these columns when the DataFrame is built.
SCHEMA = [
    "Piece",
    "SKU",
    "Title",
    "Capacity",
    "% Recycled",
    "Weight",
    "Color",
    "Material / Resin",
    "Class",
    "Source File",
    # columns added later
    "Component",
    "Function",
    "General description of the packaging",
    "Material Ref GCAS",
    "Material Family",
]
23
 
24
  # ======================================================================
25
+ # ESTRATTORI LOW-LEVEL
26
  # ======================================================================
27
def extract_text_pages(pdf_bytes: bytes) -> List[str]:
    """Return the embedded text of each PDF page, one string per page.

    pdfplumber is tried first. If it raises, or every page it yields is
    blank, pypdf's ``PdfReader`` is tried instead. Any extraction error is
    swallowed on purpose (best effort): the result is then an empty list.
    """
    extracted: List[str] = []

    # 1) pdfplumber; a failure anywhere discards partial results.
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            extracted = [page.extract_text() or "" for page in document.pages]
    except Exception:
        extracted = []

    # 2) pypdf fallback when nothing usable came out of step 1.
    has_text = any((chunk or "").strip() for chunk in extracted)
    if not has_text:
        try:
            fallback_reader = PdfReader(io.BytesIO(pdf_bytes))
            extracted = [(page.extract_text() or "") for page in fallback_reader.pages]
        except Exception:
            extracted = []

    return extracted
44
 
45
def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) -> List[str]:
    """OCR every page of a PDF and return one text string per page.

    Args:
        pdf_bytes: Raw PDF content.
        lang: Language string handed to ``pytesseract.image_to_string``.
        dpi: Rasterisation resolution handed to ``convert_from_bytes``.
        tesseract_cmd: Optional path of the tesseract executable; when
            truthy it overrides ``pytesseract.pytesseract.tesseract_cmd``
            (a module-wide setting).
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    config = "--psm 6 -c preserve_interword_spaces=1"

    def _as_image(candidate):
        # Objects that are not PIL images are passed through .convert("RGB").
        if isinstance(candidate, Image.Image):
            return candidate
        return candidate.convert("RGB")

    return [
        pytesseract.image_to_string(_as_image(page_image), lang=lang, config=config) or ""
        for page_image in convert_from_bytes(pdf_bytes, dpi=dpi)
    ]
56
 
57
  # ======================================================================
58
+ # PARSING DOMINIO (euristiche/regex leggere)
59
  # ======================================================================
60
# Identifier following "Name", "SKU" or "Part"/"Part No." (5+ chars).
SKU_RE = re.compile(
    r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})",
    re.IGNORECASE,
)
# Rest of the line after "Title:" / "Title -".
TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.IGNORECASE)
# Run of letters/spaces directly after the word "Class".
CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.IGNORECASE)
63
 
64
+ def _first(text: str, pattern: re.Pattern, group: int = 1) -> str:
65
+ m = pattern.search(text or "")
66
+ return m.group(group).strip() if m else ""
67
 
68
def capacity_from(text: str) -> str:
    """Return the first "<number> <L|ml>" capacity found in ``text``.

    Decimal commas become dots; the unit is emitted as "L" (for L/Liter)
    or "ml". Returns "" when no capacity is present.
    """
    found = re.search(r"([0-9]+(?:[.,][0-9]+)?)\s*(L|Liter|ml|mL)\b", text or "", re.I)
    if found is None:
        return ""
    amount = found.group(1).replace(",", ".")
    # The pattern only admits L / Liter / ml (any case).
    canonical_unit = {"L": "L", "LITER": "L", "ML": "ml"}[found.group(2).upper()]
    return f"{amount} {canonical_unit}"
73
 
74
def color_from(text: str) -> str:
    """Extract a colour description from ``text``.

    Strategy:
      1. A value following "Part Color" / "Color" (case-insensitive).
      2. Otherwise an upper-case phrase of at least four characters that is
         followed by a known colour word (GREEN, WHITE, ...).

    Returns "" when nothing matches. ``None`` or empty input is treated as
    empty text, consistent with the other extractors in this module
    (previously ``None`` raised ``TypeError`` inside ``re.search``).
    """
    source = text or ""
    labelled = re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", source, re.I)
    if labelled:
        return labelled.group(1).strip()
    # No re.I here: only genuinely upper-case phrases count as colour names.
    shouted = re.search(
        r"\b([A-Z ]{4,}(?:GREEN|TRANSPARENT|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b",
        source,
    )
    return shouted.group(1).strip() if shouted else ""
79
 
80
def material_from(text: str) -> str:
    """Return the material/resin description found in ``text``.

    The first line containing the whole word "RESIN" (any case) is returned
    stripped. If no such line exists, the shortest "SERIOPLAST ... RESIN"
    span on a single line is returned (this also catches e.g. "RESINS",
    where the whole-word test fails). Returns "" otherwise.

    ``None`` is treated as empty text for both steps; previously only the
    line scan was guarded and the fallback search raised ``TypeError``.
    """
    source = text or ""
    for line in source.splitlines():
        if re.search(r"\bRESIN\b", line, re.I):
            return line.strip()
    m = re.search(r"(SERIOPLAST.*?RESIN)", source, re.I)
    return m.group(1).strip() if m else ""
87
 
88
+ # --- (AGGIUNTA) WEIGHT PARSER ----------------------------------------------
89
+ WEIGHT_TOL_RE = re.compile(
90
+ r"\bWeight\b[^\n\r]{0,15}?([0-9]+(?:[.,][0-9]+)?)\s*(?:±|\+/?-|\+-)\s*([0-9]+(?:[.,][0-9]+)?)\s*(mg|g|kg)?",
91
+ re.I,
92
+ )
93
+ WEIGHT_SIMPLE_RE = re.compile(
94
+ r"\bWeight\b[^\n\r]{0,15}?([0-9]+(?:[.,][0-9]+)?)\s*(mg|g|kg)\b",
95
+ re.I,
96
+ )
97
+ WEIGHT_INLINE_RE = re.compile(
98
+ r"\b([0-9]+(?:[.,][0-9]+)?)\s*(?:±|\+/?-|\+-)\s*([0-9]+(?:[.,][0-9]+)?)\s*(mg|g|kg)\b",
99
+ re.I,
100
+ )
101
+
102
+ def _norm_num(s: str) -> str:
103
+ return (s or "").replace(",", ".").strip().rstrip(".")
104
 
105
+ def weight_from(text: str) -> str:
106
+ # 1) match con tolleranza
107
+ m = WEIGHT_TOL_RE.search(text or "")
108
+ if m:
109
+ val = _norm_num(m.group(1))
110
+ tol = _norm_num(m.group(2))
111
+ unit = (m.group(3) or "g").lower()
112
+ return f"{val} ± {tol} {unit}"
113
+ # 2) match semplice con unità
114
+ m = WEIGHT_SIMPLE_RE.search(text or "")
115
+ if m:
116
+ val = _norm_num(m.group(1))
117
+ unit = (m.group(2) or "g").lower()
118
+ return f"{val} {unit}"
119
+ # 3) riga per riga per casi OCR
120
+ for line in (text or "").splitlines():
121
  if "weight" in line.lower():
122
+ m2 = WEIGHT_INLINE_RE.search(line)
123
+ if m2:
124
+ val = _norm_num(m2.group(1))
125
+ tol = _norm_num(m2.group(2))
126
+ unit = (m2.group(3) or "g").lower()
127
+ return f"{val} ± {tol} {unit}"
128
  return ""
129
+ # ---------------------------------------------------------------------------
130
 
131
+ # --------------------- AGGIUNTA RICHIESTA: PIECE da "Packaging Component Type" ---------------------
132
+ _ALLOWED_PIECES = {
133
+ "ribbon": "ribbon",
134
+ "bottle": "bottle",
135
+ "film bundle": "film bundle",
136
+ "container": "container",
137
+ "label - adhesive": "LABEL - ADHESIVE",
138
+ "label adhesive": "LABEL - ADHESIVE",
139
+ "label-adhesive": "LABEL - ADHESIVE",
140
+ "label - back": "LABEL - BACK",
141
+ "back label": "LABEL - BACK",
142
+ "label back": "LABEL - BACK",
143
+ "closure": "CLOSURE",
144
+ }
145
+
146
+ _PACK_COMP_TYPE_RE = re.compile(
147
+ r"Packaging\s+Component\s+Type\s*[:\-]?\s*([^\n\r]+)", re.I
148
+ )
149
+
150
+ def _normalize_piece(s: str) -> str:
151
+ s0 = (s or "").strip()
152
+ s1 = re.sub(r"\s+", " ", s0)
153
+ s2 = s1.lower()
154
+ s2 = s2.replace("–", "-").replace("—", "-")
155
+ s2 = s2.replace("label- ", "label ").replace(" -", " - ").strip()
156
+ # prova match diretto
157
+ if s2 in _ALLOWED_PIECES:
158
+ return _ALLOWED_PIECES[s2]
159
+ # prova alcune normalizzazioni
160
+ s2 = s2.replace(" ", " ")
161
+ if s2 in _ALLOWED_PIECES:
162
+ return _ALLOWED_PIECES[s2]
163
+ # fallback per frasi lunghe: cerca la keyword migliore
164
+ for key, canon in _ALLOWED_PIECES.items():
165
+ if key in s2:
166
+ return canon
167
  return ""
168
 
169
def piece_from(text: str, cls: str) -> str:
    """Determine the "Piece" value for a document.

    Sources, in priority order:
      1. The value of a "Packaging Component Type" line.
      2. The value of a "Packaging Material Type" line (legacy fallback).
      3. The class string: normalised directly, then by keyword.

    Each candidate goes through ``_normalize_piece``; the first non-empty
    result is returned. Returns "" when nothing can be determined.
    """
    haystack = text or ""

    # 1) + 2) labelled lines, most specific label first
    labelled_hits = (
        _PACK_COMP_TYPE_RE.search(haystack),
        re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", haystack, re.I),
    )
    for hit in labelled_hits:
        if hit:
            candidate = _normalize_piece(hit.group(1))
            if candidate:
                return candidate

    # 3) fall back to the class string
    if not cls:
        return ""
    from_class = _normalize_piece(cls)
    if from_class:
        return from_class

    lowered = cls.lower()
    keyword_rules = (
        (("bottle",), "bottle"),
        (("cap", "closure"), "CLOSURE"),
        (("corrugated",), "container"),
        (("label",), "LABEL - BACK"),  # neutral choice when unspecified
    )
    for keywords, label in keyword_rules:
        if any(word in lowered for word in keywords):
            return label
    return ""
205
+ # ----------------------------------------------------------------------------------------------------
206
 
207
# --- Newer columns: basic heuristics (can be refined with real samples)
# Packaging level keyword: Primary, Secondary, "Secondary or Tertiary", Tertiary.
FUNCTION_RE = re.compile(
    r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b",
    re.IGNORECASE,
)
209
 
210
def component_from(text: str, piece: str, cls: str) -> str:
    """Guess the "Component" column from document text, piece and class.

    Explicit keywords in ``text`` take priority (checked in the order
    below, first hit wins); otherwise ``piece`` is returned as-is, and as a
    last resort a keyword in ``cls`` decides. Returns "" when nothing
    applies.

    ``None``/empty ``text`` is treated as empty text, consistent with the
    other extractors (previously ``None`` raised ``AttributeError``).

    NOTE(review): keyword tests are plain substring checks, so e.g. "sheet"
    also fires on "data sheet" and "case" on "lowercase" — confirm whether
    that is acceptable for the real documents.
    """
    txt = (text or "").lower()

    # explicit keywords have priority
    if "ink" in txt and "cartridge" in txt:
        return "Ink cartridge"
    if "ink foil" in txt:
        return "Ink foil"
    if "tape" in txt:
        return "Tape"
    if "label" in txt and any(kind in txt for kind in ("psl", "wet glue", "iml", "htl")):
        return "Labels"
    if "adhesive" in txt or "hot melt" in txt:
        return "Adhesive"
    if "cartonboard" in txt or "sheet" in txt:
        return "Cartonboard / Sheet"
    if "corrugated" in txt or "case" in txt or "outercase" in txt:
        return "Corrugated box"
    if "bundle" in txt:
        return "Bundle"

    # fall back to the detected piece, then to the class string
    if piece:
        return piece
    if cls:
        cls_lower = cls.lower()
        if "bottle" in cls_lower:
            return "Bottle"
        if "cap" in cls_lower:
            return "Closure"
        if "corrugated" in cls_lower:
            return "Corrugated box"
        if "label" in cls_lower:
            return "Labels"
    return ""
229
 
230
def function_from(text: str) -> str:
    """Return the first packaging-level keyword in ``text``, title-cased, or ""."""
    hit = FUNCTION_RE.search(text or "")
    if hit is None:
        return ""
    return hit.group(1).title()
233
 
234
def material_ref_gcas_from(text: str) -> str:
    """Return up to three distinct material reference codes, comma-separated.

    Stand-alone 7-9 digit numbers (e.g. 90082546) are preferred; only if
    none exist are codes of the form "(<5+ digits> kg pack)" used. Codes
    keep their order of first appearance. Returns "" when none are found.
    """
    source = text or ""
    code_patterns = (
        (r"\b(\d{7,9})\b", 0),
        (r"\((\d{5,})\s*kg\s*pack\)", re.I),
    )
    for pattern, flags in code_patterns:
        codes = re.findall(pattern, source, flags)
        if codes:
            # dict.fromkeys de-duplicates while keeping first-seen order
            unique_codes = list(dict.fromkeys(codes))
            return ", ".join(unique_codes[:3])
    return ""
251
 
252
def material_family_from(text: str) -> str:
    """Classify the material family mentioned in ``text``.

    First looks for one of the known family names (case-insensitive
    substring, first hit in list order wins), then falls back to looser
    keywords (HDPE, PP/Polypropylene, corrugated, paper). Returns "" when
    nothing matches.
    """
    # Order matters: names that contain another entry as a substring
    # ("Rigid Paper – Corrugated Case", "Coated paper") must come before the
    # generic "Paper", otherwise they could never be returned.
    families = (
        "Monolayer HDPE",
        "Polypropylene (PP)",
        "Rigid Paper – Corrugated Case",
        "Coated paper",
        "Paper",
        "Flexible Film – Mono non Metallized",
        "Flexible - Label PSL WGL IML HTL",
        "Inks and solvents",
        "Hot melt adhesive",
        "Wet Glue Label",
        "Wood",
        "Ink foil",
        "Fasson PE 85 TOP White",
    )
    t = text or ""
    lowered = t.lower()  # computed once instead of per family

    for fam in families:
        if fam.lower() in lowered:
            return fam

    # looser keyword fallbacks
    if re.search(r"\bHDPE\b", t):  # case-sensitive on purpose
        return "Monolayer HDPE"
    if re.search(r"\bPP\b|\bPolypropylene\b", t, re.I):
        return "Polypropylene (PP)"
    if "corrugated" in lowered:
        return "Rigid Paper – Corrugated Case"
    if "paper" in lowered:
        return "Paper"
    return ""
268
 
269
def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
    """Build one output row (keyed by the ``SCHEMA`` columns) from page texts.

    Args:
        pages: Text of each page; joined with newlines before parsing.
        source_name: File name stored in the "Source File" column.

    Returns:
        A dict with every schema column. Missing values are "", except
        "Weight" (placeholder "–" when absent) and "% Recycled" (always
        "–"); "General description of the packaging" is always "".
    """
    full = "\n".join(pages or [""])

    title = _first(full, TITLE_RE)
    cls = _first(full, CLASS_RE)
    piece = piece_from(full, cls)

    record: Dict[str, str] = {
        "Piece": piece or "",
        "SKU": _first(full, SKU_RE) or "",
        "Title": title or "",
        # capacity stated in the title wins over one elsewhere in the text
        "Capacity": (capacity_from(title) or capacity_from(full)) or "",
        "% Recycled": "–",
        "Weight": weight_from(full) or "–",
        "Color": color_from(full) or "",
        "Material / Resin": material_from(full) or "",
        "Class": cls or "",
        "Source File": source_name,
        # newer columns (light heuristics)
        "Component": component_from(full, piece, cls) or "",
        "Function": function_from(full) or "",
        "General description of the packaging": "",
        "Material Ref GCAS": material_ref_gcas_from(full) or "",
        "Material Family": material_family_from(full) or "",
    }
    return record
305
 
306
  # ======================================================================
307
+ # UI STREAMLIT
308
  # ======================================================================
309
# Page header.
st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
st.title("📄→📊 PDF → Table (OCR-ready)")
st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR come fallback.")

# Sidebar: file selection and OCR settings.
with st.sidebar:
    files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
    st.markdown("---")
    st.subheader("OCR")
    ocr_fallback = st.checkbox("Usa OCR se non c'è testo", value=True)
    ocr_lang = st.text_input("Lingue OCR (comma)", value="eng,ita")
    ocr_dpi = st.number_input("DPI OCR", min_value=200, max_value=600, value=300, step=50)
    tess_path = st.text_input("Percorso Tesseract (se non nel PATH)", value="")
    run_btn = st.button("▶️ Estrai")

# Nothing happens until the button is pressed and at least one file is given.
if not run_btn:
    st.info("Carica i PDF e premi **Estrai**.")
    st.stop()

if not files:
    st.warning("Nessun PDF caricato.")
    st.stop()

# "eng, ita" -> "eng+ita"; an empty field falls back to English.
lang_codes = [code.strip() for code in ocr_lang.split(",") if code.strip()]
lang = "+".join(lang_codes) or "eng"
tess_cmd = tess_path.strip() or None

# Process every upload; a failure on one file is recorded and does not stop
# the others.
rows, errors = [], []
for uploaded in files:
    try:
        payload = uploaded.read()
        pages = extract_text_pages(payload)
        no_embedded_text = not any((page or "").strip() for page in pages)
        if ocr_fallback and no_embedded_text:
            pages = run_ocr(payload, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
        rows.append(parse_record(pages, uploaded.name))
    except Exception as exc:
        errors.append((uploaded.name, str(exc)))

if errors:
    with st.expander("Errori"):
        for name, err in errors:
            st.error(f"{name}: {err}")

# Result table plus CSV / Excel downloads.
df = pd.DataFrame(rows, columns=SCHEMA)
st.success(f"Creat{ 'e' if len(df)!=1 else 'a' } {len(df)} riga/e.")
st.dataframe(df, use_container_width=True)

csv_col, xlsx_col = st.columns(2)
with csv_col:
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    st.download_button("⬇️ CSV", data=csv_bytes, file_name="table.csv", mime="text/csv")
with xlsx_col:
    bio = io.BytesIO()
    with pd.ExcelWriter(bio, engine="openpyxl") as xw:
        df.to_excel(xw, index=False, sheet_name="data")
    st.download_button(
        "⬇️ Excel",
        data=bio.getvalue(),
        file_name="table.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
363
+