martinofumagalli committed on
Commit
89254c6
·
verified ·
1 Parent(s): 01593d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +223 -13
app.py CHANGED
@@ -1,24 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # ======================================================================
2
- # WEIGHT PARSER restituisce SOLO il valore, es. "94±3g" (no "Weight")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  # ======================================================================
4
  WEIGHT_VALUE_RE = re.compile(
5
- r"""(?ix) # i=ignorecase, x=verbose
6
  \bweight\b
7
- [^\n\r]{0,80}? # pochi caratteri sulla stessa riga
8
- ( # ======= GRUPPO CATTURATO =======
9
- (?:\d+(?:[.,]\d+)?\s* # valore principale es. 94 o 94,5
10
- (?:±|\+/?-|\+-)\s* # simbolo tolleranza: ± o +/- o +-
11
- \d+(?:[.,]\d+)?\s* # tolleranza es. 3 o 3,0
12
- (?:mg|g|kg) # unità
13
- )
14
  |
15
- (?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)) # fallback: solo "94 g"
16
- ) # =================================
17
  """,
18
  )
19
 
20
  def _normalize_weight(s: str) -> str:
21
- # compatta spazi, normalizza simbolo e virgole → "94±3g"
22
  s = (s or "").strip()
23
  s = s.replace(" ", "")
24
  s = s.replace("+/-", "±").replace("+-", "±")
@@ -31,7 +102,6 @@ def weight_from(t: str) -> str:
31
  m = WEIGHT_VALUE_RE.search(t)
32
  if m:
33
  return _normalize_weight(m.group(1))
34
- # Fallback riga-per-riga (OCR con spezzature strane)
35
  for line in (t or "").splitlines():
36
  if "weight" in line.lower():
37
  m2 = re.search(
@@ -41,3 +111,143 @@ def weight_from(t: str) -> str:
41
  if m2:
42
  return _normalize_weight(m2.group(1))
43
  return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, os, re
2
+ from typing import List, Dict
3
+ import streamlit as st
4
+ import pandas as pd
5
+
6
+ # --- PDF text
7
+ import pdfplumber
8
+ from pypdf import PdfReader
9
+
10
+ # --- OCR
11
+ from pdf2image import convert_from_bytes
12
+ import pytesseract
13
+ from PIL import Image
14
+
15
+ # ======================================================================
16
+ # SCHEMA TABELLA
17
+ # ======================================================================
18
# Column names of the output table, in the order used when the DataFrame is
# built (and therefore in the CSV / Excel exports).
SCHEMA = [
    "Piece","SKU","Title","Capacity","% Recycled","Weight","Color","Material / Resin","Class","Source File",
    "Component","Function","General description of the packaging","Material Ref GCAS","Material Family"
]
22
+
23
  # ======================================================================
24
+ # FUNZIONI BASE PDF / OCR
25
+ # ======================================================================
26
def extract_text_pages(pdf_bytes: bytes) -> List[str]:
    """Return the embedded text of each page of a PDF, one string per page.

    pdfplumber is tried first; if it raises, pypdf is tried.  A page with no
    extractable text contributes an empty string.  When both libraries raise,
    an empty list is returned.
    """
    # First attempt: pdfplumber.
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            texts = []
            for page in document.pages:
                texts.append(page.extract_text() or "")
            return texts
    except Exception:
        # Best effort: any pdfplumber failure falls through to pypdf.
        pass

    # Second attempt: pypdf.
    try:
        texts = []
        for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
            texts.append(page.extract_text() or "")
    except Exception:
        return []
    return texts
37
+
38
def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) -> List[str]:
    """Render a PDF to images and OCR every page with pytesseract.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        lang: Language string passed to pytesseract as ``lang``.
        dpi: Resolution passed to ``convert_from_bytes``.
        tesseract_cmd: When truthy, assigned to
            ``pytesseract.pytesseract.tesseract_cmd`` before OCR runs (a
            module-level setting, so it persists after this call).

    Returns:
        One recognised-text string per rendered page ("" for empty results).
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    page_images = convert_from_bytes(pdf_bytes, dpi=dpi)
    tess_config = "--psm 6 -c preserve_interword_spaces=1"
    texts = []
    for image in page_images:
        recognised = pytesseract.image_to_string(image, lang=lang, config=tess_config)
        texts.append(recognised or "")
    return texts
44
+
45
+ # ======================================================================
46
+ # PARSER CAMPI TESTUALI
47
+ # ======================================================================
48
# "Name", "SKU" or "Part" / "Part No." (case-insensitive), an optional ":" or
# "#", then a captured run of 5+ letters, digits or - _ / . characters.
SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
# "Title" followed by ":" or "-"; captures the rest of that line.
TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
# "Class" followed by a captured run of letters and spaces.
CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.I)
51
+
52
def _first(text, pat):
    """Return the stripped first capture group of *pat* in *text*, or "".

    *text* may be ``None`` or empty; *pat* is a compiled pattern with at
    least one capture group.
    """
    match = pat.search(text or "")
    if match is None:
        return ""
    return match.group(1).strip()
55
+
56
def capacity_from(t):
    """Return the first volume found in *t* as "<number> <unit>", or "".

    A decimal comma in the number becomes a dot.  The unit is normalised to
    "L" (for "L" / "Liter") or "ml" (for any casing of "ml").
    """
    found = re.search(r"([0-9]+(?:[.,][0-9]+)?)\s*(L|Liter|ml|mL)\b", t or "", re.I)
    if found is None:
        return ""
    amount = found.group(1).replace(',', '.')
    unit = found.group(2).upper().replace('LITER','L').replace('ML','ml')
    return f"{amount} {unit}"
60
+
61
def color_from(t):
    """Return the colour description found in *t*, or "".

    Two strategies are tried in order:

    1. A labelled value: "Color" or "Part Color" (case-insensitive),
       optionally followed by ":" or "-", then 3+ letters/spaces.
    2. An upper-case phrase that contains one of GREEN, WHITE, BLACK, BLUE,
       RED or CLEAR preceded by at least four upper-case letters/spaces.

    ``None`` or empty input yields "" (the original passed ``None`` straight
    to ``re.search`` and raised ``TypeError``).
    """
    text = t or ""
    labelled = re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", text, re.I)
    if labelled:
        return labelled.group(1).strip()
    # Unlabelled fallback is deliberately case-sensitive (no re.I).
    named = re.search(r"\b([A-Z ]{4,}(?:GREEN|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b", text)
    return named.group(1).strip() if named else ""
66
+
67
def material_from(t):
    """Return the material/resin description found in *t*, or "".

    The first line containing the whole word "RESIN" (case-insensitive) is
    returned stripped.  If no such line exists, the shortest span running
    from "SERIOPLAST" to "RESIN" on a single line is returned instead.

    ``None`` or empty input yields "" (the original fallback search raised
    ``TypeError`` on ``None``).
    """
    text = t or ""
    for line in text.splitlines():
        if re.search(r"\bRESIN\b", line, re.I):
            return line.strip()
    # Reached when "RESIN" only occurs as part of a longer word, e.g. "RESINS".
    span = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
    return span.group(1).strip() if span else ""
73
+
74
+ # ======================================================================
75
+ # WEIGHT PARSER → restituisce esattamente "94±3g"
76
  # ======================================================================
77
# Verbose, case-insensitive pattern.  It matches the word "weight", then
# lazily up to 80 characters on the same line, then captures (group 1) either
#   * value, tolerance sign (±, +/- or +-), tolerance, unit  e.g. "94 ± 3 g"
#   * or, failing that, just value and unit                  e.g. "94 g"
# Numbers may use "." or "," as decimal separator; units are mg, g or kg.
WEIGHT_VALUE_RE = re.compile(
    r"""(?ix)
    \bweight\b
    [^\n\r]{0,80}?
    (
    (?:\d+(?:[.,]\d+)?\s*
    (?:±|\+/?-|\+-)\s*
    \d+(?:[.,]\d+)?\s*
    (?:mg|g|kg))
    |
    (?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg))
    )
    """,
)
91
 
92
  def _normalize_weight(s: str) -> str:
 
93
  s = (s or "").strip()
94
  s = s.replace(" ", "")
95
  s = s.replace("+/-", "±").replace("+-", "±")
 
102
  m = WEIGHT_VALUE_RE.search(t)
103
  if m:
104
  return _normalize_weight(m.group(1))
 
105
  for line in (t or "").splitlines():
106
  if "weight" in line.lower():
107
  m2 = re.search(
 
111
  if m2:
112
  return _normalize_weight(m2.group(1))
113
  return ""
114
+
115
+ # ======================================================================
116
+ # ALTRE FUNZIONI
117
+ # ======================================================================
118
# Lower-case phrases looked for (as substrings, in this order) inside a
# "Packaging Component Type" value, mapped to the piece label to report.
_ALLOWED_PIECES = {
    "ribbon": "ribbon",
    "bottle": "bottle",
    "film bundle": "film bundle",
    "container": "container",
    "label - adhesive": "LABEL - ADHESIVE",
    "label adhesive": "LABEL - ADHESIVE",
    "label-adhesive": "LABEL - ADHESIVE",
    "label - back": "LABEL - BACK",
    "back label": "LABEL - BACK",
    "label back": "LABEL - BACK",
    "closure": "CLOSURE",
}
# Captures the rest of the line after "Packaging Component Type" and an
# optional ":" or "-".
_PACK_COMP_TYPE_RE = re.compile(r"Packaging\s+Component\s+Type\s*[:\-]?\s*([^\n\r]+)", re.I)


def _normalize_piece(s):
    """Map a free-text component type to a known piece label, or ""."""
    # Lower-case and collapse whitespace runs so the phrase lookup is stable.
    squashed = re.sub(r"\s+", " ", (s or "").strip().lower())
    return next(
        (label for phrase, label in _ALLOWED_PIECES.items() if phrase in squashed),
        "",
    )


def piece_from(t,cls):
    """Return the piece label for a document, or "" when it cannot be told.

    The "Packaging Component Type" value in *t* is used when it maps to a
    known piece; otherwise keywords in the class string *cls* decide.
    """
    declared = _PACK_COMP_TYPE_RE.search(t or "")
    if declared is not None:
        piece = _normalize_piece(declared.group(1))
        if piece:
            return piece
    if not cls:
        return ""
    cls_lower = cls.lower()
    # Checked in order; the first keyword contained in the class wins.
    keyword_to_piece = (
        ("bottle", "bottle"),
        ("cap", "CLOSURE"),
        ("corrugated", "container"),
        ("label", "LABEL - BACK"),
    )
    for keyword, piece in keyword_to_piece:
        if keyword in cls_lower:
            return piece
    return ""
142
+
143
# Whole-word, case-insensitive match of "Primary", "Secondary",
# "Secondary or Tertiary" or "Tertiary"; the matched text is group 1.
FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
144
+
145
def component_from(t,piece,cls):
    """Return the component name for a document, or "".

    Precedence: "Labels" when the text *t* mentions "label" anywhere
    (case-insensitive); otherwise *piece* when it is non-empty; otherwise
    "Bottle" when the class *cls* mentions "bottle".

    *t* may be ``None`` (the original called ``t.lower()`` unguarded and
    raised ``AttributeError``).
    """
    if "label" in (t or "").lower():
        return "Labels"
    if piece:
        return piece
    if cls and "bottle" in cls.lower():
        return "Bottle"
    return ""
151
+
152
def function_from(t):
    """Return the first packaging function found in *t*, title-cased, or ""."""
    hit = FUNCTION_RE.search(t or "")
    if hit is None:
        return ""
    return hit.group(1).title()
155
+
156
def material_ref_gcas_from(t):
    """Return every distinct standalone 7-9 digit number in *t*.

    The numbers are de-duplicated, sorted as strings and joined with ", ".
    "" is returned when none is found.
    """
    codes = {hit.group(1) for hit in re.finditer(r"\b(\d{7,9})\b", t or "")}
    return ", ".join(sorted(codes))
159
+
160
def material_family_from(t):
    """Return the material family mentioned in *t*, or "".

    A family name appearing verbatim (case-insensitive) wins.  Otherwise the
    standalone token "HDPE" (case-sensitive) maps to "Monolayer HDPE" and the
    standalone token "PP" (case-insensitive) to "Polypropylene (PP)".

    ``None`` or empty input yields "" (the original regex fallbacks raised
    ``TypeError`` on ``None``).
    """
    text = t or ""
    lowered = text.lower()
    # "Rigid Paper – Corrugated Case" must be tested before "Paper": the
    # latter is a substring of the former, so with "Paper" first the more
    # specific family could never be returned.
    families = (
        "Monolayer HDPE",
        "Polypropylene (PP)",
        "Rigid Paper – Corrugated Case",
        "Paper",
    )
    for family in families:
        if family.lower() in lowered:
            return family
    if re.search(r"\bHDPE\b", text):
        return "Monolayer HDPE"
    if re.search(r"\bPP\b", text, re.I):
        return "Polypropylene (PP)"
    return ""
167
+
168
+ # ======================================================================
169
+ # PARSER PRINCIPALE
170
+ # ======================================================================
171
def parse_record(pages: List[str], source_name: str) -> Dict[str,str]:
    """Build one table row from the page texts of a single document.

    Args:
        pages: Text of each page; joined with newlines before parsing.
        source_name: Stored unchanged in the "Source File" column.

    Returns:
        A dict with one entry per output column.  "% Recycled" is always "–",
        "Weight" is "–" when no weight is found, and "General description of
        the packaging" is always empty.
    """
    full = "\n".join(pages or [""])

    title = _first(full, TITLE_RE)
    cls = _first(full, CLASS_RE)
    piece = piece_from(full, cls)
    # Prefer a capacity stated in the title over one found anywhere else.
    capacity = capacity_from(title) or capacity_from(full)

    record: Dict[str, str] = {}
    record["Piece"] = piece or ""
    record["SKU"] = _first(full, SKU_RE) or ""
    record["Title"] = title or ""
    record["Capacity"] = capacity or ""
    record["% Recycled"] = "–"
    record["Weight"] = weight_from(full) or "–"
    record["Color"] = color_from(full) or ""
    record["Material / Resin"] = material_from(full) or ""
    record["Class"] = cls or ""
    record["Source File"] = source_name
    record["Component"] = component_from(full, piece, cls) or ""
    record["Function"] = function_from(full) or ""
    record["General description of the packaging"] = ""
    record["Material Ref GCAS"] = material_ref_gcas_from(full) or ""
    record["Material Family"] = material_family_from(full) or ""
    return record
193
+
194
+ # ======================================================================
195
+ # STREAMLIT UI
196
+ # ======================================================================
197
# Page chrome.
st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
st.title("📄→📊 PDF → Table (OCR-ready)")
st.caption("Estrae automaticamente i campi, incluso il peso dalle immagini OCR.")

# Sidebar: file selection and OCR settings.
with st.sidebar:
    files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
    st.markdown("---")
    st.subheader("OCR")
    ocr_fallback = st.checkbox("Usa OCR se non c'è testo", value=True)
    ocr_lang = st.text_input("Lingue OCR (comma)", value="eng,ita")
    ocr_dpi = st.number_input("DPI OCR", 200, 600, 300, 50)
    tess_path = st.text_input("Percorso Tesseract (se non nel PATH)", value="")
    run_btn = st.button("▶️ Estrai")

# Nothing to do until the user has pressed the button and supplied files.
if not run_btn:
    st.info("Carica i PDF e premi **Estrai**.")
    st.stop()
if not files:
    st.warning("Nessun PDF caricato.")
    st.stop()

# Comma-separated language codes become "+"-joined, defaulting to "eng"
# when the field is blank.
lang = "+".join([p.strip() for p in ocr_lang.split(",") if p.strip()]) or "eng"
tess_cmd = tess_path.strip() or None
rows, errors = [], []

for up in files:
    try:
        # getvalue() returns the whole upload regardless of the current read
        # position, unlike read().
        raw = up.getvalue()
        pages = extract_text_pages(raw)
        ocr_pages = None
        if ocr_fallback and not any((p or "").strip() for p in pages):
            ocr_pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
            pages = ocr_pages
        rec = parse_record(pages, up.name)
        # If no weight was found in the embedded text, look for it in OCR output.
        # Skip this when the pages already came from OCR: parse_record has
        # searched that exact text, so a second OCR pass with the same input
        # and settings would only repeat the work.
        weight_missing = not rec.get("Weight") or rec["Weight"] == "–"
        if weight_missing and ocr_fallback and ocr_pages is None:
            ocr_pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
            w_ocr = weight_from("\n".join(ocr_pages))
            if w_ocr:
                rec["Weight"] = w_ocr
        rows.append(rec)
    except Exception as e:
        # One bad file must not abort the batch; report it afterwards.
        errors.append((up.name, str(e)))

if errors:
    with st.expander("Errori"):
        for n, e in errors:
            st.error(f"{n}: {e}")

df = pd.DataFrame(rows, columns=SCHEMA)
st.success(f"Creat{ 'e' if len(df)!=1 else 'a' } {len(df)} riga/e.")
st.dataframe(df,use_container_width=True)

# Downloads: CSV on the left, Excel on the right.
c1, c2 = st.columns(2)
with c1:
    st.download_button("⬇️ CSV", df.to_csv(index=False).encode("utf-8"), "table.csv", "text/csv")
with c2:
    bio = io.BytesIO()
    # The writer must be closed before the buffer is read back.
    with pd.ExcelWriter(bio, engine="openpyxl") as xw:
        df.to_excel(xw, index=False, sheet_name="data")
    st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")