martinofumagalli committed on
Commit
01593d2
·
verified ·
1 Parent(s): 011a311

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -213
app.py CHANGED
@@ -1,219 +1,43 @@
1
- import io, os, re
2
- from typing import List, Dict
3
- import streamlit as st
4
- import pandas as pd
5
-
6
- # --- PDF text
7
- import pdfplumber
8
- from pypdf import PdfReader
9
-
10
- # --- OCR
11
- from pdf2image import convert_from_bytes
12
- import pytesseract
13
- from PIL import Image
14
-
15
- # ======================================================================
16
- # SCHEMA TABELLA
17
- # ======================================================================
18
# Column names of the output table, in display/export order.
SCHEMA = [
    "Piece",
    "SKU",
    "Title",
    "Capacity",
    "% Recycled",
    "Weight",
    "Color",
    "Material / Resin",
    "Class",
    "Source File",
    "Component",
    "Function",
    "General description of the packaging",
    "Material Ref GCAS",
    "Material Family",
]
22
-
23
  # ======================================================================
24
- # LETTURA PDF E OCR
25
- # ======================================================================
26
def extract_text_pages(pdf_bytes: bytes) -> List[str]:
    """Return the embedded text of each page of a PDF, one string per page.

    pdfplumber is tried first; if it raises, pypdf's ``PdfReader`` is tried.
    Pages for which ``extract_text()`` yields a falsy value become ``""``.
    If both libraries raise, an empty list is returned (best-effort: errors
    are deliberately not propagated).
    """

    def _via_pdfplumber() -> List[str]:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            return [page.extract_text() or "" for page in document.pages]

    def _via_pypdf() -> List[str]:
        document = PdfReader(io.BytesIO(pdf_bytes))
        return [(page.extract_text() or "") for page in document.pages]

    # Order matters: pdfplumber is the preferred extractor, pypdf the fallback.
    for extractor in (_via_pdfplumber, _via_pypdf):
        try:
            return extractor()
        except Exception:
            continue
    return []
37
-
38
def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) -> List[str]:
    """OCR every page of a PDF and return the recognized text, one string per page.

    Args:
        pdf_bytes: Raw PDF content, rasterized with ``convert_from_bytes``.
        lang: Language argument passed through to ``pytesseract.image_to_string``.
        dpi: Rasterization resolution passed to ``convert_from_bytes``.
        tesseract_cmd: When truthy, assigned to
            ``pytesseract.pytesseract.tesseract_cmd`` before OCR runs. This is
            a module-level attribute, so the assignment stays in effect for
            subsequent calls.

    Returns:
        One string per rasterized page; a falsy OCR result becomes ``""``.
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    page_images = convert_from_bytes(pdf_bytes, dpi=dpi)
    tesseract_options = "--psm 6 -c preserve_interword_spaces=1"

    page_texts: List[str] = []
    for image in page_images:
        recognized = pytesseract.image_to_string(image, lang=lang, config=tesseract_options)
        page_texts.append(recognized or "")
    return page_texts
44
-
45
- # ======================================================================
46
- # REGEX E PARSER
47
- # ======================================================================
48
- SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
49
- TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
50
- CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.I)
51
-
52
- def _first(text, pat):
53
- m = pat.search(text or ""); return m.group(1).strip() if m else ""
54
-
55
def capacity_from(t):
    """Extract the first capacity mentioned in *t*, formatted as "<number> <unit>".

    The number may use "," or "." as decimal separator (a comma is rewritten
    to a dot). The unit is matched case-insensitively and rendered as "L"
    (for "L"/"Liter") or "ml". Returns ``""`` when *t* is falsy or contains
    no capacity.
    """
    found = re.search(r"([0-9]+(?:[.,][0-9]+)?)\s*(L|Liter|ml|mL)\b", t or "", re.I)
    if found is None:
        return ""
    amount, unit = found.groups()
    amount = amount.replace(',', '.')
    # Upper-case first so every spelling collapses to either "L" or "ml".
    unit = unit.upper().replace('LITER', 'L').replace('ML', 'ml')
    return f"{amount} {unit}"
59
-
60
def color_from(t):
    """Extract a color description from *t*.

    First looks for a "Color" / "Part Color" label (case-insensitive) and
    returns the letters/spaces that follow it. Otherwise looks for an
    upper-case phrase containing one of GREEN, WHITE, BLACK, BLUE, RED or
    CLEAR (this second search is case-sensitive).

    Args:
        t: Text to search; ``None`` is treated as an empty string.

    Returns:
        The stripped color text, or ``""`` when nothing is found.
    """
    # Guard against None: re.search raises TypeError on a non-string.
    t = t or ""
    labelled = re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", t, re.I)
    if labelled:
        return labelled.group(1).strip()
    bare = re.search(r"\b([A-Z ]{4,}(?:GREEN|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b", t)
    return bare.group(1).strip() if bare else ""
65
-
66
def material_from(t):
    """Extract the material/resin description from *t*.

    Returns the first line that contains the standalone word "RESIN"
    (case-insensitive), stripped. If there is none, returns the shortest
    "SERIOPLAST ... RESIN" span found on a single line (case-insensitive).

    Args:
        t: Text to search; ``None`` is treated as an empty string.

    Returns:
        The stripped description, or ``""`` when nothing is found.
    """
    # Guard once so both searches receive a string (the second one used to
    # get the raw argument and raised TypeError on None).
    t = t or ""
    for line in t.splitlines():
        if re.search(r"\bRESIN\b", line, re.I):
            return line.strip()
    m = re.search(r"(SERIOPLAST.*?RESIN)", t, re.I)
    return m.group(1).strip() if m else ""
71
-
72
- # ======================================================================
73
- # WEIGHT PARSER → restituisce solo "94±3g"
74
- # ======================================================================
75
# A weight amount: a number, an optional "<sign> <number>" tolerance, a unit.
# The tolerance is all-or-nothing: a second number is only accepted after a
# tolerance sign, so "12 5 g" is no longer fused into "125g".
_WEIGHT_AMOUNT = (
    r"[0-9]+(?:[.,][0-9]+)?"                          # main value, e.g. 94 or 94,5
    r"(?:\s*(?:±|\+/?-)\s*[0-9]+(?:[.,][0-9]+)?)?"    # optional tolerance: ±3, +/-3, +-3
    r"\s*(?:mg|g|kg)\b"                               # unit
)
# Amount that follows the word "weight" within 80 characters on the same line.
WEIGHT_VALUE_RE = re.compile(r"\bweight\b[^\n\r]{0,80}?(" + _WEIGHT_AMOUNT + r")", re.I)
# Amount anywhere in a line (used for lines that merely mention "weight").
_WEIGHT_AMOUNT_RE = re.compile(r"(" + _WEIGHT_AMOUNT + r")", re.I)


def weight_from(t: str) -> str:
    """Extract a weight such as "94±3g" from *t*.

    The value following the word "weight" is preferred. If none is found,
    the first amount on any line containing "weight" (in any position) is
    used instead.

    Args:
        t: Text to search; a falsy value yields ``""``.

    Returns:
        The amount with all whitespace removed, decimal commas turned into
        dots and "+/-" / "+-" rewritten as "±"; ``""`` when nothing is found.
    """
    if not t:
        return ""
    m = WEIGHT_VALUE_RE.search(t)
    if m is None:
        for line in t.splitlines():
            if "weight" in line.lower():
                m = _WEIGHT_AMOUNT_RE.search(line)
                if m:
                    break
    if m is None:
        return ""
    compact = re.sub(r"\s+", "", m.group(1)).replace(",", ".")
    return compact.replace("+/-", "±").replace("+-", "±")
86
-
87
- # ======================================================================
88
- # ALTRE FUNZIONI (uguali a prima)
89
- # ======================================================================
90
- _ALLOWED_PIECES={"ribbon":"ribbon","bottle":"bottle","film bundle":"film bundle","container":"container",
91
- "label - adhesive":"LABEL - ADHESIVE","label adhesive":"LABEL - ADHESIVE","label-adhesive":"LABEL - ADHESIVE",
92
- "label - back":"LABEL - BACK","back label":"LABEL - BACK","label back":"LABEL - BACK","closure":"CLOSURE"}
93
- _PACK_COMP_TYPE_RE=re.compile(r"Packaging\s+Component\s+Type\s*[:\-]?\s*([^\n\r]+)",re.I)
94
-
95
- def _normalize_piece(s):
96
- s2=re.sub(r"\s+"," ",(s or "").strip().lower())
97
- for k,v in _ALLOWED_PIECES.items():
98
- if k in s2: return v
99
  return ""
100
-
101
def piece_from(t, cls):
    """Determine the piece name from the document text, falling back to its class.

    If *t* contains a "Packaging Component Type" entry that
    ``_normalize_piece`` recognizes, that canonical name is returned.
    Otherwise *cls* is checked (case-insensitively) for the keywords
    "bottle", "cap", "corrugated" and "label", in that order. Returns ``""``
    when neither source gives a result.
    """
    declared = _PACK_COMP_TYPE_RE.search(t or "")
    if declared:
        piece = _normalize_piece(declared.group(1))
        if piece:
            return piece

    if not cls:
        return ""

    lowered = cls.lower()
    # Checked in order; the first keyword found decides the piece.
    class_keywords = (
        ("bottle", "bottle"),
        ("cap", "CLOSURE"),
        ("corrugated", "container"),
        ("label", "LABEL - BACK"),
    )
    for keyword, piece in class_keywords:
        if keyword in lowered:
            return piece
    return ""
111
-
112
# Packaging level keyword: "Primary", "Secondary", "Secondary or Tertiary", "Tertiary".
FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)


def component_from(t, piece, cls):
    """Derive the component name for a record.

    Args:
        t: Document text; ``None`` is treated as an empty string.
        piece: Piece name already determined for the record (may be empty).
        cls: Class text of the record (may be empty or ``None``).

    Returns:
        "Labels" if the text mentions "label" (case-insensitive); otherwise
        *piece* when truthy; otherwise "Bottle" when *cls* mentions "bottle";
        otherwise ``""``.
    """
    # `t or ""` matches the sibling helpers and avoids AttributeError on None.
    txt = (t or "").lower()
    if "label" in txt:
        return "Labels"
    if piece:
        return piece
    if cls and "bottle" in cls.lower():
        return "Bottle"
    return ""


def function_from(t):
    """Return the first packaging-level keyword in *t*, title-cased, or ``""``."""
    m = FUNCTION_RE.search(t or "")
    return m.group(1).title() if m else ""
123
-
124
def material_ref_gcas_from(t):
    """Collect the standalone 7-9 digit numbers found in *t*.

    Returns the distinct numbers sorted as strings and joined with ", ",
    or ``""`` when *t* is falsy or contains none.
    """
    codes = {hit.group(1) for hit in re.finditer(r"\b(\d{7,9})\b", t or "")}
    return ", ".join(sorted(codes))
126
-
127
def material_family_from(t):
    """Classify the material family mentioned in *t*.

    Full family names are looked up case-insensitively first; if none is
    present, the abbreviations "HDPE" (case-sensitive) and "PP"
    (case-insensitive) are tried as standalone words.

    Args:
        t: Text to search; ``None`` is treated as an empty string.

    Returns:
        One of "Monolayer HDPE", "Polypropylene (PP)",
        "Rigid Paper – Corrugated Case", "Paper", or ``""``.
    """
    t = t or ""
    lowered = t.lower()
    # "Rigid Paper – Corrugated Case" must be tested before "Paper": every text
    # containing the former also contains the latter, so the old order made
    # the more specific family unreachable.
    families = ("Monolayer HDPE", "Polypropylene (PP)", "Rigid Paper – Corrugated Case", "Paper")
    for family in families:
        if family.lower() in lowered:
            return family
    if re.search(r"\bHDPE\b", t):
        return "Monolayer HDPE"
    if re.search(r"\bPP\b", t, re.I):
        return "Polypropylene (PP)"
    return ""
134
-
135
- # ======================================================================
136
- # PARSER PRINCIPALE
137
- # ======================================================================
138
def parse_record(pages: List[str], source_name: str) -> Dict[str,str]:
    """Build one table row from the page texts of a single document.

    Args:
        pages: Text of each page; joined with newlines before parsing. A
            falsy value is treated as a single empty page.
        source_name: Stored unchanged under "Source File".

    Returns:
        A dict with one entry per table column. Fields that could not be
        extracted are ``""``, except "Weight", which falls back to "–".
        "% Recycled" is always "–" and "General description of the
        packaging" is always ``""``.
    """
    text = "\n".join(pages or [""])

    title = _first(text, TITLE_RE)
    cls = _first(text, CLASS_RE)
    piece = piece_from(text, cls)
    # Prefer a capacity stated in the title over one found anywhere in the text.
    capacity = capacity_from(title) or capacity_from(text)

    record = {}
    record["Piece"] = piece or ""
    record["SKU"] = _first(text, SKU_RE) or ""
    record["Title"] = title or ""
    record["Capacity"] = capacity or ""
    record["% Recycled"] = "–"
    record["Weight"] = weight_from(text) or "–"
    record["Color"] = color_from(text) or ""
    record["Material / Resin"] = material_from(text) or ""
    record["Class"] = cls or ""
    record["Source File"] = source_name
    record["Component"] = component_from(text, piece, cls) or ""
    record["Function"] = function_from(text) or ""
    record["General description of the packaging"] = ""
    record["Material Ref GCAS"] = material_ref_gcas_from(text) or ""
    record["Material Family"] = material_family_from(text) or ""
    return record
158
-
159
- # ======================================================================
160
- # STREAMLIT UI
161
- # ======================================================================
162
# Page header.
st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
st.title("📄→📊 PDF → Table (OCR-ready)")
st.caption("Estrae automaticamente i campi, incluso il peso dalle immagini OCR.")

# Sidebar: file upload, OCR settings and the run button.
with st.sidebar:
    files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
    st.markdown("---")
    st.subheader("OCR")
    ocr_fallback = st.checkbox("Usa OCR se non c'è testo", value=True)
    ocr_lang = st.text_input("Lingue OCR (comma)", value="eng,ita")
    ocr_dpi = st.number_input("DPI OCR", 200, 600, 300, 50)
    tess_path = st.text_input("Percorso Tesseract (se non nel PATH)", value="")
    run_btn = st.button("▶️ Estrai")

# Nothing to do until the button is pressed and at least one file is uploaded.
if not run_btn:
    st.info("Carica i PDF e premi **Estrai**.")
    st.stop()
if not files:
    st.warning("Nessun PDF caricato.")
    st.stop()

# The comma-separated language list is joined with "+" before being passed on.
lang = "+".join([p.strip() for p in ocr_lang.split(",") if p.strip()]) or "eng"
tess_cmd = tess_path.strip() or None
rows, errors = [], []

for up in files:
    try:
        raw = up.read()
        pages = extract_text_pages(raw)
        used_ocr = False
        if ocr_fallback and not any((p or "").strip() for p in pages):
            pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
            used_ocr = True
        rec = parse_record(pages, up.name)
        # If the embedded text gave no weight, try the OCR text. Skipped when
        # the pages already came from OCR: a second pass over the same bytes
        # with the same settings would only repeat the work.
        weight_missing = not rec.get("Weight") or rec["Weight"] == "–"
        if weight_missing and ocr_fallback and not used_ocr:
            ocr_pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
            w_ocr = weight_from("\n".join(ocr_pages))
            if w_ocr:
                rec["Weight"] = w_ocr
        rows.append(rec)
    except Exception as e:
        # Per-file boundary: record the failure and continue with the next file.
        errors.append((up.name, str(e)))

if errors:
    with st.expander("Errori"):
        for n, e in errors:
            st.error(f"{n}: {e}")

df = pd.DataFrame(rows, columns=SCHEMA)
st.success(f"Creat{ 'e' if len(df)!=1 else 'a' } {len(df)} riga/e.")
st.dataframe(df, use_container_width=True)

# Downloads: CSV on the left, Excel on the right.
c1, c2 = st.columns(2)
with c1:
    st.download_button("⬇️ CSV", df.to_csv(index=False).encode("utf-8"), "table.csv", "text/csv")
with c2:
    bio = io.BytesIO()
    with pd.ExcelWriter(bio, engine="openpyxl") as xw:
        df.to_excel(xw, index=False, sheet_name="data")
    st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
219
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # ======================================================================
2
+ # WEIGHT PARSER: returns ONLY the value, e.g. "94±3g" (without the "Weight" label)
3
+ # ======================================================================
4
+ WEIGHT_VALUE_RE = re.compile(
5
+ r"""(?ix) # i=ignorecase, x=verbose
6
+ \bweight\b
7
+ [^\n\r]{0,80}? # pochi caratteri sulla stessa riga
8
+ ( # ======= GRUPPO CATTURATO =======
9
+ (?:\d+(?:[.,]\d+)?\s* # valore principale es. 94 o 94,5
10
+ (?:±|\+/?-|\+-)\s* # simbolo tolleranza: ± o +/- o +-
11
+ \d+(?:[.,]\d+)?\s* # tolleranza es. 3 o 3,0
12
+ (?:mg|g|kg) # unità
13
+ )
14
+ |
15
+ (?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)) # fallback: solo "94 g"
16
+ ) # =================================
17
+ """,
18
+ )
19
+
20
+ def _normalize_weight(s: str) -> str:
21
+ # compatta spazi, normalizza simbolo e virgole "94±3g"
22
+ s = (s or "").strip()
23
+ s = s.replace(" ", "")
24
+ s = s.replace("+/-", "±").replace("+-", "±")
25
+ s = s.replace(",", ".")
26
+ return s
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def weight_from(t: str) -> str:
29
+ if not t:
30
+ return ""
31
  m = WEIGHT_VALUE_RE.search(t)
32
+ if m:
33
+ return _normalize_weight(m.group(1))
34
+ # Fallback riga-per-riga (OCR con spezzature strane)
35
+ for line in (t or "").splitlines():
36
  if "weight" in line.lower():
37
+ m2 = re.search(
38
+ r"(?ix)\bweight\b[^\n\r]*?((?:\d+(?:[.,]\d+)?\s*(?:±|\+/?-|\+-)\s*\d+(?:[.,]\d+)?\s*(?:mg|g|kg))|(?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)))",
39
+ line,
40
+ )
41
+ if m2:
42
+ return _normalize_weight(m2.group(1))
 
 
 
 
 
 
 
 
 
 
43
  return ""