Spaces:
Runtime error
Runtime error
Update pipelines/anonymize.py
Browse files- pipelines/anonymize.py +13 -44
pipelines/anonymize.py
CHANGED
|
@@ -9,81 +9,50 @@ EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
|
| 9 |
# Loose phone-number matcher: optional country code, optional parenthesised
# area code, then three digit groups separated by an optional space or hyphen.
# NOTE: any run of 7 or more consecutive digits also matches.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# A line that starts with the label "氏名" (Japanese "full name") or "Name",
# followed by an optional ASCII or full-width (U+FF1A) colon; group 1 is the
# rest of the line. The full-width colon is written as an escape so that it
# survives tools that normalise it to an ASCII colon.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[:\uFF1A]?\s*(.+)$", re.MULTILINE)
|
| 11 |
|
| 12 |
-
|
| 13 |
def _unique(seq: List[str]) -> List[str]:
|
| 14 |
-
s = set()
|
| 15 |
-
out = []
|
| 16 |
for x in seq:
|
| 17 |
if x not in s:
|
| 18 |
-
s.add(x)
|
| 19 |
-
out.append(x)
|
| 20 |
return out
|
| 21 |
|
| 22 |
-
|
| 23 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask e-mail addresses, phone numbers and labelled names in *text*.

    Each distinct match gets a numbered placeholder (``<EMAIL_1>``,
    ``<PHONE_1>``, ``<NAME_1>``, ...; numbering starts at 1 per category, in
    order of first appearance) and every occurrence in the text is replaced.

    Returns:
        ``(anonymized_text, replace_map)`` where ``replace_map`` maps each
        original string to its placeholder.
    """
    replace_map: Dict[str, str] = {}

    for i, email in enumerate(_unique(EMAIL_RE.findall(text)), start=1):
        replace_map[email] = f"<EMAIL_{i}>"

    phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
    for i, phone in enumerate(phones, start=1):
        replace_map[phone] = f"<PHONE_{i}>"

    # Name keys are capped at 80 characters (presumably so an overly long
    # line is not used as a key in full -- TODO confirm). De-duplicate
    # after truncating so that repeated names share one placeholder and the
    # numbering has no gaps.
    names = []
    for m in NAME_LINE_RE.finditer(text):
        nm = m.group(1).strip()[:80]
        if nm:
            names.append(nm)
    for i, name in enumerate(_unique(names), start=1):
        replace_map[name] = f"<NAME_{i}>"

    if replace_map:
        # Substitute in a single pass, trying longer keys first. Chained
        # str.replace() calls would let a short key (e.g. a one-letter name)
        # rewrite the inside of a placeholder inserted by an earlier call.
        keys = sorted(replace_map, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(k) for k in keys))
        text = pattern.sub(lambda m: replace_map[m.group(0)], text)

    return text, replace_map
|
| 51 |
|
| 52 |
-
|
| 53 |
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* as a plain A4 PDF and return the PDF file content.

    Text is drawn line by line in 10 pt Helvetica. Lines longer than 110
    characters are hard-wrapped every 110 characters, and a new page is
    started whenever the cursor drops below the bottom margin.
    """
    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4

    left = 15 * mm
    top = height - 15 * mm
    bottom = 20 * mm
    line_h = 6 * mm
    max_chars = 110
    # NOTE(review): Helvetica is a Latin font; Japanese text in the input is
    # unlikely to render correctly with it -- confirm and register a CJK font
    # if that matters.
    font_name, font_size = "Helvetica", 10

    c.setFont(font_name, font_size)
    y = top
    for line in text.splitlines():
        # An empty line still occupies one row, hence the `or [""]`.
        segments = [line[i:i + max_chars] for i in range(0, len(line), max_chars)] or [""]
        for seg in segments:
            # Break the page *before* drawing, so a page is only started when
            # there is something to put on it (no trailing blank page).
            if y < bottom:
                c.showPage()
                c.setFont(font_name, font_size)
                y = top
            c.drawString(left, y, seg)
            y -= line_h

    c.showPage()
    c.save()
    return buf.getvalue()
|
|
|
|
| 9 |
# Loose phone-number matcher: optional country code, optional parenthesised
# area code, then three digit groups separated by an optional space or hyphen.
# NOTE: any run of 7 or more consecutive digits also matches.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# A line that starts with the label "氏名" (Japanese "full name") or "Name",
# followed by an optional ASCII or full-width (U+FF1A) colon; group 1 is the
# rest of the line. The full-width colon is written as an escape so that it
# survives tools that normalise it to an ASCII colon.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[:\uFF1A]?\s*(.+)$", re.MULTILINE)
|
| 11 |
|
|
|
|
| 12 |
def _unique(seq: List[str]) -> List[str]:
|
| 13 |
+
s = set(); out = []
|
|
|
|
| 14 |
for x in seq:
|
| 15 |
if x not in s:
|
| 16 |
+
s.add(x); out.append(x)
|
|
|
|
| 17 |
return out
|
| 18 |
|
|
|
|
| 19 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask e-mail addresses, phone numbers and labelled names in *text*.

    Each distinct match gets a numbered placeholder (``<EMAIL_1>``,
    ``<PHONE_1>``, ``<NAME_1>``, ...; numbering starts at 1 per category, in
    order of first appearance) and every occurrence in the text is replaced.

    Returns:
        ``(anonymized_text, replace_map)`` where ``replace_map`` maps each
        original string to its placeholder.
    """
    replace_map: Dict[str, str] = {}

    for i, email in enumerate(_unique(EMAIL_RE.findall(text)), start=1):
        replace_map[email] = f"<EMAIL_{i}>"

    phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
    for i, phone in enumerate(phones, start=1):
        replace_map[phone] = f"<PHONE_{i}>"

    # Name keys are capped at 80 characters (presumably so an overly long
    # line is not used as a key in full -- TODO confirm). De-duplicate
    # after truncating: two long names sharing an 80-character prefix would
    # otherwise collide on one key and leave a gap in the numbering.
    names = []
    for m in NAME_LINE_RE.finditer(text):
        nm = m.group(1).strip()[:80]
        if nm:
            names.append(nm)
    for i, name in enumerate(_unique(names), start=1):
        replace_map[name] = f"<NAME_{i}>"

    if replace_map:
        # Substitute in a single pass, trying longer keys first. Chained
        # str.replace() calls would let a short key (e.g. a one-letter name)
        # rewrite the inside of a placeholder inserted by an earlier call.
        keys = sorted(replace_map, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(k) for k in keys))
        text = pattern.sub(lambda m: replace_map[m.group(0)], text)

    return text, replace_map
|
| 35 |
|
|
|
|
| 36 |
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* as a plain A4 PDF and return the PDF file content.

    Text is drawn line by line in 10 pt Helvetica. Lines longer than 110
    characters are hard-wrapped every 110 characters, and a new page is
    started whenever the cursor drops below the bottom margin.
    """
    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4

    left = 15 * mm
    top = height - 15 * mm
    bottom = 20 * mm
    line_h = 6 * mm
    max_chars = 110
    # NOTE(review): Helvetica is a Latin font; Japanese text in the input is
    # unlikely to render correctly with it -- confirm and register a CJK font
    # if that matters.
    font_name, font_size = "Helvetica", 10

    c.setFont(font_name, font_size)
    y = top
    for line in text.splitlines():
        # An empty line still occupies one row, hence the `or [""]`.
        segments = [line[i:i + max_chars] for i in range(0, len(line), max_chars)] or [""]
        for seg in segments:
            # Break the page *before* drawing, so a page is only started when
            # there is something to put on it (no trailing blank page).
            if y < bottom:
                c.showPage()
                c.setFont(font_name, font_size)
                y = top
            c.drawString(left, y, seg)
            y -= line_h

    c.showPage()
    c.save()
    return buf.getvalue()
|