# pipelines/anonymize.py
# Hosting-page header captured with this file (not code):
#   "Corin1998's picture" / "Update pipelines/anonymize.py" / "c1cc164 verified"
import re
from typing import Dict, Tuple, List
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import mm
import io
# E-mail address: local part, "@", domain, and a final dot-separated label of
# at least two ASCII letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Phone number: optional country code ("+" optional, 1-3 digits), optional
# parenthesised 2-4 digit prefix, then three digit groups (2-4, 2-4, 3-4 digits)
# each optionally separated by a single space or hyphen.
# The pattern has no word boundaries, so it can also match inside longer digit runs.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# Labelled name line: a line starting with "氏名" or "Name", an optional colon,
# then the captured remainder (group 1). No separator is required after the
# label, and "\s*" may span line breaks.
# NOTE(review): the class "[::]" lists the same colon twice; presumably one of
# them was meant to be the full-width colon — confirm against the original file.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
def _unique(seq: List[str]) -> List[str]:
s = set(); out = []
for x in seq:
if x not in s:
s.add(x); out.append(x)
return out
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mail addresses, phone numbers and labelled names with placeholders.

    Values are detected with the module-level patterns ``EMAIL_RE``,
    ``PHONE_RE`` and ``NAME_LINE_RE``. Within each category, distinct values
    are numbered from 1 in order of first appearance and mapped to
    ``<EMAIL_n>``, ``<PHONE_n>`` or ``<NAME_n>``.

    Args:
        text: The raw text to anonymize.

    Returns:
        A ``(anonymized_text, replace_map)`` tuple. ``replace_map`` maps each
        original string that was searched for to its placeholder.
    """
    replace_map: Dict[str, str] = {}

    for i, email in enumerate(_unique(EMAIL_RE.findall(text)), start=1):
        replace_map[email] = f"<EMAIL_{i}>"

    phones = [p.strip() for p in PHONE_RE.findall(text)]
    for i, phone in enumerate(_unique(phones), start=1):
        replace_map[phone] = f"<PHONE_{i}>"

    names = [m.group(1).strip() for m in NAME_LINE_RE.finditer(text)]
    names = [n for n in names if n]
    for i, name in enumerate(_unique(names), start=1):
        # Only the first 80 characters of a captured name are used as the
        # search key.
        replace_map[name[:80]] = f"<NAME_{i}>"

    if not replace_map:
        # Nothing detected; an empty alternation would match everywhere.
        return text, replace_map

    # Substitute every key in ONE pass. Chained str.replace calls would rescan
    # placeholders inserted by earlier replacements, so a short key such as
    # the name "A" would corrupt "<EMAIL_1>" into "<EM<NAME_1>IL_1>".
    # Longest keys come first so that, at a given position, a key wins over
    # any shorter key it contains.
    keys = sorted(replace_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(k) for k in keys))
    anonymized = pattern.sub(lambda m: replace_map[m.group(0)], text)
    return anonymized, replace_map
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* as a plain A4 PDF and return the PDF file contents.

    Text is drawn in 10 pt Helvetica starting 15 mm from the left and top
    edges, one row every 6 mm. A new page is started once the next row would
    fall below 20 mm from the bottom edge. Lines longer than 110 characters
    are hard-wrapped by character count (not by measured width); empty lines
    still occupy one row.

    Args:
        text: The text to render; it is split into lines with ``str.splitlines``.

    Returns:
        The bytes of the generated PDF document.
    """
    # NOTE(review): Helvetica is a built-in Latin font; non-Latin text (e.g.
    # Japanese) presumably will not render with it — confirm and register a
    # suitable font if such input is expected.
    font_name, font_size = "Helvetica", 10
    max_chars = 110
    left = 15 * mm
    top = A4[1] - 15 * mm
    bottom = 20 * mm
    line_h = 6 * mm

    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    c.setFont(font_name, font_size)
    y = top

    for line in text.splitlines():
        # An empty line yields a single empty segment so it still takes a row.
        segments = [
            line[start:start + max_chars]
            for start in range(0, len(line), max_chars)
        ] or [""]
        for seg in segments:
            # Break the page BEFORE drawing. Breaking after the last wrapped
            # segment of the final line would leave a trailing blank page
            # once the closing showPage() below runs.
            if y < bottom:
                c.showPage()
                c.setFont(font_name, font_size)
                y = top
            c.drawString(left, y, seg)
            y -= line_h

    c.showPage()
    c.save()
    return buf.getvalue()