File size: 2,128 Bytes
d0bc04c
58ca006
85287ae
58ca006
 
 
 
 
 
 
85287ae
58ca006
c1cc164
58ca006
 
c1cc164
58ca006
d0bc04c
85287ae
58ca006
c1cc164
2418fb0
c1cc164
2418fb0
58ca006
 
 
 
 
c1cc164
2418fb0
 
 
 
d0bc04c
 
58ca006
d0bc04c
85287ae
c1cc164
 
58ca006
d0bc04c
58ca006
c1cc164
2418fb0
58ca006
c1cc164
58ca006
 
 
c1cc164
 
58ca006
c1cc164
58ca006
c1cc164
d0bc04c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
from typing import Dict, Tuple, List
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import mm
import io

# E-mail address: local part, "@", dotted domain, and a final label of 2+ letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Phone-like digit run: optional "+"/country code, optional parenthesised area
# code, then three digit groups each optionally separated by a space or hyphen.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# A line that starts with the label "氏名" or "Name", followed by an optional
# colon and the value (captured as group 1). Both the ASCII colon and the
# full-width colon (U+FF1A) are accepted so the separator is never captured as
# part of the name.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[:\uFF1A]?\s*(.+)$", re.MULTILINE)

def _unique(seq: List[str]) -> List[str]:
    """Return the items of ``seq`` without duplicates, in first-seen order."""
    # dict keys are unique and keep insertion order, so building a dict from
    # the items drops repeats while preserving each item's first position.
    return list(dict.fromkeys(seq))

def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mails, phone numbers and labelled names with placeholders.

    Every distinct match of ``EMAIL_RE`` and ``PHONE_RE``, and every distinct
    non-empty value captured by ``NAME_LINE_RE``, is mapped to a numbered
    placeholder (``<EMAIL_n>``, ``<PHONE_n>``, ``<NAME_n>``; numbering starts
    at 1 in order of first appearance). Name keys are truncated to 80
    characters.

    Args:
        text: The input text to anonymize.

    Returns:
        A tuple ``(anonymized_text, replace_map)`` where ``replace_map`` maps
        each original string to the placeholder that replaced it.
    """
    replace_map: Dict[str, str] = {}
    for i, e in enumerate(_unique(EMAIL_RE.findall(text)), start=1):
        replace_map[e] = f"<EMAIL_{i}>"
    for i, p in enumerate(_unique([p.strip() for p in PHONE_RE.findall(text)]), start=1):
        replace_map[p] = f"<PHONE_{i}>"
    names = [m.group(1).strip() for m in NAME_LINE_RE.finditer(text)]
    for i, n in enumerate(_unique([n for n in names if n]), start=1):
        replace_map[n[:80]] = f"<NAME_{i}>"

    # An empty key would match at every position; none is expected, but guard anyway.
    keys = sorted((k for k in replace_map if k), key=len, reverse=True)
    if not keys:
        return text, replace_map

    # Substitute in a single pass so text that has already been replaced is
    # never rescanned: chained str.replace() calls would let a short key
    # (e.g. a name "E") corrupt an earlier placeholder such as "<EMAIL_1>".
    # Longest keys come first in the alternation so that, at any position,
    # a longer key wins over a shorter key that is its prefix.
    pattern = re.compile("|".join(re.escape(k) for k in keys))
    text = pattern.sub(lambda m: replace_map[m.group(0)], text)
    return text, replace_map

def render_anonymized_pdf(text: str) -> bytes:
    """Render ``text`` as a plain A4 PDF and return the PDF file contents.

    Each input line is drawn in 10pt Helvetica starting 15mm from the left
    and top edges, with a 6mm line pitch. Lines longer than 110 characters
    are split into fixed-width 110-character segments (a hard character
    split, not word wrapping). A new page is started whenever the next
    segment would fall below 20mm from the bottom edge.

    Args:
        text: The text to render; it is split with ``str.splitlines()``.

    Returns:
        The generated PDF as bytes.
    """
    # NOTE(review): Helvetica is a Latin font; Japanese text such as the
    # "氏名" label handled elsewhere in this file will presumably not render
    # with it — confirm whether a CJK font should be registered.
    font_name, font_size = "Helvetica", 10
    margin = 15 * mm
    bottom_limit = 20 * mm
    line_h = 6 * mm
    max_chars = 110

    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4
    top = height - margin
    y = top
    c.setFont(font_name, font_size)

    for line in text.splitlines():
        # An empty line still yields one (empty) segment so it takes up a row.
        segments = [line[i:i + max_chars] for i in range(0, len(line), max_chars)] or [""]
        for seg in segments:
            # Break the page *before* drawing rather than after: breaking
            # after the final segment of the text would leave a trailing
            # blank page once the closing showPage() below runs.
            if y < bottom_limit:
                c.showPage()
                # Re-apply the font on the fresh page, as the first page does.
                c.setFont(font_name, font_size)
                y = top
            c.drawString(margin, y, seg)
            y -= line_h

    c.showPage()
    c.save()
    return buf.getvalue()