Corin1998 committed on
Commit
85287ae
·
verified ·
1 Parent(s): 5e3c23b

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +44 -35
pipelines/anonymize.py CHANGED
@@ -1,55 +1,64 @@
1
- import io
2
  import re
3
- from reportlab.pdfgen import canvas
4
  from reportlab.lib.pagesizes import A4
5
- from reportlab.lib.units import mm
 
 
 
 
 
 
 
 
6
 
7
- RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
8
- RE_PHONE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
9
- RE_NAME = re.compile(r"(?:(?:氏名[::]?)[\t ]*)(\S+)")
10
 
11
- def anonymize_text(text: str):
12
- anon_map = {}
 
13
 
14
- def repl_email(m):
15
- val = m.group(0)
16
- key = "<EMAIL>"
17
- anon_map.setdefault(key, []).append(val)
18
- return key
19
 
20
- def repl_phone(m):
21
- val = m.group(0)
22
- key = "<PHONE>"
23
- anon_map.setdefault(key, []).append(val)
24
- return key
 
 
 
25
 
26
- def repl_name(m):
27
- val = m.group(1)
28
- key = "<NAME>"
29
- anon_map.setdefault(key, []).append(val)
30
- return m.group(0).replace(val, key)
 
 
31
 
32
- t = RE_EMAIL.sub(repl_email, text)
33
- t = RE_PHONE.sub(repl_phone, t)
34
- t = RE_NAME.sub(repl_name, t)
35
 
36
- return t, anon_map
37
 
38
  def render_anonymized_pdf(text: str) -> bytes:
39
- buf = io.BytesIO()
40
  c = canvas.Canvas(buf, pagesize=A4)
 
41
 
42
- margin = 15 * mm
43
- line_height = 5 * mm
44
- x = margin
45
- y = A4[1] - margin
46
 
47
  for line in text.splitlines():
48
- if y < margin:
49
  c.showPage()
50
- y = A4[1] - margin
51
- c.drawString(x, y, line[:1200])
52
  y -= line_height
53
 
 
54
  c.save()
55
  return buf.getvalue()
 
1
+ from typing import Tuple, Dict
2
  import re
 
3
  from reportlab.lib.pagesizes import A4
4
+ from reportlab.pdfgen import canvas
5
+ from io import BytesIO
6
+
7
+
8
# Compiled detectors for the PII categories handled by anonymize_text().
PII_PATTERNS = {
    "EMAIL": re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
    "PHONE": re.compile(
        r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}"
    ),
    # A "氏名" / "Name" label, an optional ASCII or full-width colon, then the
    # rest of the SAME line. Only horizontal whitespace is skipped so an empty
    # name field cannot swallow the following line, and the captured value may
    # not start with a colon or whitespace.
    "NAME_HINT": re.compile(
        r"(?:氏名|Name)[ \t\u3000]*[:\uFF1A]?[ \t\u3000]*([^\s:\uFF1A][^\n]*)"
    ),
}


def _pii_token(prefix: str, value: str, taken: set) -> str:
    """Return a placeholder such as ``[EMAIL_0042]`` for *value*.

    The number is derived from a SHA-256 digest, so the same value yields the
    same token in every process (the built-in ``hash()`` is salted per process
    for strings). If the four-digit slot is already present in *taken*, the
    next free slot is used so two different values never share a token.
    The chosen token is added to *taken*.
    """
    import hashlib  # function-scope import keeps this block self-contained

    digest = hashlib.sha256(value.encode("utf-8", "surrogatepass")).hexdigest()
    number = int(digest, 16) % 10000
    for _ in range(10000):
        token = f"[{prefix}_{number:04d}]"
        if token not in taken:
            taken.add(token)
            return token
        number = (number + 1) % 10000
    # All 10000 four-digit slots are used; fall back to a wider token.
    token = f"[{prefix}_{digest[:12]}]"
    taken.add(token)
    return token


def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mail addresses, phone numbers and labelled names with tokens.

    Args:
        text: The raw text to anonymize.

    Returns:
        A ``(redacted_text, mapping)`` tuple. ``mapping`` maps every detected
        original value to its placeholder token (``[EMAIL_dddd]``,
        ``[TEL_dddd]`` or ``[NAME_dddd]``). Every occurrence of a detected
        value in ``text`` is replaced, not only the one next to its label.
    """
    mapping: Dict[str, str] = {}
    taken: set = set()

    def register(prefix: str, value: str) -> None:
        # First category to claim a value wins, so the mapping always agrees
        # with the token that actually ends up in the redacted text.
        if value not in mapping:
            mapping[value] = _pii_token(prefix, value, taken)

    for email in PII_PATTERNS["EMAIL"].findall(text):
        register("EMAIL", email)

    for phone in PII_PATTERNS["PHONE"].findall(text):
        # Defensive lower bound in case the pattern is loosened later.
        if len(phone) >= 7:
            register("TEL", phone)

    for name in PII_PATTERNS["NAME_HINT"].findall(text):
        register("NAME", name.strip())

    if not mapping:
        return text, mapping

    # Substitute in ONE pass over the original text, trying longer values
    # first. This keeps a short value from breaking up a longer one that
    # contains it (e.g. an e-mail inside a name line) and guarantees that
    # already-inserted tokens are never rescanned.
    longest_first = sorted(mapping, key=len, reverse=True)
    combined = re.compile("|".join(re.escape(value) for value in longest_first))
    redacted = combined.sub(lambda match: mapping[match.group(0)], text)

    return redacted, mapping
 
 
43
 
 
44
 
45
def _wrap_to_width(line: str, font_name: str, font_size: float, max_width: float) -> list:
    """Split *line* into pieces that each fit within *max_width* points.

    Breaks are made between characters (not words) so that text without
    spaces, such as Japanese, is wrapped as well. An empty line yields a
    single empty piece so blank lines keep their vertical space.
    """
    from reportlab.pdfbase import pdfmetrics

    if not line:
        return [""]

    pieces = []
    current = ""
    for ch in line:
        candidate = current + ch
        too_wide = pdfmetrics.stringWidth(candidate, font_name, font_size) > max_width
        if current and too_wide:
            pieces.append(current)
            current = ch
        else:
            current = candidate
    pieces.append(current)
    return pieces


def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* as a plain, paginated A4 PDF and return the PDF bytes.

    Simple layout: one text line per row with fixed margins and line height.
    Lines wider than the printable area are wrapped onto following rows
    rather than truncated, and a CJK-capable font is used so Japanese text
    is drawn with real glyphs.

    Args:
        text: The (already anonymized) text to render.

    Returns:
        The complete PDF document as bytes.
    """
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.cidfonts import UnicodeCIDFont

    # The canvas default (Helvetica) has no Japanese glyphs; register one of
    # reportlab's built-in Japanese CID fonts instead.
    font_name = "HeiseiKakuGo-W5"
    font_size = 10
    pdfmetrics.registerFont(UnicodeCIDFont(font_name))

    buf = BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    width, height = A4

    margin = 50
    line_height = 12
    max_width = width - 2 * margin

    c.setFont(font_name, font_size)
    y = height - margin

    for line in text.splitlines():
        for piece in _wrap_to_width(line, font_name, font_size, max_width):
            if y < margin:
                c.showPage()
                # Re-select the font on the new page before drawing.
                c.setFont(font_name, font_size)
                y = height - margin
            c.drawString(margin, y, piece)
            y -= line_height

    c.showPage()
    c.save()
    return buf.getvalue()