Corin1998 committed on
Commit
58ca006
·
verified ·
1 Parent(s): a82717b

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +88 -44
pipelines/anonymize.py CHANGED
@@ -1,63 +1,107 @@
1
- from typing import Tuple, Dict
2
  import re
3
- from reportlab.lib.pagesizes import A4
4
  from reportlab.pdfgen import canvas
5
- from io import BytesIO
 
 
 
6
 
 
 
 
7
 
8
- PII_PATTERNS = {
9
- "EMAIL": re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
10
- "PHONE": re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}"),
11
- "NAME_HINT": re.compile(r"(?:氏名|Name)\s*[::]?\s*([^\n]+)"),
12
- }
 
 
 
 
13
 
14
 
15
  def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
16
- mapping: Dict[str, str] = {}
17
- redacted = text
18
-
19
- # Email
20
- for m in set(PII_PATTERNS["EMAIL"].findall(text)):
21
- token = f"[EMAIL_{abs(hash(m))%10000:04d}]"
22
- mapping[m] = token
23
- redacted = redacted.replace(m, token)
24
-
25
- # Phone
26
- for m in set(PII_PATTERNS["PHONE"].findall(text)):
27
- m_str = m if isinstance(m, str) else "".join(m)
28
- if len(m_str.strip()) < 7:
29
- continue
30
- token = f"[TEL_{abs(hash(m_str))%10000:04d}]"
31
- mapping[m_str] = token
32
- redacted = redacted.replace(m_str, token)
33
-
34
- # Name line (hint-based)
35
- for nm in set(PII_PATTERNS["NAME_HINT"].findall(text)):
36
- nm_clean = nm.strip()
37
- if nm_clean:
38
- token = f"[NAME_{abs(hash(nm_clean))%10000:04d}]"
39
- mapping[nm_clean] = token
40
- redacted = redacted.replace(nm_clean, token)
41
-
42
- return redacted, mapping
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  def render_anonymized_pdf(text: str) -> bytes:
46
- buf = BytesIO()
 
 
 
 
47
  c = canvas.Canvas(buf, pagesize=A4)
48
  width, height = A4
49
 
50
- # 簡易レイアウト:1ページあたり約60行
51
- x_margin = 50
52
- y = height - 50
53
- line_height = 12
 
 
 
54
 
55
  for line in text.splitlines():
56
- if y < 50:
 
57
  c.showPage()
58
- y = height - 50
59
- c.drawString(x_margin, y, line[:120])
60
- y -= line_height
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  c.showPage()
63
  c.save()
 
 
1
  import re
2
+ from typing import Dict, Tuple, List
3
  from reportlab.pdfgen import canvas
4
+ from reportlab.lib.pagesizes import A4
5
+ from reportlab.lib.units import mm
6
+ import io
7
+
8
 
9
+ EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
10
+ PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
11
+ NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
12
 
13
+
14
+ def _unique(seq: List[str]) -> List[str]:
15
+ s = set()
16
+ out = []
17
+ for x in seq:
18
+ if x not in s:
19
+ s.add(x)
20
+ out.append(x)
21
+ return out
22
 
23
 
24
  def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
25
+ """
26
+ 氏名・メール・電話を難読化。置換マップも返す。
27
+ - 氏名: '氏名: 山田太郎' / 'Name: Taro Yamada' などの行を検出
28
+ - メール/電話:正規表現で検出
29
+ """
30
+ replace_map: Dict[str, str] = {}
31
+
32
+ # emails
33
+ emails = _unique(EMAIL_RE.findall(text))
34
+ for i, e in enumerate(emails, start=1):
35
+ token = f"<EMAIL_{i}>"
36
+ replace_map[e] = token
37
+
38
+ # phones
39
+ phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
40
+ for i, p in enumerate(phones, start=1):
41
+ token = f"<PHONE_{i}>"
42
+ replace_map[p] = token
43
+
44
+ # names (line-based)
45
+ names = []
46
+ for m in NAME_LINE_RE.finditer(text):
47
+ nm = m.group(1).strip()
48
+ if nm:
49
+ names.append(nm)
50
+ names = _unique(names)
51
+ for i, n in enumerate(names, start=1):
52
+ # 長すぎる場合を適当にトリム
53
+ n_short = n[:80]
54
+ replace_map[n_short] = f"<NAME_{i}>"
55
+
56
+ # 置換(長い文字列から先に)
57
+ def _repl_all(t: str, mapping: Dict[str, str]) -> str:
58
+ for k in sorted(mapping.keys(), key=len, reverse=True):
59
+ t = t.replace(k, mapping[k])
60
+ return t
61
+
62
+ anonymized = _repl_all(text, replace_map)
63
+ return anonymized, replace_map
64
 
65
 
66
  def render_anonymized_pdf(text: str) -> bytes:
67
+ """
68
+ 単純にA4 1カラムでテキストを書き出す簡易PDF。
69
+ (元PDFのレイアウト再現は行わないが、“匿名化済み本文”を配布できる)
70
+ """
71
+ buf = io.BytesIO()
72
  c = canvas.Canvas(buf, pagesize=A4)
73
  width, height = A4
74
 
75
+ left = 15 * mm
76
+ top = height - 15 * mm
77
+ line_height = 6 * mm
78
+ x = left
79
+ y = top
80
+
81
+ c.setFont("Helvetica", 10)
82
 
83
  for line in text.splitlines():
84
+ # 改ページ
85
+ if y < 20 * mm:
86
  c.showPage()
87
+ c.setFont("Helvetica", 10)
88
+ y = top
89
+ # 長い行を適当に折り返し
90
+ max_chars = 110 # おおよその幅
91
+ if len(line) <= max_chars:
92
+ c.drawString(x, y, line)
93
+ y -= line_height
94
+ else:
95
+ start = 0
96
+ while start < len(line):
97
+ seg = line[start:start + max_chars]
98
+ c.drawString(x, y, seg)
99
+ y -= line_height
100
+ if y < 20 * mm:
101
+ c.showPage()
102
+ c.setFont("Helvetica", 10)
103
+ y = top
104
+ start += max_chars
105
 
106
  c.showPage()
107
  c.save()