Corin1998 committed on
Commit
bb3db47
·
verified ·
1 Parent(s): e1db6bc

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +67 -25
pipelines/anonymize.py CHANGED
@@ -1,51 +1,93 @@
 
 
1
  from reportlab.pdfgen import canvas
2
  from reportlab.lib.pagesizes import A4
 
3
  from io import BytesIO
4
- import re
5
- from typing import Dict, Tuple
6
 
 
 
 
7
 
8
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask e-mail addresses and phone numbers found in *text*.

    Minimal implementation: each e-mail match is replaced with
    ``[EMAIL_REDACTED]`` and each phone-like match with
    ``[PHONE_REDACTED]``.  Personal names and other identifiers are not
    handled; combine named-entity recognition or dictionaries for real use.

    Returns:
        A ``(masked_text, mapping)`` tuple where ``mapping`` maps every
        original matched string to the placeholder that replaced it.
    """
    # Applied in order: e-mails first, then the (deliberately loose) phone
    # pattern, so digits inside an address are never seen by the phone rule.
    rules = (
        ("EMAIL", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
        ("PHONE", r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}"),
    )
    mapping: Dict[str, str] = {}

    for label, pattern in rules:
        placeholder = f"[{label}_REDACTED]"

        def _mask(match, placeholder=placeholder):
            # Record the first placeholder assigned to a value and reuse it.
            return mapping.setdefault(match.group(0), placeholder)

        text = re.sub(pattern, _mask, text)

    return text, mapping
29
 
30
 
31
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* into a simple A4 PDF and return the PDF bytes.

    Lines are drawn top-down from the upper-left margin, one source line per
    output line, starting a new page whenever the cursor drops below the
    bottom margin.  Lines are crudely truncated to 120 characters; no
    wrapping is performed.
    """
    margin = 40          # same margin is used on every side
    leading = 14         # vertical distance between baselines
    max_chars = 120      # crude cut-off for long lines
    page_top = A4[1] - margin

    stream = BytesIO()
    pdf = canvas.Canvas(stream, pagesize=A4)

    cursor = page_top
    for row in text.splitlines():
        if cursor < margin:
            # Page is full: flush it and restart at the top of a new one.
            pdf.showPage()
            cursor = page_top
        pdf.drawString(margin, cursor, row[:max_chars])
        cursor -= leading

    pdf.save()
    return stream.getvalue()
 
1
+ import re
2
+ from typing import Tuple, Dict
3
  from reportlab.pdfgen import canvas
4
  from reportlab.lib.pagesizes import A4
5
+ from reportlab.pdfbase.pdfmetrics import stringWidth
6
  from io import BytesIO
 
 
7
 
8
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Deliberately loose: optional country code, optional "(area)" group, then
# three digit groups separated by an optional space or hyphen.
PHONE_RE = re.compile(
    r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}"
)
# "氏名" label, an optional ASCII or full-width (U+FF1A) colon, optional
# whitespace, then the first whitespace-delimited token as the name.
# The colon class previously listed the ASCII colon twice, so a full-width
# colon was swallowed into the captured "name" and masked along with it.
NAME_HINT_RE = re.compile(r"(氏名[:\uFF1A]?\s*)(\S+)")


def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask e-mail addresses, phone numbers and a labelled name in *text*.

    Ultra-lightweight anonymisation, applied in this order:

    1. every e-mail address  -> ``[REDACTED_EMAIL]``
    2. every phone-like run  -> ``[REDACTED_PHONE]``
    3. the first token following the first ``氏名`` label (optionally
       followed by ``:`` or a full-width colon) -> ``[REDACTED_NAME]``;
       the label and colon themselves are kept.

    Only the first whitespace-delimited token after the label is treated as
    the name, so a name written with an internal space is masked only up to
    that space.

    Args:
        text: The raw text to anonymise.

    Returns:
        A ``(masked_text, mapping)`` tuple where ``mapping`` maps each
        original string that was masked to the placeholder that replaced it.
    """
    mapping: Dict[str, str] = {}

    def _sub_and_store(pattern: "re.Pattern[str]", repl_key: str, s: str) -> str:
        masked = f"[REDACTED_{repl_key}]"

        def _repl(m: "re.Match[str]") -> str:
            mapping[m.group(0)] = masked
            return masked

        return pattern.sub(_repl, s)

    out = _sub_and_store(EMAIL_RE, "EMAIL", text)
    out = _sub_and_store(PHONE_RE, "PHONE", out)

    # Name hint (e.g. "氏名: 山田太郎"): keep the label, mask only the name.
    def _name_repl(m: "re.Match[str]") -> str:
        masked = "[REDACTED_NAME]"
        mapping[m.group(2)] = masked
        return m.group(1) + masked

    out = NAME_HINT_RE.sub(_name_repl, out, count=1)

    return out, mapping
39
 
40
 
41
def render_anonymized_pdf(text: str) -> bytes:
    """Flow *text* onto A4 pages with ReportLab and return the PDF bytes.

    Each source line is word-wrapped to the printable width (40 pt margins
    on every side, 11 pt font, 14 pt leading).  Tokens that are wider than
    a full line, such as text without spaces or long URLs, are hard-broken
    by character instead of overflowing the right margin.  Blank and
    whitespace-only source lines produce one empty output line.

    Text containing characters outside Latin-1 is drawn with ReportLab's
    built-in Japanese CID font, because Helvetica has no glyphs for it;
    pure Latin-1 text keeps using Helvetica.

    Args:
        text: The (already anonymised) text to render.

    Returns:
        The complete PDF document as bytes.  Empty input yields a single
        blank page.
    """
    font_size = 11
    line_height = 14
    left_margin = 40
    right_margin = 40
    top_margin = 40
    bottom_margin = 40

    width, height = A4
    max_width = width - left_margin - right_margin
    font_name = _pick_font(text)

    buf = BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    y = height - top_margin

    for line in text.splitlines():
        for segment in _wrap_line(line, max_width, font_name, font_size):
            # Break pages lazily, right before something is placed, so a
            # document that exactly fills a page gets no trailing blank page.
            if y < bottom_margin:
                c.showPage()
                y = height - top_margin
            if segment:
                # showPage() resets the graphics state, so set the font
                # before every draw rather than once up front.
                c.setFont(font_name, font_size)
                c.drawString(left_margin, y, segment)
            y -= line_height

    c.showPage()  # flush the last (possibly empty) page
    c.save()
    return buf.getvalue()


def _pick_font(text: str) -> str:
    """Return the name of a registered font able to render *text*."""
    if all(ord(ch) < 0x100 for ch in text):
        return "Helvetica"

    # Function-scope import: only needed for non-Latin text.
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.cidfonts import UnicodeCIDFont

    cjk_font = "HeiseiKakuGo-W5"
    if cjk_font not in pdfmetrics.getRegisteredFontNames():
        pdfmetrics.registerFont(UnicodeCIDFont(cjk_font))
    return cjk_font


def _wrap_line(line: str, max_width: float, font_name: str, font_size: float) -> list:
    """Split one source line into segments no wider than *max_width* points."""
    if not line.strip():
        return [""]  # blank line -> one empty output line

    def fits(s: str) -> bool:
        return stringWidth(s, font_name, font_size) <= max_width

    segments = []
    current = ""
    for word in line.split(" "):
        trial = (current + " " + word).strip()
        if fits(trial):
            current = trial
            continue

        if current:
            segments.append(current)
        current = word

        # A single token wider than the whole line: hard-break it by
        # characters.  Always consume at least one character so the loop
        # terminates even if one glyph alone exceeds max_width.
        while current and not fits(current):
            cut = 1
            while cut < len(current) and fits(current[:cut + 1]):
                cut += 1
            segments.append(current[:cut])
            current = current[cut:]

    if current:
        segments.append(current)
    return segments