Corin1998 committed on
Commit
2418fb0
·
verified ·
1 Parent(s): abef298

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +9 -28
pipelines/anonymize.py CHANGED
@@ -5,7 +5,6 @@ from reportlab.lib.pagesizes import A4
5
  from reportlab.lib.units import mm
6
  import io
7
 
8
-
9
  EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
10
  PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
11
  NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
@@ -22,26 +21,19 @@ def _unique(seq: List[str]) -> List[str]:
22
 
23
 
24
  def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
25
- """
26
- 氏名・メール・電話を難読化。置換マップも返す。
27
- - 氏名: '氏名: 山田太郎' / 'Name: Taro Yamada' などの行を検出
28
- - メール/電話:正規表現で検出
29
- """
30
  replace_map: Dict[str, str] = {}
31
 
32
  # emails
33
  emails = _unique(EMAIL_RE.findall(text))
34
  for i, e in enumerate(emails, start=1):
35
- token = f"<EMAIL_{i}>"
36
- replace_map[e] = token
37
 
38
  # phones
39
  phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
40
  for i, p in enumerate(phones, start=1):
41
- token = f"<PHONE_{i}>"
42
- replace_map[p] = token
43
 
44
- # names (line-based)
45
  names = []
46
  for m in NAME_LINE_RE.finditer(text):
47
  nm = m.group(1).strip()
@@ -49,25 +41,16 @@ def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
49
  names.append(nm)
50
  names = _unique(names)
51
  for i, n in enumerate(names, start=1):
52
- # 長すぎる場合を適当にトリム
53
- n_short = n[:80]
54
- replace_map[n_short] = f"<NAME_{i}>"
55
 
56
- # 置換(長い文字列から先に
57
- def _repl_all(t: str, mapping: Dict[str, str]) -> str:
58
- for k in sorted(mapping.keys(), key=len, reverse=True):
59
- t = t.replace(k, mapping[k])
60
- return t
61
 
62
- anonymized = _repl_all(text, replace_map)
63
- return anonymized, replace_map
64
 
65
 
66
  def render_anonymized_pdf(text: str) -> bytes:
67
- """
68
- 単純にA4 1カラムでテキストを書き出す簡易PDF。
69
- (元PDFのレイアウト再現は行わないが、“匿名化済み本文”を配布できる)
70
- """
71
  buf = io.BytesIO()
72
  c = canvas.Canvas(buf, pagesize=A4)
73
  width, height = A4
@@ -81,13 +64,11 @@ def render_anonymized_pdf(text: str) -> bytes:
81
  c.setFont("Helvetica", 10)
82
 
83
  for line in text.splitlines():
84
- # 改ページ
85
  if y < 20 * mm:
86
  c.showPage()
87
  c.setFont("Helvetica", 10)
88
  y = top
89
- # 長い行を適当に折り返し
90
- max_chars = 110 # おおよその幅
91
  if len(line) <= max_chars:
92
  c.drawString(x, y, line)
93
  y -= line_height
 
5
  from reportlab.lib.units import mm
6
  import io
7
 
 
8
  EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
9
  PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
10
  NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
 
21
 
22
 
23
  def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
 
 
 
 
 
24
  replace_map: Dict[str, str] = {}
25
 
26
  # emails
27
  emails = _unique(EMAIL_RE.findall(text))
28
  for i, e in enumerate(emails, start=1):
29
+ replace_map[e] = f"<EMAIL_{i}>"
 
30
 
31
  # phones
32
  phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
33
  for i, p in enumerate(phones, start=1):
34
+ replace_map[p] = f"<PHONE_{i}>"
 
35
 
36
+ # names
37
  names = []
38
  for m in NAME_LINE_RE.finditer(text):
39
  nm = m.group(1).strip()
 
41
  names.append(nm)
42
  names = _unique(names)
43
  for i, n in enumerate(names, start=1):
44
+ replace_map[n[:80]] = f"<NAME_{i}>"
 
 
45
 
46
+ # 置換(長いキーから)
47
+ for k in sorted(replace_map.keys(), key=len, reverse=True):
48
+ text = text.replace(k, replace_map[k])
 
 
49
 
50
+ return text, replace_map
 
51
 
52
 
53
  def render_anonymized_pdf(text: str) -> bytes:
 
 
 
 
54
  buf = io.BytesIO()
55
  c = canvas.Canvas(buf, pagesize=A4)
56
  width, height = A4
 
64
  c.setFont("Helvetica", 10)
65
 
66
  for line in text.splitlines():
 
67
  if y < 20 * mm:
68
  c.showPage()
69
  c.setFont("Helvetica", 10)
70
  y = top
71
+ max_chars = 110
 
72
  if len(line) <= max_chars:
73
  c.drawString(x, y, line)
74
  y -= line_height