Corin1998 committed on
Commit
c1cc164
·
verified ·
1 Parent(s): 894c91e

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +13 -44
pipelines/anonymize.py CHANGED
@@ -9,81 +9,50 @@ EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
9
# Loose phone-number matcher: an optional "+"/1-3 digit prefix, an optional
# parenthesised 2-4 digit group, then three digit groups, each separator being
# an optional space or hyphen.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# A line that starts with the label "氏名" or "Name"; group 1 captures the rest
# of the line. The optional separator may be an ASCII colon or a full-width
# colon (U+FF1A) so that the separator never ends up inside the captured name.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
11
 
12
-
13
  def _unique(seq: List[str]) -> List[str]:
14
- s = set()
15
- out = []
16
  for x in seq:
17
  if x not in s:
18
- s.add(x)
19
- out.append(x)
20
  return out
21
 
22
-
23
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mail addresses, phone numbers and labelled names with placeholders.

    E-mails (``EMAIL_RE``), phone numbers (``PHONE_RE``) and the values of
    name lines (``NAME_LINE_RE``) are collected from *text* and each distinct
    value is mapped to a numbered placeholder such as ``<EMAIL_1>``,
    ``<PHONE_1>`` or ``<NAME_1>``.

    Args:
        text: The text to anonymize.

    Returns:
        A tuple ``(anonymized_text, replace_map)`` where ``replace_map`` maps
        every original string that was searched for to its placeholder.
    """
    replace_map: Dict[str, str] = {}

    for i, email in enumerate(_unique(EMAIL_RE.findall(text)), start=1):
        replace_map[email] = f"<EMAIL_{i}>"

    phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
    for i, phone in enumerate(phones, start=1):
        replace_map[phone] = f"<PHONE_{i}>"

    # Names are keyed by at most their first 80 characters. Truncate BEFORE
    # de-duplicating so two names sharing that prefix get one placeholder
    # instead of silently overwriting each other's entry.
    names = []
    for m in NAME_LINE_RE.finditer(text):
        nm = m.group(1).strip()[:80]
        if nm:
            names.append(nm)
    for i, name in enumerate(_unique(names), start=1):
        replace_map[name] = f"<NAME_{i}>"

    if not replace_map:
        # An empty alternation would match the empty string everywhere.
        return text, replace_map

    # Substitute in a single pass so that text already replaced by a
    # placeholder is never rescanned (a short key such as "A" or "1" would
    # otherwise corrupt "<NAME_1>"). Longest keys come first so that, at any
    # position, the longest candidate wins.
    keys = sorted(replace_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(k) for k in keys))
    text = pattern.sub(lambda m: replace_map[m.group(0)], text)

    return text, replace_map
51
 
52
-
53
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* as a plain A4 PDF and return the PDF bytes.

    Each input line is drawn in 10 pt Helvetica starting 15 mm from the left
    and top edges, with 6 mm between rows. Lines longer than 110 characters
    are hard-wrapped into 110-character segments. A new page is started
    whenever the next row would fall below 20 mm from the bottom edge.

    Args:
        text: The text to render; it is split with ``str.splitlines``.

    Returns:
        The bytes of the generated PDF document.
    """
    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4  # only the page height is needed for layout

    left = 15 * mm
    top = height - 15 * mm
    bottom = 20 * mm
    line_h = 6 * mm
    max_chars = 110
    # NOTE(review): Helvetica is a Latin font; confirm that non-ASCII text
    # (e.g. Japanese) renders as intended with it.
    font_name, font_size = "Helvetica", 10

    c.setFont(font_name, font_size)
    y = top
    for line in text.splitlines():
        # An empty line still occupies one (blank) row.
        segments = [
            line[i:i + max_chars] for i in range(0, len(line), max_chars)
        ] or [""]
        for seg in segments:
            # Break the page *before* drawing, never after: a break issued
            # after the very last segment would leave a blank trailing page.
            if y < bottom:
                c.showPage()
                c.setFont(font_name, font_size)  # font is reset on a new page
                y = top
            c.drawString(left, y, seg)
            y -= line_h

    c.showPage()
    c.save()
    return buf.getvalue()
 
9
# Loose phone-number matcher: an optional "+"/1-3 digit prefix, an optional
# parenthesised 2-4 digit group, then three digit groups, each separator being
# an optional space or hyphen.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# A line that starts with the label "氏名" or "Name"; group 1 captures the rest
# of the line. The optional separator may be an ASCII colon or a full-width
# colon (U+FF1A) so that the separator never ends up inside the captured name.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
11
 
 
12
  def _unique(seq: List[str]) -> List[str]:
13
+ s = set(); out = []
 
14
  for x in seq:
15
  if x not in s:
16
+ s.add(x); out.append(x)
 
17
  return out
18
 
 
19
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mail addresses, phone numbers and labelled names with placeholders.

    E-mails (``EMAIL_RE``), phone numbers (``PHONE_RE``) and the values of
    name lines (``NAME_LINE_RE``) are collected from *text* and each distinct
    value is mapped to a numbered placeholder such as ``<EMAIL_1>``,
    ``<PHONE_1>`` or ``<NAME_1>``.

    Args:
        text: The text to anonymize.

    Returns:
        A tuple ``(anonymized_text, replace_map)`` where ``replace_map`` maps
        every original string that was searched for to its placeholder.
    """
    replace_map: Dict[str, str] = {}

    for i, email in enumerate(_unique(EMAIL_RE.findall(text)), start=1):
        replace_map[email] = f"<EMAIL_{i}>"

    phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
    for i, phone in enumerate(phones, start=1):
        replace_map[phone] = f"<PHONE_{i}>"

    # Names are keyed by at most their first 80 characters. Truncate BEFORE
    # de-duplicating so two names sharing that prefix get one placeholder
    # instead of silently overwriting each other's entry.
    names = []
    for m in NAME_LINE_RE.finditer(text):
        nm = m.group(1).strip()[:80]
        if nm:
            names.append(nm)
    for i, name in enumerate(_unique(names), start=1):
        replace_map[name] = f"<NAME_{i}>"

    if not replace_map:
        # An empty alternation would match the empty string everywhere.
        return text, replace_map

    # Substitute in a single pass so that text already replaced by a
    # placeholder is never rescanned (a short key such as "A" or "1" would
    # otherwise corrupt "<NAME_1>"). Longest keys come first so that, at any
    # position, the longest candidate wins.
    keys = sorted(replace_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(k) for k in keys))
    text = pattern.sub(lambda m: replace_map[m.group(0)], text)

    return text, replace_map
35
 
 
36
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* as a plain A4 PDF and return the PDF bytes.

    Each input line is drawn in 10 pt Helvetica starting 15 mm from the left
    and top edges, with 6 mm between rows. Lines longer than 110 characters
    are hard-wrapped into 110-character segments. A new page is started
    whenever the next row would fall below 20 mm from the bottom edge.

    Args:
        text: The text to render; it is split with ``str.splitlines``.

    Returns:
        The bytes of the generated PDF document.
    """
    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4  # only the page height is needed for layout

    left = 15 * mm
    top = height - 15 * mm
    bottom = 20 * mm
    line_h = 6 * mm
    max_chars = 110
    # NOTE(review): Helvetica is a Latin font; confirm that non-ASCII text
    # (e.g. Japanese) renders as intended with it.
    font_name, font_size = "Helvetica", 10

    c.setFont(font_name, font_size)
    y = top
    for line in text.splitlines():
        # An empty line still occupies one (blank) row.
        segments = [
            line[i:i + max_chars] for i in range(0, len(line), max_chars)
        ] or [""]
        for seg in segments:
            # Break the page *before* drawing, never after: a break issued
            # after the very last segment would leave a blank trailing page.
            if y < bottom:
                c.showPage()
                c.setFont(font_name, font_size)  # font is reset on a new page
                y = top
            c.drawString(left, y, seg)
            y -= line_h

    c.showPage()
    c.save()
    return buf.getvalue()