Corin1998 committed on
Commit
d0bc04c
·
verified ·
1 Parent(s): 819818d

Create anonyize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonyize.py +58 -0
pipelines/anonyize.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import re
3
+ from reportlab.pdfgen import canvas
4
+ from reportlab.lib.pagesizes import A4
5
+ from reportlab.lib.units import mm
6
+
7
# Heuristic patterns for the personal data this module redacts.
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
RE_PHONE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# The Japanese label "氏名" (full name), an optional ASCII or fullwidth
# (U+FF1A) colon, optional spaces/tabs, then the first whitespace-delimited
# token, which is captured as the name.
# NOTE(review): only the first token is captured, so in "氏名: 山田 太郎" the
# given name after the space is left in the text -- confirm this is intended.
RE_NAME = re.compile(r"(?:(?:氏名[:\uFF1A]?)[\t ]*)(\S+)")


def anonymize_text(text: str):
    """Replace e-mail addresses, phone numbers and labelled names with tags.

    Substitutions run in a fixed order -- e-mail, then phone, then name -- so
    later patterns see the placeholders inserted by earlier ones.

    Args:
        text: Raw text to anonymize.

    Returns:
        A ``(anonymized_text, anon_map)`` tuple. ``anon_map`` maps each
        placeholder that was used (``"<EMAIL>"``, ``"<PHONE>"``, ``"<NAME>"``)
        to the list of original values it replaced, in order of replacement.
        Placeholders that never matched are absent from the map.
    """
    anon_map = {}

    def record(key, val):
        # Remember the original value under its placeholder and hand the
        # placeholder back so it can be used directly as the replacement.
        anon_map.setdefault(key, []).append(val)
        return key

    def repl_name(m):
        # Keep the label part of the match verbatim and swap only the captured
        # name. Slicing by the group offset (instead of str.replace on the
        # whole match) avoids corrupting the label when the captured value
        # also occurs inside it, e.g. "氏名:名".
        label = m.group(0)[: m.start(1) - m.start(0)]
        return label + record("<NAME>", m.group(1))

    t = RE_EMAIL.sub(lambda m: record("<EMAIL>", m.group(0)), text)
    t = RE_PHONE.sub(lambda m: record("<PHONE>", m.group(0)), t)
    t = RE_NAME.sub(repl_name, t)

    return t, anon_map
38
+
39
+
40
def render_anonymized_pdf(text: str) -> bytes:
    """Render plain text into an A4 PDF, one input line per output line.

    Lines are drawn top to bottom with a 15 mm margin and 5 mm line pitch; a
    new page is started once the cursor drops below the bottom margin. Lines
    are not wrapped: each is cut to at most 1200 characters and drawn as-is,
    so text wider than the page runs past the right edge.

    The text is drawn with ReportLab's built-in Japanese CID font so that
    Japanese content is rendered with real glyphs; the default Helvetica font
    has no CJK glyphs.

    Args:
        text: The (already anonymized) text to render.

    Returns:
        The bytes of the generated PDF document.
    """
    # Function-scope imports: only this renderer needs the CID font support.
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.cidfonts import UnicodeCIDFont

    font_name = "HeiseiKakuGo-W5"
    font_size = 12  # same size as the canvas default
    pdfmetrics.registerFont(UnicodeCIDFont(font_name))

    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4
    c.setFont(font_name, font_size)

    margin = 15 * mm
    line_height = 5 * mm
    x = margin
    y = height - margin

    for line in text.splitlines():
        if y < margin:
            c.showPage()
            # showPage resets the graphics state, including the font.
            c.setFont(font_name, font_size)
            y = height - margin
        c.drawString(x, y, line[:1200])
        y -= line_height

    c.save()
    return buf.getvalue()