Spaces:
Runtime error
Runtime error
Update pipelines/anonymize.py
Browse files- pipelines/anonymize.py +88 -44
pipelines/anonymize.py
CHANGED
|
@@ -1,63 +1,107 @@
|
|
| 1 |
-
from typing import Tuple, Dict
|
| 2 |
import re
|
| 3 |
-
from
|
| 4 |
from reportlab.pdfgen import canvas
|
| 5 |
-
from
|
|
|
|
|
|
|
|
|
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def render_anonymized_pdf(text: str) -> bytes:
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
c = canvas.Canvas(buf, pagesize=A4)
|
| 48 |
width, height = A4
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
for line in text.splitlines():
|
| 56 |
-
|
|
|
|
| 57 |
c.showPage()
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
c.showPage()
|
| 63 |
c.save()
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
+
from typing import Dict, Tuple, List
|
| 3 |
from reportlab.pdfgen import canvas
|
| 4 |
+
from reportlab.lib.pagesizes import A4
|
| 5 |
+
from reportlab.lib.units import mm
|
| 6 |
+
import io
|
| 7 |
+
|
| 8 |
|
| 9 |
+
# Email addresses: local part, "@", a domain, and an alphabetic TLD of at
# least two letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

# Phone numbers: an optional country code (1-3 digits, optionally prefixed
# with "+"), an optional parenthesised area code, then three digit groups,
# each optionally separated by a single space or hyphen.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")

# Name lines: a line that starts with "氏名" or "Name", followed by an optional
# colon and the name itself (captured as group 1).  The colon class accepts
# both the ASCII colon and the full-width colon (U+FF1A) used in Japanese
# text; the full-width form is spelled as an escape so it cannot be silently
# normalised to a second ASCII colon, which would leave the full-width colon
# inside the captured name.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[:\uFF1A]?\s*(.+)$", re.MULTILINE)
|
| 12 |
|
| 13 |
+
|
| 14 |
+
def _unique(seq: List[str]) -> List[str]:
|
| 15 |
+
s = set()
|
| 16 |
+
out = []
|
| 17 |
+
for x in seq:
|
| 18 |
+
if x not in s:
|
| 19 |
+
s.add(x)
|
| 20 |
+
out.append(x)
|
| 21 |
+
return out
|
| 22 |
|
| 23 |
|
| 24 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask names, email addresses and phone numbers found in *text*.

    Detection:
      - Names: the remainder of every line matched by ``NAME_LINE_RE``
        (e.g. ``氏名: 山田太郎`` or ``Name: Taro Yamada``).
      - Emails / phones: every match of ``EMAIL_RE`` / ``PHONE_RE``.

    Each distinct value is assigned a numbered token (``<EMAIL_1>``,
    ``<PHONE_1>``, ``<NAME_1>``, ...) and every occurrence of the value in the
    text is replaced by that token.

    Returns:
        A ``(anonymized_text, replace_map)`` tuple, where ``replace_map`` maps
        each original value to the token that replaced it.
    """
    replace_map: Dict[str, str] = {}

    # Emails
    for i, email in enumerate(_unique(EMAIL_RE.findall(text)), start=1):
        replace_map[email] = f"<EMAIL_{i}>"

    # Phones
    phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
    for i, phone in enumerate(phones, start=1):
        replace_map[phone] = f"<PHONE_{i}>"

    # Names (line-based): keep the non-empty captures only.
    captured = (m.group(1).strip() for m in NAME_LINE_RE.finditer(text))
    names = _unique([nm for nm in captured if nm])
    for i, name in enumerate(names, start=1):
        # Cap overly long captures at 80 characters.
        replace_map[name[:80]] = f"<NAME_{i}>"

    def _replace_all(source: str, mapping: Dict[str, str]) -> str:
        """Replace every key of *mapping* in *source*, longest key first.

        The text is kept as ``(chunk, is_token)`` pieces so that a token
        inserted for one key is never scanned again for another key.  A plain
        chain of ``str.replace`` calls would let a short key (for example a
        one-letter name) rewrite the inside of an already inserted token.
        """
        pieces = [(source, False)]
        for key in sorted(mapping, key=len, reverse=True):
            token = mapping[key]
            updated = []
            for chunk, is_token in pieces:
                if is_token or key not in chunk:
                    updated.append((chunk, is_token))
                    continue
                for idx, part in enumerate(chunk.split(key)):
                    if idx:
                        # Every boundary between split parts is one occurrence
                        # of the key.
                        updated.append((token, True))
                    if part:
                        updated.append((part, False))
            pieces = updated
        return "".join(chunk for chunk, _ in pieces)

    anonymized = _replace_all(text, replace_map)
    return anonymized, replace_map
|
| 64 |
|
| 65 |
|
| 66 |
def render_anonymized_pdf(text: str) -> bytes:
|
| 67 |
+
"""
|
| 68 |
+
単純にA4 1カラムでテキストを書き出す簡易PDF。
|
| 69 |
+
(元PDFのレイアウト再現は行わないが、“匿名化済み本文”を配布できる)
|
| 70 |
+
"""
|
| 71 |
+
buf = io.BytesIO()
|
| 72 |
c = canvas.Canvas(buf, pagesize=A4)
|
| 73 |
width, height = A4
|
| 74 |
|
| 75 |
+
left = 15 * mm
|
| 76 |
+
top = height - 15 * mm
|
| 77 |
+
line_height = 6 * mm
|
| 78 |
+
x = left
|
| 79 |
+
y = top
|
| 80 |
+
|
| 81 |
+
c.setFont("Helvetica", 10)
|
| 82 |
|
| 83 |
for line in text.splitlines():
|
| 84 |
+
# 改ページ
|
| 85 |
+
if y < 20 * mm:
|
| 86 |
c.showPage()
|
| 87 |
+
c.setFont("Helvetica", 10)
|
| 88 |
+
y = top
|
| 89 |
+
# 長い行を適当に折り返し
|
| 90 |
+
max_chars = 110 # おおよその幅
|
| 91 |
+
if len(line) <= max_chars:
|
| 92 |
+
c.drawString(x, y, line)
|
| 93 |
+
y -= line_height
|
| 94 |
+
else:
|
| 95 |
+
start = 0
|
| 96 |
+
while start < len(line):
|
| 97 |
+
seg = line[start:start + max_chars]
|
| 98 |
+
c.drawString(x, y, seg)
|
| 99 |
+
y -= line_height
|
| 100 |
+
if y < 20 * mm:
|
| 101 |
+
c.showPage()
|
| 102 |
+
c.setFont("Helvetica", 10)
|
| 103 |
+
y = top
|
| 104 |
+
start += max_chars
|
| 105 |
|
| 106 |
c.showPage()
|
| 107 |
c.save()
|