Spaces:
Runtime error
Runtime error
Update pipelines/anonymize.py
Browse files
- pipelines/anonymize.py +9 -28
pipelines/anonymize.py
CHANGED
|
@@ -5,7 +5,6 @@ from reportlab.lib.pagesizes import A4
|
|
| 5 |
from reportlab.lib.units import mm
|
| 6 |
import io
|
| 7 |
|
| 8 |
-
|
| 9 |
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
| 10 |
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
|
| 11 |
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
|
|
@@ -22,26 +21,19 @@ def _unique(seq: List[str]) -> List[str]:
|
|
| 22 |
|
| 23 |
|
| 24 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
|
| 25 |
-
"""
|
| 26 |
-
氏名・メール・電話を難読化。置換マップも返す。
|
| 27 |
-
- 氏名: '氏名: 山田太郎' / 'Name: Taro Yamada' などの行を検出
|
| 28 |
-
- メール/電話:正規表現で検出
|
| 29 |
-
"""
|
| 30 |
replace_map: Dict[str, str] = {}
|
| 31 |
|
| 32 |
# emails
|
| 33 |
emails = _unique(EMAIL_RE.findall(text))
|
| 34 |
for i, e in enumerate(emails, start=1):
|
| 35 |
-
|
| 36 |
-
replace_map[e] = token
|
| 37 |
|
| 38 |
# phones
|
| 39 |
phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
|
| 40 |
for i, p in enumerate(phones, start=1):
|
| 41 |
-
|
| 42 |
-
replace_map[p] = token
|
| 43 |
|
| 44 |
-
# names
|
| 45 |
names = []
|
| 46 |
for m in NAME_LINE_RE.finditer(text):
|
| 47 |
nm = m.group(1).strip()
|
|
@@ -49,25 +41,16 @@ def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
|
|
| 49 |
names.append(nm)
|
| 50 |
names = _unique(names)
|
| 51 |
for i, n in enumerate(names, start=1):
|
| 52 |
-
|
| 53 |
-
n_short = n[:80]
|
| 54 |
-
replace_map[n_short] = f"<NAME_{i}>"
|
| 55 |
|
| 56 |
-
# 置換(長い
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
t = t.replace(k, mapping[k])
|
| 60 |
-
return t
|
| 61 |
|
| 62 |
-
|
| 63 |
-
return anonymized, replace_map
|
| 64 |
|
| 65 |
|
| 66 |
def render_anonymized_pdf(text: str) -> bytes:
|
| 67 |
-
"""
|
| 68 |
-
単純にA4 1カラムでテキストを書き出す簡易PDF。
|
| 69 |
-
(元PDFのレイアウト再現は行わないが、“匿名化済み本文”を配布できる)
|
| 70 |
-
"""
|
| 71 |
buf = io.BytesIO()
|
| 72 |
c = canvas.Canvas(buf, pagesize=A4)
|
| 73 |
width, height = A4
|
|
@@ -81,13 +64,11 @@ def render_anonymized_pdf(text: str) -> bytes:
|
|
| 81 |
c.setFont("Helvetica", 10)
|
| 82 |
|
| 83 |
for line in text.splitlines():
|
| 84 |
-
# 改ページ
|
| 85 |
if y < 20 * mm:
|
| 86 |
c.showPage()
|
| 87 |
c.setFont("Helvetica", 10)
|
| 88 |
y = top
|
| 89 |
-
|
| 90 |
-
max_chars = 110 # おおよその幅
|
| 91 |
if len(line) <= max_chars:
|
| 92 |
c.drawString(x, y, line)
|
| 93 |
y -= line_height
|
|
|
|
| 5 |
from reportlab.lib.units import mm
|
| 6 |
import io
|
| 7 |
|
|
|
|
| 8 |
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
| 9 |
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
|
| 10 |
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]?\s*(.+)$", re.MULTILINE)
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
replace_map: Dict[str, str] = {}
|
| 25 |
|
| 26 |
# emails
|
| 27 |
emails = _unique(EMAIL_RE.findall(text))
|
| 28 |
for i, e in enumerate(emails, start=1):
|
| 29 |
+
replace_map[e] = f"<EMAIL_{i}>"
|
|
|
|
| 30 |
|
| 31 |
# phones
|
| 32 |
phones = _unique([p.strip() for p in PHONE_RE.findall(text)])
|
| 33 |
for i, p in enumerate(phones, start=1):
|
| 34 |
+
replace_map[p] = f"<PHONE_{i}>"
|
|
|
|
| 35 |
|
| 36 |
+
# names
|
| 37 |
names = []
|
| 38 |
for m in NAME_LINE_RE.finditer(text):
|
| 39 |
nm = m.group(1).strip()
|
|
|
|
| 41 |
names.append(nm)
|
| 42 |
names = _unique(names)
|
| 43 |
for i, n in enumerate(names, start=1):
|
| 44 |
+
replace_map[n[:80]] = f"<NAME_{i}>"
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
# 置換(長いキーから)
|
| 47 |
+
for k in sorted(replace_map.keys(), key=len, reverse=True):
|
| 48 |
+
text = text.replace(k, replace_map[k])
|
|
|
|
|
|
|
| 49 |
|
| 50 |
+
return text, replace_map
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
def render_anonymized_pdf(text: str) -> bytes:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
buf = io.BytesIO()
|
| 55 |
c = canvas.Canvas(buf, pagesize=A4)
|
| 56 |
width, height = A4
|
|
|
|
| 64 |
c.setFont("Helvetica", 10)
|
| 65 |
|
| 66 |
for line in text.splitlines():
|
|
|
|
| 67 |
if y < 20 * mm:
|
| 68 |
c.showPage()
|
| 69 |
c.setFont("Helvetica", 10)
|
| 70 |
y = top
|
| 71 |
+
max_chars = 110
|
|
|
|
| 72 |
if len(line) <= max_chars:
|
| 73 |
c.drawString(x, y, line)
|
| 74 |
y -= line_height
|