Spaces:

Prakritttt
/

Pii-marking

Running

App Files Files Community

Pii-marking / test_model.py

Prakritttt

Change workflow [delete hf model and use only wangchang]

f0611dd about 1 month ago

raw

history blame contribute delete

14.4 kB

	import json
	import re

	from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

	try:
	from langdetect import DetectorFactory, detect

	DetectorFactory.seed = 0
	HAS_LANGDETECT = True
	except Exception:
	HAS_LANGDETECT = False

	try:
	import spacy

	HAS_SPACY = True
	except Exception:
	HAS_SPACY = False

	MODEL_NAME = "pythainlp/thainer-corpus-v2-base-model"

	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

	hf_ner = pipeline(
	"ner",
	model=model,
	tokenizer=tokenizer,
	aggregation_strategy="simple",
	)

	spacy_en_nlp = None

	CANONICAL_TAGS = {
	"ID_CARD",
	"TIME",
	"EMAIL",
	"LEN",
	"LOCATION",
	"ORGANIZATION",
	"PERSON",
	"PHONE",
	"TEMPERATURE",
	"URL",
	"ZIP",
	"MONEY",
	"LAW",
	"PERCENT",
	}

	ENTITY_PRIORITY = {
	"ID_CARD": 100,
	"EMAIL": 95,
	"URL": 93,
	"PHONE": 92,
	"PERSON": 90,
	"ORGANIZATION": 80,
	"LOCATION": 78,
	"MONEY": 75,
	"ZIP": 72,
	"PERCENT": 70,
	"TIME": 65,
	"TEMPERATURE": 60,
	"LEN": 55,
	"LAW": 50,
	}

	SOURCE_PRIORITY = {
	"regex": 4,
	"wangchanberta_hf": 2,
	"spacy_en": 1,
	}

	THAI_FUNCTION_WORDS = {
	"ที่",
	"จาก",
	"ใน",
	"บน",
	"และ",
	"หรือ",
	"คือ",
	"ได้",
	"กับ",
	"ของ",
	}


	def _normalize_label(entity: dict) -> str:
	"""Return a stable label from model output."""
	return (entity.get("entity_group") or entity.get("entity") or "PII").upper()


	def init_models() -> None:
	"""Lazy-load optional models so the core pipeline can still run without them."""
	global spacy_en_nlp

	if spacy_en_nlp is None and HAS_SPACY:
	try:
	spacy_en_nlp = spacy.load("en_core_web_sm")
	except Exception:
	spacy_en_nlp = None


	def detect_text_profile(text: str) -> dict:
	"""Return language profile with mode: th/en/mixed."""
	thai_chars = len(re.findall(r"[ก-๙]", text))
	en_chars = len(re.findall(r"[A-Za-z]", text))
	total_chars = max(1, thai_chars + en_chars)

	th_ratio = thai_chars / total_chars
	en_ratio = en_chars / total_chars
	mode = "mixed"

	if th_ratio >= 0.65:
	mode = "th"
	elif en_ratio >= 0.65:
	mode = "en"

	if HAS_LANGDETECT:
	try:
	detected = detect(text)
	if detected == "th" and th_ratio >= 0.2:
	mode = "th"
	elif detected == "en" and en_ratio >= 0.2:
	mode = "en"
	elif detected not in {"th", "en"} and th_ratio > 0.2 and en_ratio > 0.2:
	mode = "mixed"
	except Exception:
	pass

	return {
	"th_ratio": th_ratio,
	"en_ratio": en_ratio,
	"mode": mode,
	}


	def _to_tag(label: str) -> str:
	"""Map model labels to requested canonical tags."""
	if "PER" in label or "PERSON" in label or "NAME" in label:
	return "PERSON"
	if "ORG" in label or "COMPANY" in label:
	return "ORGANIZATION"
	if "LOC" in label or "ADDRESS" in label or "GPE" in label:
	return "LOCATION"
	if "EMAIL" in label or "MAIL" in label:
	return "EMAIL"
	if "URL" in label or "WEB" in label:
	return "URL"
	if "ZIP" in label or "POSTAL" in label:
	return "ZIP"
	if "MONEY" in label or "PRICE" in label or "CURRENCY" in label:
	return "MONEY"
	if "LAW" in label or "ACT" in label:
	return "LAW"
	if "PERCENT" in label:
	return "PERCENT"
	if "TEMP" in label:
	return "TEMPERATURE"
	if "LEN" in label or "LENGTH" in label or "AGE" in label:
	return "LEN"
	if "TIME" in label:
	return "TIME"
	if "DATE" in label or "DATA" in label:
	return "DATA"
	if "ID_CARD" in label or ("ID" in label and "CARD" in label):
	return "ID_CARD"
	if "PHONE" in label or "TEL" in label or "MOBILE" in label:
	return "PHONE"
	return "PERSON"


	def normalize_entity(raw: dict, source: str, text: str) -> dict \| None:
	"""Normalize source-specific entity shape into common schema."""
	start, end = _trim_span(text, int(raw["start"]), int(raw["end"]))
	if start >= end:
	return None

	label = _to_tag(raw.get("label", ""))
	if label == "DATA":
	# Business rule: drop DATE/DATA-like entities.
	return None

	value = text[start:end]
	if _is_noise_span(value, label, source):
	return None

	return {
	"start": start,
	"end": end,
	"label": label,
	"source": source,
	}


	def _is_phone_number(text: str, label: str) -> bool:
	"""Skip masking for phone numbers as requested."""
	digits = re.sub(r"\D", "", text)
	has_phone_like_length = 9 <= len(digits) <= 12
	starts_with_zero_or_plus = text.strip().startswith("0") or text.strip().startswith("+")
	phone_label = any(key in label for key in ("PHONE", "TEL", "MOBILE"))
	return phone_label or (has_phone_like_length and starts_with_zero_or_plus)


	def _trim_span(text: str, start: int, end: int) -> tuple:
	"""Trim whitespace around entity span to reduce broken tags."""
	while start < end and text[start].isspace():
	start += 1
	while end > start and text[end - 1].isspace():
	end -= 1
	return start, end


	def _is_noise_span(value: str, tag: str, source: str) -> bool:
	"""Drop low-information spans that often appear from model noise."""
	stripped = value.strip()
	if not stripped:
	return True
	if len(stripped) == 1 and tag not in {"ZIP", "PERCENT", "MONEY", "LEN"}:
	return True
	if source in {"model", "wangchanberta_hf"} and stripped in THAI_FUNCTION_WORDS:
	return True
	if re.fullmatch(r"[\W_]+", stripped):
	return True
	return False


	def _collect_regex_entities(text: str) -> list:
	"""Force-detect patterns that are often missed by NER."""
	patterns = [
	("ID_CARD", r"(?<!\d)(?:\d{13}\|\d-\d{4}-\d{5}-\d{2}-\d)(?!\d)"),
	("EMAIL", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
	("URL", r"https?://[^\s]+\|www\.[^\s]+"),
	("PERCENT", r"\b\d+(?:\.\d+)?%"),
	("MONEY", r"(?:฿\|THB\s?)\d+(?:,\d{3})(?:\.\d+)?\|\b\d+(?:,\d{3})(?:\.\d+)?\s?บาท"),
	("ZIP", r"\b\d{5}\b"),
	("TIME", r"\b\d{1,2}:\d{2}\b"),
	("TEMPERATURE", r"\b\d+(?:\.\d+)?\s?(?:°C\|°F\|องศา)\b"),
	("PHONE", r"(?:\+66\|0)\d(?:[\s-]?\d){7,9}"),
	("PERSON", r"(?:นาย\|นางสาว\|นาง\|ดร\.?\|คุณ)\s?[ก-๙]{2,}(?:\s[ก-๙]{2,}){0,2}"),
	("PERSON", r"\b(?:Mr\|Mrs\|Ms\|Miss\|Dr)\.?\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2}\b"),
	]

	found = []
	for label, pattern in patterns:
	for match in re.finditer(pattern, text, flags=re.IGNORECASE):
	found.append(
	{
	"start": match.start(),
	"end": match.end(),
	"label": label,
	"source": "regex",
	}
	)
	return found


	def collect_model_entities_th(text: str) -> list:
	"""Collect Thai entities from HF model only."""
	found = []

	hf_entities = hf_ner(text)
	for entity in hf_entities:
	found.append(
	{
	"start": entity["start"],
	"end": entity["end"],
	"label": _normalize_label(entity),
	"source": "wangchanberta_hf",
	}
	)

	return found


	def collect_model_entities_en(text: str) -> list:
	"""Collect English entities from spaCy if available."""
	if spacy_en_nlp is None:
	return []

	found = []
	doc = spacy_en_nlp(text)
	for ent in doc.ents:
	found.append(
	{
	"start": ent.start_char,
	"end": ent.end_char,
	"label": ent.label_,
	"source": "spacy_en",
	}
	)
	return found


	def _entity_score(entity: dict) -> tuple:
	return (
	ENTITY_PRIORITY.get(entity["label"], 0),
	SOURCE_PRIORITY.get(entity["source"], 0),
	entity["end"] - entity["start"],
	)


	def _is_overlap(a: dict, b: dict) -> bool:
	return not (a["end"] <= b["start"] or b["end"] <= a["start"])


	def merge_entities_with_priority(entities: list) -> list:
	"""Resolve overlap by label priority, source confidence, then span length."""
	if not entities:
	return []

	entities = sorted(
	entities,
	key=lambda x: (
	x["start"],
	-_entity_score(x)[0],
	-_entity_score(x)[1],
	-_entity_score(x)[2],
	),
	)

	resolved = []
	for item in entities:
	conflict_idx = None
	for idx, kept in enumerate(resolved):
	if _is_overlap(item, kept):
	conflict_idx = idx
	break

	if conflict_idx is None:
	resolved.append(item)
	continue

	kept = resolved[conflict_idx]
	if item["label"] == kept["label"]:
	# Same label: keep longer span.
	if (item["end"] - item["start"]) > (kept["end"] - kept["start"]):
	resolved[conflict_idx] = item
	elif _entity_score(item) > _entity_score(kept):
	# Different labels: use priority score.
	resolved[conflict_idx] = item

	return sorted(resolved, key=lambda x: x["start"])


	def _merge_entities(text: str) -> list:
	"""Collect from all enabled sources, normalize, and merge by priority."""
	init_models()
	profile = detect_text_profile(text)
	mode = profile["mode"]

	raw_entities = []
	raw_entities.extend(_collect_regex_entities(text))

	if mode in {"th", "mixed"}:
	raw_entities.extend(collect_model_entities_th(text))
	# Run spaCy when text is English/mixed or still contains enough English signal.
	if mode in {"en", "mixed"} or profile["en_ratio"] >= 0.12:
	raw_entities.extend(collect_model_entities_en(text))

	normalized = []
	for entity in raw_entities:
	cleaned = normalize_entity(entity, entity["source"], text)
	if cleaned is not None:
	normalized.append(cleaned)

	return merge_entities_with_priority(normalized)


	def mark_pii_context(text: str) -> dict:
	"""Return tagged sentence and long context, excluding phone masking."""
	entities = _merge_entities(text)

	parts = []
	cursor = 0
	tagged_entities = []
	last_was_placeholder = False
	last_placeholder_tag = None

	for entity in entities:
	start, end = entity["start"], entity["end"]
	value = text[start:end]
	label = entity["label"]

	if start < cursor:
	continue

	if _is_noise_span(value, label, entity.get("source", "model")):
	continue

	parts.append(text[cursor:start])
	last_was_placeholder = False
	last_placeholder_tag = None

	if _is_phone_number(value, label):
	parts.append(value)
	action = "not_marked_phone"
	tag_name = "PHONE"
	else:
	tag_name = label if label in CANONICAL_TAGS else _to_tag(label)
	if tag_name == "DATA":
	# Defensive guard: date-like entity should not be masked.
	parts.append(value)
	action = "kept_original"
	tagged_entities.append(
	{
	"text": value,
	"label": label,
	"tag": tag_name,
	"start": start,
	"end": end,
	"action": action,
	"source": entity.get("source", "model"),
	}
	)
	cursor = end
	continue
	placeholder = f"<{tag_name}>"
	if not (last_was_placeholder and last_placeholder_tag == tag_name):
	parts.append(placeholder)
	action = "marked"
	last_was_placeholder = True
	last_placeholder_tag = tag_name

	tagged_entities.append(
	{
	"text": value,
	"label": label,
	"tag": tag_name,
	"start": start,
	"end": end,
	"action": action,
	"source": entity.get("source", "model"),
	}
	)
	cursor = end

	parts.append(text[cursor:])
	tagged_text = re.sub(r"\s+", " ", "".join(parts)).strip()

	context = (
	"PII Context Output\n"
	f"Original: {text}\n\n"
	f"Tagged: {tagged_text}\n\n"
	"Entities:\n"
	f"{json.dumps(tagged_entities, ensure_ascii=False, indent=2)}"
	)

	return {
	"original_text": text,
	"sentence": tagged_text,
	"tagged_text": tagged_text,
	"entities": tagged_entities,
	"context": context,
	}


	if __name__ == "__main__":
	sample_text = (
	"ฉันชื่อ นางสาวมะลิวา บุญสระดี อาศัยอยู่ที่อำเภอนางรอง จังหวัดบุรีรัมย์ "
	"เรียนจบจากมหาวิทยาลัยขอนแก่น ติดต่อได้ที่ 089-123-4567 และอีเมล abc@example.com"
	)
	output = mark_pii_context(sample_text)
	print(output["sentence"])




	# ในไฟล์ test_model.py

	# ใส่ local_files_only=True เพื่อรองรับคำสั่ง Offline ที่คุณจะใช้
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True)
	model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, local_files_only=True)

	ner = pipeline(
	"ner",
	model=model,
	tokenizer=tokenizer,
	aggregation_strategy="simple",
	)