Spaces:

Adisri99
/

PII-Scan

Sleeping

App Files Files Community

PII-Scan / scanner /detectors.py

Adisri99

Upload 9 files

0cb7559 verified about 2 months ago

raw

history blame contribute delete

5.41 kB

	import re
	from typing import Dict, List, Optional

	# Lower-case hint words per PII type. has_keyword_context() and
	# looks_like_zip_code() test these as plain substrings of the lower-cased
	# field name / surrounding context.
	KEYWORDS: Dict[str, List[str]] = {
	    "EMAIL": ["email", "e-mail", "mail", "contact"],
	    "PHONE": ["phone", "mobile", "cell", "tel", "telephone", "contact"],
	    "SSN": ["ssn", "social security", "social-security", "tax id"],
	    "CREDIT_CARD": ["card", "credit", "visa", "mastercard", "amex", "payment"],
	    "IP_ADDRESS": ["ip", "ipv4", "address", "host"],
	    "DOB": ["dob", "birth", "date of birth", "born"],
	    "ZIP_CODE": ["zip", "postal", "postcode", "address"],
	    "PERSON": ["name", "employee", "customer", "patient", "person", "contact"],
	    "LOCATION": ["address", "city", "state", "country", "location", "office"],
	    "ORGANIZATION": ["company", "organization", "org", "employer", "business"],
	}

	# Compiled detectors for the PII types that have a recognisable textual shape.
	# Iteration order is the order regex_findings() scans in.
	REGEX_PATTERNS = {
	    "EMAIL": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,}\b"),
	    # Optional "+1" prefix, optional parentheses around the area code,
	    # '-', '.' or whitespace separators; must not be embedded in a longer digit run.
	    "PHONE": re.compile(r"(?:(?<!\d)(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)\d{3}[-.\s]?\d{4}(?!\d))"),
	    "SSN": re.compile(r"\b\d{3}-?\d{2}-?\d{4}\b"),
	    # 13-19 digits with optional spaces/dashes between them; candidates are
	    # expected to be validated separately (regex_findings applies luhn_check).
	    "CREDIT_CARD": re.compile(r"\b(?:\d[ -]*?){13,19}\b"),
	    # Dotted quad with every octet limited to 0-255. The alternation must use
	    # a bare '|': an escaped '\|' would only match a literal pipe character.
	    "IP_ADDRESS": re.compile(r"\b(?:25[0-5]|2[0-4]\d|1?\d?\d)(?:\.(?:25[0-5]|2[0-4]\d|1?\d?\d)){3}\b"),
	    # Numeric dates (1/2/1990, 01-02-90) or "Month D, YYYY" where the month may
	    # be abbreviated or spelled out ([a-z]* absorbs the rest of the name) and
	    # the space after the comma is optional.
	    "DOB": re.compile(
	        r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}"
	        r"|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*\s+\d{1,2},\s*\d{4})\b",
	        re.IGNORECASE,
	    ),
	    # Five digits with an optional "-NNNN" extension.
	    "ZIP_CODE": re.compile(r"\b\d{5}(?:-\d{4})?\b"),
	}


	def normalize_value(value: str) -> str:
	    """Return *value* trimmed, lower-cased and with whitespace runs collapsed.

	    Every run of whitespace (spaces, tabs, newlines) inside the string is
	    replaced by a single space, so differently formatted copies of the same
	    text compare equal.
	    """
	    trimmed = value.strip().lower()
	    return re.sub(r"\s+", " ", trimmed)



	def redact_value(value: str) -> str:
	    """Mask *value* with asterisks, keeping at most two characters at each end.

	    Values of four characters or fewer are masked completely, since showing
	    two characters on each side would reveal the whole value. Longer values
	    keep their first two and last two characters; everything in between is
	    replaced one-for-one, so the result has the same length as the input.
	    """
	    if len(value) <= 4:
	        return "*" * len(value)
	    # len(value) > 4 here, so at least one character is always masked.
	    hidden = "*" * (len(value) - 4)
	    return f"{value[:2]}{hidden}{value[-2:]}"



	def luhn_check(number: str) -> bool:
	    """Return True when *number* holds 13-19 digits that pass the Luhn checksum.

	    Non-digit characters (spaces, dashes, ...) are removed before checking.
	    Anything with fewer than 13 or more than 19 digits is rejected outright.
	    """
	    digits = re.sub(r"\D", "", number)
	    if not 13 <= len(digits) <= 19:
	        return False
	    total = 0
	    # Walk from the rightmost digit; every second digit is doubled, and a
	    # doubled value above 9 is reduced by 9 (same as summing its two digits).
	    for position, char in enumerate(reversed(digits)):
	        digit = int(char)
	        if position % 2 == 1:
	            digit *= 2
	            if digit > 9:
	                digit -= 9
	        total += digit
	    return total % 10 == 0



	def looks_like_zip_code(value: str, context: str) -> bool:
	    """Return True when *value* is ZIP-shaped and *context* hints at a postal code.

	    *value* must be exactly five digits, optionally followed by "-NNNN".
	    *context* is lower-cased and must contain at least one of the
	    KEYWORDS["ZIP_CODE"] hint words as a substring; a bare five-digit number
	    without such a hint is rejected.
	    """
	    if re.fullmatch(r"\d{5}(?:-\d{4})?", value) is None:
	        return False
	    lowered = context.lower()
	    return any(hint in lowered for hint in KEYWORDS["ZIP_CODE"])



	def has_keyword_context(pii_type: str, context: str, field: Optional[str] = None) -> bool:
	    """Return True when a hint word for *pii_type* occurs in the field name or context.

	    The optional *field* and the *context* are joined, lower-cased and
	    searched for each keyword in KEYWORDS[pii_type]. An unknown *pii_type*
	    has no keywords and therefore yields False.

	    NOTE: this is a plain substring test, so short keywords also hit inside
	    longer words (for example "ip" inside "zip", "tel" inside "hotel").
	    """
	    haystack = f"{field or ''} {context}".lower()
	    return any(keyword in haystack for keyword in KEYWORDS.get(pii_type, []))



	def regex_findings(text: str, context: str, field: Optional[str], line: Optional[int], column: Optional[int]) -> List[Dict]:
	    """Scan *text* with every pattern in REGEX_PATTERNS and return the hits.

	    Args:
	        text: The string to search.
	        context: Surrounding text used for keyword checks.
	        field: Optional field name, also used for keyword checks.
	        line: Copied unchanged into each finding.
	        column: Copied unchanged into each finding.

	    Returns:
	        One dict per accepted match with the keys "pii_type", "matched_value",
	        "confidence", "line", "column", "field", "source" (always "regex"),
	        "start" and "end" (match offsets within *text*).

	    CREDIT_CARD candidates are dropped unless they pass luhn_check(), and
	    ZIP_CODE candidates are dropped unless looks_like_zip_code() finds a
	    postal hint in the context or field name. Confidence is 0.95 when
	    has_keyword_context() finds a hint for the type, otherwise 0.80.
	    """
	    findings: List[Dict] = []
	    for pii_type, pattern in REGEX_PATTERNS.items():
	        for match in pattern.finditer(text):
	            value = match.group(0).strip()
	            if pii_type == "CREDIT_CARD" and not luhn_check(value):
	                continue
	            if pii_type == "ZIP_CODE" and not looks_like_zip_code(value, context + " " + (field or "")):
	                continue
	            confidence = 0.95 if has_keyword_context(pii_type, context, field) else 0.80
	            findings.append(
	                {
	                    "pii_type": pii_type,
	                    "matched_value": value,
	                    "confidence": confidence,
	                    "line": line,
	                    "column": column,
	                    "field": field,
	                    "source": "regex",
	                    "start": match.start(),
	                    "end": match.end(),
	                }
	            )
	    return findings



	def ner_findings(nlp, text: str, context: str, field: Optional[str], line: Optional[int], column: Optional[int]) -> List[Dict]:
	    """Run the NER callable *nlp* over *text* and return person/location/org hits.

	    Args:
	        nlp: Callable that takes the text and returns an object whose ``ents``
	            yield items with ``label_``, ``text``, ``start_char`` and
	            ``end_char`` (presumably a spaCy pipeline - confirm with caller).
	        text: The string to analyse; blank text returns an empty list
	            without calling *nlp*.
	        context: Surrounding text used for keyword checks.
	        field: Optional field name, also used for keyword checks.
	        line: Copied unchanged into each finding.
	        column: Copied unchanged into each finding.

	    Returns:
	        One dict per kept entity with the same keys regex_findings() emits;
	        "source" is always "ner". Confidence is 0.70 when
	        has_keyword_context() finds a hint for the mapped type, else 0.60.
	    """
	    if not text.strip():
	        return []
	    # Entity labels we keep, mapped to this module's PII type names;
	    # any other label is ignored.
	    label_map = {"PERSON": "PERSON", "GPE": "LOCATION", "LOC": "LOCATION", "ORG": "ORGANIZATION"}
	    findings: List[Dict] = []
	    for ent in nlp(text).ents:
	        mapped = label_map.get(ent.label_)
	        if mapped is None:
	            continue
	        value = ent.text.strip()
	        # Single-character (or empty) entities are dropped.
	        if len(value) < 2:
	            continue
	        score = 0.70 if has_keyword_context(mapped, context, field) else 0.60
	        findings.append(
	            {
	                "pii_type": mapped,
	                "matched_value": value,
	                "confidence": score,
	                "line": line,
	                "column": column,
	                "field": field,
	                "source": "ner",
	                "start": ent.start_char,
	                "end": ent.end_char,
	            }
	        )
	    return findings



	def merge_findings(findings: List[Dict]) -> List[Dict]:
	    """Collapse duplicate findings, keeping the first copy of each.

	    Two findings are duplicates when they share the PII type, the
	    normalize_value()-normalised matched value, and the same line, column
	    and field (compared by their string form). The first occurrence is
	    shallow-copied; later duplicates only raise its confidence to the
	    maximum seen. When a duplicate comes from a different source than the
	    kept entry, the entry is marked source "both" with confidence 1.0.

	    The input dicts are not modified. Results keep first-seen order.
	    """
	    # A tuple key (rather than a joined string) cannot collide when a
	    # matched value itself contains the separator character.
	    merged: Dict[tuple, Dict] = {}
	    for item in findings:
	        key = (
	            item.get("pii_type", ""),
	            normalize_value(item.get("matched_value", "")),
	            str(item.get("line")),
	            str(item.get("column")),
	            str(item.get("field")),
	        )
	        existing = merged.get(key)
	        if existing is None:
	            merged[key] = dict(item)
	            continue
	        existing["confidence"] = max(existing["confidence"], item["confidence"])
	        if existing.get("source") != item.get("source"):
	            # Independent detectors agreeing is treated as certain.
	            existing["confidence"] = 1.0
	            existing["source"] = "both"
	    return list(merged.values())