Spaces:

MedVietAI
/

processing

Sleeping

App Files Files Community

processing / utils /augment.py

LiamKhoaLe

Upd 14/9

80cb919 4 months ago

raw

history blame

3.49 kB

	# augmentation utility agent
	import re
	import random
	from typing import Dict, Tuple
	import ftfy
	import langid

	# Pre-compiled patterns used by deidentify() to mask contact details.
	# E-mail address: local part, "@", domain, and a TLD of two or more letters.
	P_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
	# Phone number: optional country code, optional (possibly parenthesised)
	# area code, then two groups of 3-4 digits. Groups may be separated by a
	# single whitespace character or hyphen.
	P_PHONE = re.compile(r"(?:(?:\+?\d{1,3})?[\s-]?)?(?:\(?\d{2,4}\)?[\s-]?)?\d{3,4}[\s-]?\d{3,4}")
	# URL: either an http(s):// link or a bare "www." host, up to the next
	# whitespace. The "|" must be an unescaped alternation, otherwise the
	# pattern only matches text containing a literal pipe character.
	P_URL = re.compile(r"https?://\S+|www\.\S+")
	# Dotted-quad shape only; octets are not range-checked (999.1.1.1 matches).
	P_IP = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")

	def fix_unicode(s: str) -> str:
	    """Run the text through ftfy.fix_text, treating a falsy input as ""."""
	    text = s if s else ""
	    return ftfy.fix_text(text)

	def normalize_whitespace(s: str) -> str:
	    """Collapse redundant whitespace while keeping paragraph breaks.

	    Non-breaking spaces become plain spaces, runs of spaces/tabs shrink to a
	    single space, whitespace at the end of a line is dropped, and three or
	    more consecutive newlines are reduced to a single blank line.

	    Args:
	        s: Text to normalise.

	    Returns:
	        The normalised text with leading and trailing whitespace stripped.
	    """
	    s = s.replace("\u00A0", " ")
	    s = re.sub(r"[ \t]+", " ", s)
	    # Strip only non-newline whitespace (spaces, tabs, "\r", ...) in front of
	    # a line break. A plain "\s+" would also consume the newlines themselves,
	    # flattening every blank line and leaving the next rule with nothing to do.
	    s = re.sub(r"[^\S\n]+\n", "\n", s)
	    # Allow at most one blank line between paragraphs.
	    s = re.sub(r"\n{3,}", "\n\n", s)
	    return s.strip()

	def canonicalize_quotes(s: str) -> str:
	    """Replace curly double and single quotes with their ASCII equivalents."""
	    substitutions = (
	        ("“", '"'),
	        ("”", '"'),
	        ("’", "'"),
	        ("‘", "'"),
	    )
	    for curly, straight in substitutions:
	        s = s.replace(curly, straight)
	    return s

	def ensure_terminal_punct(s: str) -> str:
	    """Append a period unless the text is empty or already ends in ".", "!" or "?".

	    A falsy input (empty string or None) is returned unchanged.
	    """
	    needs_nothing = (not s) or s.endswith((".", "!", "?"))
	    return s if needs_nothing else s + "."

	def deidentify(s: str) -> str:
	    """Mask e-mail addresses, phone numbers, URLs and IP addresses in s.

	    The patterns are applied in a fixed order (e-mail, phone, URL, IP), each
	    one operating on the output of the previous substitution.
	    """
	    redactions = (
	        (P_EMAIL, "[REDACTED_EMAIL]"),
	        (P_PHONE, "[REDACTED_PHONE]"),
	        (P_URL, "[REDACTED_URL]"),
	        (P_IP, "[REDACTED_IP]"),
	    )
	    for pattern, placeholder in redactions:
	        s = pattern.sub(placeholder, s)
	    return s

	def lang_is_english(s: str) -> bool:
	    """Return True when langid classifies the first 2000 characters as "en".

	    Best effort: if classification raises for any reason the text is
	    treated as English rather than being rejected.
	    """
	    try:
	        sample = (s or "")[:2000]
	        detected, _score = langid.classify(sample)
	        return detected == "en"
	    except Exception:
	        # Deliberate fallback: a failed classification counts as English.
	        return True

	def length_cap(s: str, max_chars: int, min_keep: int = 300) -> str:
	    """Truncate s to about max_chars, preferring to stop at a sentence end.

	    Text no longer than max_chars is returned unchanged. Otherwise the first
	    max_chars characters are taken and, if they contain a ". " boundary at an
	    index greater than min_keep, the text is cut right after that period.
	    A " …" marker is always appended to truncated text, so the result can be
	    up to two characters longer than max_chars.

	    Args:
	        s: Text to cap.
	        max_chars: Maximum number of characters kept from s.
	        min_keep: The sentence boundary is only used when its index is
	            greater than this value, so the cut is not too aggressive.

	    Returns:
	        s itself, or the truncated text followed by " …".
	    """
	    if len(s) <= max_chars:
	        return s
	    cut = s[:max_chars]
	    # try to cut at sentence boundary
	    last_dot = cut.rfind(". ")
	    if last_dot > min_keep:
	        # Keep the period itself and drop the partial sentence after it.
	        return cut[:last_dot + 1] + " …"
	    return cut + " …"

	def fingerprint(instr: str, user: str, out: str) -> str:
	    """Return a fast fingerprint of an (instruction, input, output) triple.

	    Each field is lower-cased and every run of characters outside a-z / 0-9
	    is replaced by a single space, so texts that differ only in case,
	    punctuation or spacing share a fingerprint. The normalised fields are
	    joined with "||" and hashed with MD5; the hash is used for
	    de-duplication only, not for security.

	    Args:
	        instr: Instruction text.
	        user: User/input text.
	        out: Output text.

	    Returns:
	        The 32-character hexadecimal MD5 digest.
	    """
	    import hashlib

	    def norm(x: str) -> str:
	        # After this substitution only single spaces remain as separators,
	        # so trimming the ends is all that is left to do.
	        return re.sub(r"[^a-z0-9]+", " ", x.lower()).strip()

	    # "|" cannot survive norm(), so the separator never collides with content.
	    core = "||".join(norm(part) for part in (instr, user, out))
	    return hashlib.md5(core.encode("utf-8")).hexdigest()

	def style_standardize_answer(ans: str) -> str:
	    """Soften absolute wording, drop a closing sign-off and end with punctuation.

	    Steps, applied to the stripped answer:
	      1. The words "guarantee", "certainly", "always", "never" and the token
	         "100%" are replaced (case-insensitively) by "likely".
	      2. A final line starting with "thanks", "thank you", "regards" or
	         "cheers" is removed.
	      3. Trailing whitespace is trimmed and ensure_terminal_punct() is applied.

	    Args:
	        ans: Answer text; a falsy value is returned unchanged.

	    Returns:
	        The standardised answer.
	    """
	    if not ans:
	        return ans
	    ans = ans.strip()
	    # Avoid absolute guarantees. "100%" is matched separately because a
	    # trailing \b after "%" can never match before a space or end of text.
	    ans = re.sub(
	        r"\b(?:guarantee|certainly|always|never)\b|\b100%",
	        "likely",
	        ans,
	        flags=re.I,
	    )
	    # Remove sign-offs typical of forums: only a last line that begins with
	    # one of the sign-off words (as a whole word) is dropped.
	    ans = re.sub(
	        r"\n(?:thanks|thank you|regards|cheers)\b[^\n]*$",
	        "",
	        ans,
	        flags=re.I,
	    )
	    # Removing the sign-off can leave blank lines behind; trim them so the
	    # terminal punctuation check looks at the last real character.
	    return ensure_terminal_punct(ans.rstrip())

	def base_cleanup(s: str, max_chars: int, do_deid: bool) -> str:
	    """Run the standard cleaning pipeline on one text field.

	    Order: Unicode repair, quote canonicalisation, whitespace normalisation,
	    optional de-identification (when do_deid is true), then length capping
	    to max_chars.
	    """
	    cleaned = normalize_whitespace(canonicalize_quotes(fix_unicode(s)))
	    if do_deid:
	        cleaned = deidentify(cleaned)
	    return length_cap(cleaned, max_chars)

	def maybe_paraphrase(text: str, ratio: float, paraphraser, difficulty: str) -> Tuple[str, bool]:
	    """Paraphrase text with probability ratio.

	    Returns a (text, changed) pair. The input is passed through untouched,
	    with changed=False, when ratio is not positive, when text is empty, or
	    when the random draw does not select it. Otherwise the result of
	    paraphraser.paraphrase(text, difficulty=...) is returned with changed=True.
	    """
	    untouched = (text, False)
	    if ratio <= 0 or not text:
	        return untouched
	    selected = random.random() < ratio
	    if not selected:
	        return untouched
	    rewritten = paraphraser.paraphrase(text, difficulty=difficulty)
	    return rewritten, True

	def maybe_backtranslate(text: str, ratio: float, paraphraser) -> Tuple[str, bool]:
	    """Back-translate text via German with probability ratio.

	    Returns a (text, changed) pair. The original text is kept, with
	    changed=False, when ratio is not positive, when text is empty, when the
	    random draw does not select it, or when the back-translation result is
	    empty/falsy.
	    """
	    if ratio <= 0 or not text:
	        return text, False
	    selected = random.random() < ratio
	    if not selected:
	        return text, False
	    translated = paraphraser.backtranslate(text, via_lang="de")
	    if translated:
	        return translated, True
	    # Empty result: fall back to the original text.
	    return text, False

	def consistency_ok(user: str, out: str, ratio: float, paraphraser) -> bool:
	    """Spot-check a (user, out) pair with probability ratio.

	    The pair is accepted without checking when ratio is not positive, when
	    either text is empty, or when the random draw skips it. Otherwise the
	    verdict of paraphraser.consistency_check(user, out) is returned.
	    """
	    if ratio <= 0 or not (user and out):
	        return True
	    skipped = random.random() >= ratio
	    return True if skipped else paraphraser.consistency_check(user, out)