Spaces:

Doan-NLP
/

PhanLoai_Toxic_Comment

Sleeping

Upload preprocessing.py

8e72307 verified about 2 months ago

1.31 kB

	# preprocessing.py
	import re
	import string
	import unicodedata

	def normalize(text: str) -> str:
	    """Lowercase and clean a raw comment string.

	    Steps, in order:
	      1. Lowercase, then blank out URLs and e-mail addresses.
	      2. Replace ASCII punctuation with spaces. Emoji survive because they
	         are not part of ``string.punctuation``.
	      3. Squeeze runs of three or more identical Latin/Vietnamese letters
	         down to two.
	      4. Collapse whitespace runs and strip both ends.

	    Args:
	        text: Input comment. Non-string values are coerced with ``str()``.

	    Returns:
	        The cleaned, single-spaced string without leading/trailing blanks.
	    """
	    # 1. Lowercase, then drop URLs and e-mail addresses.
	    text = str(text).lower()
	    # The alternation must be a bare "|". An escaped pipe would only match a
	    # literal "|" character, so no URL would ever be removed.
	    text = re.sub(r"http\S+|www\S+", " ", text)
	    text = re.sub(r"\S+@\S+", " ", text)

	    # 2. Remove punctuation but KEEP emoji (only ASCII punctuation is listed).
	    punctuation_pattern = re.compile(f"[{re.escape(string.punctuation)}]")
	    text = punctuation_pattern.sub(" ", text)

	    # 3. Squeeze a letter repeated >= 3 times down to 2:
	    #    "ngonnnnn" -> "ngonn", "vuiiiii" -> "vuii"; "cc" and "đmm" are kept.
	    #    The backreference compares single code points, so accented letters
	    #    presumably need to be precomposed (NFC) to be matched here.
	    text = re.sub(
	        r'([a-zàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ])\1{2,}',
	        r'\1\1',
	        text,
	    )

	    # 4. Collapse redundant whitespace.
	    text = re.sub(r"\s+", " ", text)
	    return text.strip()


	# Main entry point, called by app.py
	def preprocess_text_for_Visobert(text: str) -> str:
	    """Prepare a raw comment for the model.

	    The input is coerced to ``str`` and converted to Unicode NFC form, so
	    that combining-mark and precomposed spellings of the same Vietnamese
	    character become one representation. The result is then passed through
	    ``normalize``.

	    Args:
	        text: Raw input comment.

	    Returns:
	        The value returned by ``normalize`` for the NFC-composed text.
	    """
	    composed = unicodedata.normalize("NFC", str(text))
	    # Per the original author's note, this cleaning step is meant to stay
	    # identical to the one used in the Colab notebook.
	    cleaned = normalize(composed)
	    return cleaned