| import re
|
| import unicodedata
|
|
|
|
|
|
|
|
|
# Matches any single character that is NOT on the whitelist below; the
# cleaning pipeline replaces every match with a space, so only whitelisted
# characters survive.
_KEEP_PATTERN = re.compile(
    "".join(
        (
            "[^",
            r"\u0E00-\u0E7F",  # Thai Unicode block
            "a-zA-Z",  # ASCII letters
            "0-9",  # ASCII digits
            r"\s",  # any whitespace, newlines included
            r"\.\,\!\?\(\)\-\:\;\"\'\/",  # basic punctuation
            "]",
        )
    )
)
|
|
|
|
|
# Invisible characters that are deleted outright (not replaced by a space):
#   U+200B zero width space        U+200C zero width non-joiner
#   U+200D zero width joiner       U+FEFF zero width no-break space / BOM
#   U+00AD soft hyphen
_ZERO_WIDTH = re.compile(r"[\u200B\u200C\u200D\uFEFF\u00AD]")
|
|
|
|
|
# Translation table (for str.translate) that folds the fullwidth ASCII
# variants U+FF01..U+FF5E onto their halfwidth counterparts U+0021..U+007E.
# The two ranges run in the same order at a constant distance of 0xFEE0, so
# the table is derived from the code points instead of two hand-typed
# strings: literal fullwidth characters are easily folded to ASCII by
# editors or normalizing tools, which leaves the two strings with different
# lengths and makes str.maketrans() raise ValueError at import time.
_FULLWIDTH_MAP = {
    code_point: code_point - 0xFEE0 for code_point in range(0xFF01, 0xFF5F)
}
|
|
|
|
|
# Translation table (for str.translate) mapping each Thai digit to the
# ASCII digit with the same value.
_THAI_DIGITS = {
    ord(thai_digit): ord(ascii_digit)
    for thai_digit, ascii_digit in zip("๐๑๒๓๔๕๖๗๘๙", "0123456789")
}
|
|
|
|
|
# Noise patterns are compiled once at import time: preprocess_thai() runs
# once per corpus line, so the patterns are loop invariants. The {m,n}
# bounds cap how much text a single match is allowed to swallow.
_HTML_TAG = re.compile(r"<[^>]{1,100}>")
# Named character references may carry digits after the first letter
# (&frac12;, &sup2;), so digits are accepted from the second character on.
_HTML_NAMED_ENTITY = re.compile(r"&[a-zA-Z][a-zA-Z0-9]{1,7};")
# Numeric character references exist in decimal (&#3585;) and hexadecimal
# (&#x0E01;) form; both are removed, otherwise "x0E01;" survives as junk.
_HTML_NUMERIC_ENTITY = re.compile(r"&#(?:\d{1,6}|[xX][0-9a-fA-F]{1,6});")
_URL_WITH_SCHEME = re.compile(r"https?://\S{1,500}")
_URL_WWW = re.compile(r"www\.\S{1,500}")
_EMAIL = re.compile(r"\S{1,100}@\S{1,100}\.\S{2,10}")
# U+0E48..U+0E4B are the four Thai tone marks.
_REPEATED_TONE_MARK = re.compile(r"([\u0E48-\u0E4B])\1+")
_SPACE_RUN = re.compile(r"[ \t]+")
_BLANK_LINE_RUN = re.compile(r"\n{3,}")


def preprocess_thai(text: str) -> str:
    """Clean Thai text before SentencePiece training or inference.

    The order of the steps matters -- do not reorder them:

    1. NFC first, because regexes written against code point ranges only
       behave correctly on normalized text.
    2. Zero-width characters before any other noise, because a ZWSP is
       sometimes glued to an HTML entity.
    3. HTML/URL removal before fullwidth folding, because some URLs contain
       fullwidth characters.
    4. Thai-specific fixes after noise removal, because they expect text
       that is already clean.
    5. Whitespace normalization always last.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, or "" when the input is empty or whitespace-only.
    """
    if not text or not text.strip():
        return ""

    # 1. Canonical composition.
    text = unicodedata.normalize("NFC", text)

    # 2. Invisible characters are deleted, not replaced, so the words they
    #    were hiding inside are rejoined.
    text = _ZERO_WIDTH.sub("", text)

    # 3a. HTML tags and character references become a space so that the
    #     neighbouring words do not get glued together.
    text = _HTML_TAG.sub(" ", text)
    text = _HTML_NAMED_ENTITY.sub(" ", text)
    text = _HTML_NUMERIC_ENTITY.sub(" ", text)

    # 3b. URLs and e-mail addresses.
    text = _URL_WITH_SCHEME.sub(" ", text)
    text = _URL_WWW.sub(" ", text)
    text = _EMAIL.sub(" ", text)

    # Fold fullwidth ASCII variants and Thai digits onto plain ASCII.
    text = text.translate(_FULLWIDTH_MAP)
    text = text.translate(_THAI_DIGITS)

    # 4. Thai-specific fixes: two consecutive SARA E (U+0E40) are a common
    #    way of typing SARA AE (U+0E41); a run of the same tone mark is
    #    reduced to a single mark.
    text = text.replace("\u0E40\u0E40", "\u0E41")
    text = _REPEATED_TONE_MARK.sub(r"\1", text)

    # Blank out every character that is not on the whitelist.
    text = _KEEP_PATTERN.sub(" ", text)

    # 5. Collapse runs of spaces/tabs, allow at most one blank line in a
    #    row, and trim both ends.
    text = _SPACE_RUN.sub(" ", text)
    text = _BLANK_LINE_RUN.sub("\n\n", text)
    return text.strip()
|
|
|
|
|
def preprocess_file(input_path: str, output_path: str, min_length: int = 10) -> int:
    """Clean a whole corpus file line by line.

    Every line of ``input_path`` is passed through ``preprocess_thai``; the
    cleaned line is written to ``output_path`` (one per line) only when it
    is at least ``min_length`` characters long. Both files are UTF-8.

    Args:
        input_path: Path of the raw corpus file to read.
        output_path: Path of the cleaned file to write (overwritten).
        min_length: Minimum length of a cleaned line for it to be kept.

    Returns:
        The number of lines written to ``output_path``.
    """
    written = 0
    with open(input_path, encoding="utf-8") as source:
        with open(output_path, "w", encoding="utf-8") as sink:
            for raw_line in source:
                cleaned = preprocess_thai(raw_line)
                # Lines that end up too short after cleaning are dropped.
                if len(cleaned) < min_length:
                    continue
                sink.write(cleaned + "\n")
                written += 1
    return written
|
|
|
if __name__ == "__main__":
    # Smoke test: each case is (input, expected output, description).
    cases = [
        ("สวัสดี\u200Bครับ", "สวัสดีครับ", "ลบ ZWSP"),
        ("<p>ข้อความ</p>", "ข้อความ", "ลบ HTML tags"),
        ("ดู https://example.com ด้วย", "ดู ด้วย", "ลบ URL"),
        # The fullwidth letters are written as escapes so they cannot be
        # silently folded to ASCII, which would make this case vacuous.
        ("\uFF21 \uFF22 \uFF23 ๑๒๓", "A B C 123", "fullwidth + Thai digits"),
        # Two SARA E (U+0E40) in, one SARA AE (U+0E41) out.
        ("\u0E40\u0E40มว", "\u0E41มว", "เเ → แ"),
        ("ไม้้้โท", "ไม้โท", "tone mark ซ้ำ"),
        # Real character references, so the entity-removal step is exercised.
        ("&nbsp;&amp;", "", "HTML entities"),
        ("", "", "empty string"),
    ]

    for text, expected, desc in cases:
        result = preprocess_thai(text)
        status = "✓" if result == expected else "✗"
        print(f"{status} {desc}: {repr(result)}")
        if result != expected:
            print(f" expected: {repr(expected)}")