File size: 2,133 Bytes
ebb8326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Common utility functions used across the project."""

import re
import unicodedata


def normalize_text(text: str) -> str:
    """Normalize text: clean unicode, URLs, whitespace, and blank lines.

    Applies, in order:
    - Unicode NFKC normalization
    - Zero-width character removal
    - URL removal (``http://`` / ``https://`` up to the next whitespace)
    - Collapsing runs of spaces/tabs into a single space
    - Stripping leading/trailing whitespace from every line
    - Compressing three or more consecutive newlines into two

    Args:
        text: Raw text to normalize

    Returns:
        Cleaned and normalized text; an empty string for falsy input
    """
    if not text:
        return ""

    # Unicode NFKC normalization (compatibility decomposition + composition)
    text = unicodedata.normalize("NFKC", text)

    # Remove zero-width characters
    text = re.sub(r"[\u200b\u200c\u200d\ufeff]", "", text)

    # Drop URLs *before* collapsing whitespace so that the gap a URL leaves
    # behind ("see <url> now") is squeezed back into a single space.
    text = re.sub(r"http[s]?://\S+", "", text)

    # Collapse horizontal whitespace
    text = re.sub(r"[ \t]+", " ", text)

    # Strip each line; splitlines() also unifies \r\n and other line breaks
    # into plain "\n" once the lines are re-joined.
    text = "\n".join(line.strip() for line in text.splitlines())

    # Compress blank-line runs last: whitespace-only lines and CRLF input only
    # become consecutive "\n" characters after the stripping step above.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


def remove_diacritics(text: str) -> str:
    """Remove Vietnamese diacritics from text.

    Example: 'văn hóa' -> 'van hoa', 'Đường' -> 'duong'

    Args:
        text: Text with Vietnamese diacritics

    Returns:
        Text with diacritics removed, lowercased
    """
    # NFKD splits accented letters into a base letter plus combining marks,
    # which are then filtered out.
    decomposed = unicodedata.normalize("NFKD", text)
    base_only = "".join(c for c in decomposed if not unicodedata.combining(c))

    # The Vietnamese letter d-with-stroke (U+0111, uppercase U+0110) has no
    # Unicode decomposition, so it survives the step above and must be folded
    # to a plain "d" explicitly. Lowercasing first covers both cases.
    return base_only.lower().replace("\u0111", "d")


def _extract_qid_number(qid: str) -> tuple[str, int]:
    """Extract prefix and numeric part from qid for natural sorting.
    
    Args:
        qid: Question ID like "test_0001" or "val_123"
        
    Returns:
        Tuple of (prefix, number) for sorting
    """
    match = re.match(r"^([a-zA-Z_]+)(\d+)$", qid)
    if match:
        return (match.group(1), int(match.group(2)))
    return (qid, 0)


def sort_qids(qids: list[str]) -> list[str]:
    """Return question IDs in natural order (test_0001 < test_0002 < test_0010).

    IDs are ordered by the (prefix, number) key produced by
    ``_extract_qid_number``, so numeric suffixes compare as integers rather
    than as strings. The input list is not modified.

    Args:
        qids: List of question IDs

    Returns:
        Sorted list of question IDs
    """
    ordered = list(qids)
    ordered.sort(key=_extract_qid_number)
    return ordered