File size: 3,407 Bytes
bf54d75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""
Unicode Normalization Script for Burmese Text
Normalizes UTF-8 text, JSON and JSONL files to Unicode NFC form
"""

import os
import unicodedata
import json
from pathlib import Path


def normalize_unicode(text: str) -> str:
    """
    Return ``text`` converted to Unicode Normalization Form C (NFC).

    Only the standard NFC step is applied; no Burmese-specific rewriting
    is implemented here.
    """
    # Burmese-specific fixes for common encoding issues would follow the
    # NFC step; none exist at present, so the NFC result is returned as-is.
    return unicodedata.normalize('NFC', text)


def process_file(input_path: str, output_path: str = None):
    """
    Normalize one UTF-8 text file and write the result.

    When ``output_path`` is omitted it is derived from ``input_path`` by
    replacing every occurrence of the substring 'raw' with 'processed'.
    Missing parent directories of the output file are created.
    """
    destination = output_path
    if destination is None:
        destination = input_path.replace('raw', 'processed')

    print(f"Processing: {input_path}")

    original_text = Path(input_path).read_text(encoding='utf-8')
    result_text = normalize_unicode(original_text)

    target = Path(destination)
    # Make sure the destination folder exists before writing into it.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(result_text, encoding='utf-8')

    print(f"Saved to: {destination}")


def _normalize_json_value(value):
    """Return ``value`` with every string inside it passed through normalize_unicode."""
    if isinstance(value, str):
        return normalize_unicode(value)
    if isinstance(value, list):
        return [_normalize_json_value(item) for item in value]
    if isinstance(value, dict):
        # Keys are kept exactly as parsed; only the values are normalized.
        return {key: _normalize_json_value(item) for key, item in value.items()}
    # Numbers, booleans and null carry no text to normalize.
    return value


def process_jsonl(input_path: str, output_path: str = None):
    """
    Normalize every string value in a JSONL file and write the result.

    Each non-blank line of ``input_path`` is parsed as one JSON document.
    String values are passed through ``normalize_unicode`` wherever they
    occur: as top-level object fields, inside nested objects and arrays,
    or as a bare string document. Object keys, numbers, booleans and
    nulls are left untouched. Blank lines are dropped. Every output
    record is terminated by a newline.

    Args:
        input_path: Path of the UTF-8 encoded JSONL file to read.
        output_path: Destination path. When None it is derived from
            ``input_path`` by replacing every occurrence of the substring
            'raw' with 'processed'; a path that does not contain 'raw'
            is therefore rewritten in place.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input cannot be read or the output cannot be written.
    """
    if output_path is None:
        output_path = input_path.replace('raw', 'processed')

    print(f"Processing: {input_path}")

    # All records are collected before anything is written because the
    # output path may be the same file as the input.
    output_lines = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            record = _normalize_json_value(json.loads(line))
            output_lines.append(json.dumps(record, ensure_ascii=False))

    # Create output directory if needed
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # One newline after every record, including the last, so the file
        # can be appended to or concatenated without fusing records.
        f.writelines(f"{record}\n" for record in output_lines)

    print(f"Saved to: {output_path}")


def process_directory(input_dir: str, output_dir: str = None):
    """
    Normalize every supported file under a directory tree.

    The tree below ``input_dir`` is searched recursively. Files ending in
    '.jsonl' are handled by ``process_jsonl``; files ending in '.txt' or
    '.json' are handled by ``process_file``; all other files are ignored.
    Each output file keeps its path relative to ``input_dir``, placed
    under ``output_dir``.

    Args:
        input_dir: Root directory to scan.
        output_dir: Root directory for results. When None it is derived
            from ``input_dir`` by replacing every occurrence of the
            substring 'raw' with 'processed'; a path that does not
            contain 'raw' is therefore processed in place.

    Raises:
        OSError: If the output directory cannot be created or a file
            cannot be read or written.
    """
    if output_dir is None:
        output_dir = input_dir.replace('raw', 'processed')

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # True only when the output root is strictly inside the input root;
    # an in-place run (same directory) is deliberately not treated as nested.
    input_root = input_path.resolve()
    output_root = output_path.resolve()
    output_is_nested = input_root in output_root.parents

    # Take the file list before writing anything so that files created by
    # this run are never visited; sorting makes the order deterministic.
    candidates = sorted(p for p in input_path.rglob('*') if p.is_file())

    for file_path in candidates:
        # Results of an earlier run that live in a nested output directory
        # must not be processed again into out/out/...
        if output_is_nested and output_root in file_path.resolve().parents:
            continue

        rel_path = file_path.relative_to(input_path)
        out_file = output_path / rel_path

        if file_path.suffix == '.jsonl':
            process_jsonl(str(file_path), str(out_file))
        elif file_path.suffix in ['.txt', '.json']:
            process_file(str(file_path), str(out_file))


def main():
    """
    Command-line entry point.

    Parses the input path and the optional ``-o/--output`` path, then
    dispatches: directories go to ``process_directory``, paths ending in
    '.jsonl' go to ``process_jsonl``, and anything else goes to
    ``process_file``.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Normalize Unicode text')
    cli.add_argument('input', help='Input file or directory')
    cli.add_argument('-o', '--output', help='Output file or directory')
    options = cli.parse_args()

    source = Path(options.input)

    # Pick the handler first, then make a single call with the raw arguments.
    if source.is_dir():
        handler = process_directory
    elif source.suffix == '.jsonl':
        handler = process_jsonl
    else:
        handler = process_file

    handler(options.input, options.output)


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()