| |
"""
Unicode Normalization Script for Burmese Text
Normalizes UTF-8 text, JSON, and JSONL files to Unicode NFC form
"""
|
|
| import os |
| import unicodedata |
| import json |
| from pathlib import Path |
|
|
|
|
def normalize_unicode(text: str) -> str:
    """Return *text* converted to Unicode Normalization Form C (NFC).

    No script-specific handling is performed here; the string is passed
    straight to ``unicodedata.normalize``.
    """
    return unicodedata.normalize('NFC', text)
|
|
|
|
def process_file(input_path: str, output_path: str = None):
    """Read a UTF-8 text file, normalize its contents, and write the result.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the file to write. When omitted, it is built
            by replacing every occurrence of the substring 'raw' in
            ``input_path`` with 'processed'; if ``input_path`` contains no
            'raw', the result equals ``input_path`` and the file is
            overwritten in place.

    Missing parent directories of the output file are created.
    """
    destination = output_path
    if destination is None:
        destination = input_path.replace('raw', 'processed')

    print(f"Processing: {input_path}")

    with open(input_path, 'r', encoding='utf-8') as source:
        result_text = normalize_unicode(source.read())

    # Make sure the target directory exists before opening the file.
    Path(destination).parent.mkdir(parents=True, exist_ok=True)

    with open(destination, 'w', encoding='utf-8') as sink:
        sink.write(result_text)

    print(f"Saved to: {destination}")
|
|
|
|
def _normalize_json_value(value):
    """Return *value* with every string inside it passed through normalize_unicode.

    Lists and dict values are walked recursively. Dict keys and non-string
    scalars (numbers, booleans, None) are returned unchanged.
    """
    if isinstance(value, str):
        return normalize_unicode(value)
    if isinstance(value, list):
        return [_normalize_json_value(item) for item in value]
    if isinstance(value, dict):
        return {key: _normalize_json_value(item) for key, item in value.items()}
    return value


def process_jsonl(input_path: str, output_path: str = None):
    """Normalize the string values of every record in a JSONL file.

    Each non-blank line is parsed as JSON, all string values (including
    those nested inside lists and objects) are normalized, and the record
    is re-serialized with ``ensure_ascii=False`` so non-ASCII text is
    written as-is. Lines holding a non-object JSON value (list, string,
    number, ...) are handled as well instead of raising. Blank lines are
    dropped, and records are joined with a newline with no trailing
    newline after the last one.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        output_path: Path of the file to write. When omitted, it is built
            by replacing every occurrence of the substring 'raw' in
            ``input_path`` with 'processed'.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if output_path is None:
        output_path = input_path.replace('raw', 'processed')

    print(f"Processing: {input_path}")

    # Collect all output before opening the destination: the derived output
    # path can equal the input path, in which case the file is rewritten in
    # place and must be fully read first.
    output_lines = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            record = _normalize_json_value(json.loads(line))
            output_lines.append(json.dumps(record, ensure_ascii=False))

    # Make sure the target directory exists before opening the file.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))

    print(f"Saved to: {output_path}")
|
|
|
|
def process_directory(input_dir: str, output_dir: str = None):
    """Normalize every supported file found under a directory tree.

    The tree below ``input_dir`` is searched recursively and each file is
    written to the same relative location under ``output_dir``. Files with
    the suffix '.jsonl' go through ``process_jsonl``; files with the suffix
    '.txt' or '.json' go through ``process_file``; anything else is
    skipped. Suffix comparison is case-sensitive.

    Args:
        input_dir: Root directory to scan.
        output_dir: Root directory for results. When omitted, it is built
            by replacing every occurrence of the substring 'raw' in
            ``input_dir`` with 'processed'.
    """
    if output_dir is None:
        output_dir = input_dir.replace('raw', 'processed')

    source_root = Path(input_dir)
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    for entry in source_root.rglob('*'):
        if not entry.is_file():
            continue

        # Mirror the file's position relative to the input root.
        destination = target_root / entry.relative_to(source_root)
        extension = entry.suffix

        if extension == '.jsonl':
            process_jsonl(str(entry), str(destination))
        elif extension in ('.txt', '.json'):
            process_file(str(entry), str(destination))
|
|
|
|
def main():
    """Command-line entry point: normalize a file or a directory.

    Takes one positional ``input`` argument and an optional ``-o/--output``.
    A directory input is handled by ``process_directory``, a path with the
    suffix '.jsonl' by ``process_jsonl``, and anything else by
    ``process_file``.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Normalize Unicode text')
    cli.add_argument('input', help='Input file or directory')
    cli.add_argument('-o', '--output', help='Output file or directory')
    options = cli.parse_args()

    source = Path(options.input)

    # Pick the handler first, then invoke it once with the raw CLI strings.
    if source.is_dir():
        handler = process_directory
    elif source.suffix == '.jsonl':
        handler = process_jsonl
    else:
        handler = process_file

    handler(options.input, options.output)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()