#!/usr/bin/env python3
"""
Unicode Normalization Script for Burmese Text
Normalizes UTF-8 text and JSONL files to Unicode NFC form
"""

import argparse
import json
import os
import unicodedata
from pathlib import Path
from typing import Any, Optional


def normalize_unicode(text: str) -> str:
    """
    Normalize Unicode text to NFC form.

    Only canonical composition (NFC) is applied. No Burmese-specific
    rewriting is performed here; this function is the single place to add
    such rules if they are needed later.
    """
    return unicodedata.normalize('NFC', text)


def _default_output_path(input_path: str) -> str:
    """Derive the default output location from an input location.

    Every occurrence of the substring 'raw' in the path is replaced with
    'processed'. If the path does not contain 'raw', the result equals the
    input path, which means the input is rewritten in place.
    """
    return input_path.replace('raw', 'processed')


def _normalize_json_value(value: Any) -> Any:
    """Return a copy of a decoded JSON value with every string normalized.

    Lists and dict values are walked recursively. Dict keys are left
    untouched so that record schemas are never altered.
    """
    if isinstance(value, str):
        return normalize_unicode(value)
    if isinstance(value, list):
        return [_normalize_json_value(item) for item in value]
    if isinstance(value, dict):
        return {key: _normalize_json_value(item) for key, item in value.items()}
    return value


def process_file(input_path: str, output_path: Optional[str] = None):
    """Normalize a single UTF-8 text file.

    Args:
        input_path: File to read.
        output_path: File to write. When omitted it is derived from
            input_path by replacing 'raw' with 'processed'.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    if output_path is None:
        output_path = _default_output_path(input_path)

    print(f"Processing: {input_path}")

    # The input is read completely and closed before the output is opened,
    # so output_path may safely be the same file as input_path.
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    normalized = normalize_unicode(content)

    # Create output directory if needed
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(normalized)

    print(f"Saved to: {output_path}")


def process_jsonl(input_path: str, output_path: Optional[str] = None):
    """Normalize every string value in a JSONL file.

    Blank lines are skipped. Each remaining line is parsed as JSON; string
    values are normalized at any nesting depth, and records that are not
    JSON objects (arrays, bare strings, numbers) are handled as well. The
    records are written back one per line, without a trailing newline after
    the last record.

    Args:
        input_path: JSONL file to read.
        output_path: File to write. When omitted it is derived from
            input_path by replacing 'raw' with 'processed'.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if output_path is None:
        output_path = _default_output_path(input_path)

    print(f"Processing: {input_path}")

    # All records are collected before the output is opened: output_path may
    # be the same file as input_path, and opening it for writing truncates it.
    with open(input_path, 'r', encoding='utf-8') as f:
        output_lines = [
            json.dumps(_normalize_json_value(json.loads(line)), ensure_ascii=False)
            for line in f
            if line.strip()
        ]

    # Create output directory if needed
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))

    print(f"Saved to: {output_path}")


def process_directory(input_dir: str, output_dir: Optional[str] = None):
    """Normalize all supported files below a directory.

    The directory tree is mirrored under output_dir. Files ending in
    '.jsonl' go through process_jsonl; '.txt' and '.json' files go through
    process_file (i.e. '.json' is normalized as raw text). Any other file
    is ignored.

    Args:
        input_dir: Directory to scan recursively.
        output_dir: Root of the mirrored output tree. When omitted it is
            derived from input_dir by replacing 'raw' with 'processed'.
    """
    if output_dir is None:
        output_dir = _default_output_path(input_dir)

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Take a snapshot of the inputs before writing anything. Walking the
    # tree lazily while producing output could pick up freshly written
    # files when output_dir lies inside input_dir.
    source_files = [path for path in input_path.rglob('*') if path.is_file()]

    for file_path in source_files:
        out_file = output_path / file_path.relative_to(input_path)

        if file_path.suffix == '.jsonl':
            process_jsonl(str(file_path), str(out_file))
        elif file_path.suffix in ['.txt', '.json']:
            process_file(str(file_path), str(out_file))


def main():
    """Command-line entry point: normalize a file or a whole directory."""
    parser = argparse.ArgumentParser(description='Normalize Unicode text')
    parser.add_argument('input', help='Input file or directory')
    parser.add_argument('-o', '--output', help='Output file or directory')

    args = parser.parse_args()

    input_path = Path(args.input)

    # Report a missing input as a usage error instead of a raw traceback.
    if not input_path.exists():
        parser.error(f"input path does not exist: {args.input}")

    if input_path.is_dir():
        process_directory(args.input, args.output)
    elif input_path.suffix == '.jsonl':
        process_jsonl(args.input, args.output)
    else:
        process_file(args.input, args.output)


if __name__ == "__main__":
    main()