#!/usr/bin/env python3
"""
Unicode Normalization Script for Burmese Text
Converts various Unicode encodings to standardized form
"""
import os
import unicodedata
import json
from pathlib import Path
def normalize_unicode(text: str) -> str:
    """Return ``text`` converted to Unicode Normalization Form C (NFC).

    NFC composition is the only transformation applied; no additional
    Burmese-specific rewriting is performed here.
    """
    return unicodedata.normalize('NFC', text)
def process_file(input_path: str, output_path: str = None):
    """Normalize one UTF-8 text file and write the result.

    Args:
        input_path: File to read (decoded as UTF-8).
        output_path: Where to write the normalized text. When omitted it
            is derived from ``input_path`` by replacing every occurrence
            of the substring ``'raw'`` with ``'processed'``; if
            ``input_path`` contains no ``'raw'`` the two paths are equal
            and the file is overwritten in place.
    """
    target = (
        output_path
        if output_path is not None
        else input_path.replace('raw', 'processed')
    )

    print(f"Processing: {input_path}")
    result = normalize_unicode(Path(input_path).read_text(encoding='utf-8'))

    # Make sure the destination folder exists before writing into it.
    destination = Path(target)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(result, encoding='utf-8')
    print(f"Saved to: {target}")
def _normalize_json_value(value):
    """Return ``value`` with every string it contains normalized.

    Recurses into lists and into dict values; dict keys and non-string
    scalars are returned unchanged.
    """
    if isinstance(value, str):
        return normalize_unicode(value)
    if isinstance(value, list):
        return [_normalize_json_value(item) for item in value]
    if isinstance(value, dict):
        return {key: _normalize_json_value(item) for key, item in value.items()}
    return value


def process_jsonl(input_path: str, output_path: str = None):
    """Normalize every string value in a JSONL (one JSON value per line) file.

    Blank lines are dropped. Each remaining line is parsed, all string
    values (including those nested in lists or objects) are normalized,
    and the record is re-serialized with non-ASCII characters kept
    verbatim. Every output record is terminated by a newline.

    Args:
        input_path: JSONL file to read (decoded as UTF-8).
        output_path: Where to write the result. When omitted it is derived
            from ``input_path`` by replacing every occurrence of the
            substring ``'raw'`` with ``'processed'``; if ``input_path``
            contains no ``'raw'`` the file is overwritten in place.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if output_path is None:
        output_path = input_path.replace('raw', 'processed')

    print(f"Processing: {input_path}")

    # Buffer all records before opening the output: the output path can be
    # the same file as the input, and opening it for writing truncates it.
    output_lines = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            record = _normalize_json_value(json.loads(line))
            output_lines.append(json.dumps(record, ensure_ascii=False))

    # Create output directory if needed
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        # Terminate every record (including the last) so the file can be
        # appended to or concatenated without merging two records.
        f.writelines(f"{record}\n" for record in output_lines)
    print(f"Saved to: {output_path}")
def process_directory(input_dir: str, output_dir: str = None):
    """Normalize every supported file under a directory tree.

    Walks ``input_dir`` recursively and mirrors its layout under
    ``output_dir``. ``.jsonl`` files are handled by ``process_jsonl``;
    ``.txt`` and ``.json`` files are handled by ``process_file``. Files
    with any other suffix are skipped.

    Args:
        input_dir: Directory to scan.
        output_dir: Root of the mirrored output tree. When omitted it is
            derived from ``input_dir`` by replacing every occurrence of
            the substring ``'raw'`` with ``'processed'``; if ``input_dir``
            contains no ``'raw'`` the files are rewritten in place.
    """
    if output_dir is None:
        output_dir = input_dir.replace('raw', 'processed')

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Detect an output tree that lives strictly inside the input tree, so
    # that generated files are never picked up as inputs again.
    in_root = input_path.resolve()
    out_root = output_path.resolve()
    output_is_nested = in_root in out_root.parents

    # Snapshot the file list before writing anything: iterating rglob()
    # lazily while creating files underneath it could walk fresh outputs.
    candidates = sorted(p for p in input_path.rglob('*') if p.is_file())

    for file_path in candidates:
        if output_is_nested and out_root in file_path.resolve().parents:
            continue

        out_file = output_path / file_path.relative_to(input_path)
        if file_path.suffix == '.jsonl':
            process_jsonl(str(file_path), str(out_file))
        elif file_path.suffix in ('.txt', '.json'):
            process_file(str(file_path), str(out_file))
def main():
    """Command-line entry point: parse arguments and run the matching processor.

    A directory argument is handed to ``process_directory``, a path whose
    suffix is ``.jsonl`` to ``process_jsonl``, and anything else to
    ``process_file``. The optional ``-o/--output`` value is passed through
    unchanged (``None`` when not given).
    """
    import argparse

    cli = argparse.ArgumentParser(description='Normalize Unicode text')
    cli.add_argument('input', help='Input file or directory')
    cli.add_argument('-o', '--output', help='Output file or directory')
    options = cli.parse_args()

    # Pick the handler first, then invoke it once with the raw CLI strings.
    source = Path(options.input)
    if source.is_dir():
        handler = process_directory
    elif source.suffix == '.jsonl':
        handler = process_jsonl
    else:
        handler = process_file
    handler(options.input, options.output)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()