AmkyawDev-LLM-V3 / scripts /convert_to_unicode.py
amkyawdev's picture
Initial upload from AmkyawDev-LLM-V3
bf54d75 verified
#!/usr/bin/env python3
"""
Unicode Normalization Script for Burmese Text
Normalizes UTF-8 text, JSON, and JSONL files to Unicode NFC form
"""
import os
import unicodedata
import json
from pathlib import Path
def normalize_unicode(text: str) -> str:
    """Return *text* converted to Unicode Normalization Form C (NFC).

    NFC applies canonical decomposition and then recomposes characters,
    so canonically equivalent sequences end up with one representation.
    This is a thin wrapper around ``unicodedata.normalize``; no
    Burmese-specific rewriting is performed beyond what NFC itself does.

    Args:
        text: The string to normalize.

    Returns:
        The NFC-normalized string.
    """
    return unicodedata.normalize('NFC', text)
def process_file(input_path: str, output_path: str = None):
    """Normalize the full contents of one UTF-8 text file.

    The file is read in one go, passed through ``normalize_unicode`` and
    written to the destination, whose parent directories are created on
    demand. Progress is reported on stdout.

    Args:
        input_path: Path of the UTF-8 encoded file to read.
        output_path: Where to write the result. When omitted, the path is
            derived by replacing every occurrence of the substring
            ``'raw'`` in *input_path* with ``'processed'``; if the input
            path contains no ``'raw'``, the input file is overwritten.
    """
    # Fall back to the raw -> processed naming convention.
    destination = (
        output_path
        if output_path is not None
        else input_path.replace('raw', 'processed')
    )

    print(f"Processing: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as source:
        result = normalize_unicode(source.read())

    # The destination directory may not exist yet.
    Path(destination).parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'w', encoding='utf-8') as sink:
        sink.write(result)
    print(f"Saved to: {destination}")
def process_jsonl(input_path: str, output_path: str = None):
    """Normalize every string value in a JSONL file and write the result.

    Each non-blank line of *input_path* is parsed as one JSON document.
    String values are passed through ``normalize_unicode`` wherever they
    occur: at the top level, inside nested objects and inside arrays.
    Object keys and non-string values are left untouched, and blank lines
    are dropped. Records are written one per line, joined by ``'\\n'``,
    with non-ASCII characters kept literal (``ensure_ascii=False``).

    Args:
        input_path: Path of the UTF-8 encoded JSONL file to read.
        output_path: Where to write the result. When omitted, the path is
            derived by replacing every occurrence of the substring
            ``'raw'`` in *input_path* with ``'processed'``; if the input
            path contains no ``'raw'``, the input file is overwritten.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if output_path is None:
        output_path = input_path.replace('raw', 'processed')
    print(f"Processing: {input_path}")

    output_lines = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            # A JSONL line may hold any JSON value, not only an object,
            # so normalization is applied structurally rather than via
            # ``dict.items()`` on the top level alone.
            record = _normalize_json_strings(json.loads(line))
            output_lines.append(json.dumps(record, ensure_ascii=False))

    # Create output directory if needed
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))
    print(f"Saved to: {output_path}")


def _normalize_json_strings(value):
    """Return *value* with every nested string value normalized."""
    if isinstance(value, str):
        return normalize_unicode(value)
    if isinstance(value, list):
        return [_normalize_json_strings(item) for item in value]
    if isinstance(value, dict):
        # Keys are kept as-is so normalization can never merge two keys.
        return {
            key: _normalize_json_strings(item) for key, item in value.items()
        }
    return value
def process_directory(input_dir: str, output_dir: str = None):
    """Normalize every supported file below a directory tree.

    The tree under *input_dir* is walked recursively and mirrored under
    *output_dir*. ``.jsonl`` files go through ``process_jsonl``; ``.txt``
    and ``.json`` files go through ``process_file``; every other file is
    skipped.

    Args:
        input_dir: Root directory to read from.
        output_dir: Root directory to write to. When omitted, it is
            derived by replacing every occurrence of the substring
            ``'raw'`` in *input_dir* with ``'processed'``; if the input
            path contains no ``'raw'``, files are rewritten in place.
    """
    if output_dir is None:
        output_dir = input_dir.replace('raw', 'processed')
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # When the output directory lives inside the input tree, files found
    # there are results of this or an earlier run; feeding them back in
    # would produce out/out/... copies.
    input_root = input_path.resolve()
    output_root = output_path.resolve()
    output_is_nested = input_root in output_root.parents

    # Snapshot the file list before writing anything: the walk must not
    # observe files that this very run creates. Sorting keeps the
    # processing order (and the log output) deterministic.
    pending = sorted(p for p in input_path.rglob('*') if p.is_file())

    for file_path in pending:
        if output_is_nested and output_root in file_path.resolve().parents:
            continue
        out_file = output_path / file_path.relative_to(input_path)
        if file_path.suffix == '.jsonl':
            process_jsonl(str(file_path), str(out_file))
        elif file_path.suffix in ('.txt', '.json'):
            process_file(str(file_path), str(out_file))
def main():
    """Command-line entry point: normalize a file or a directory tree.

    Parses ``input`` (required) and ``-o/--output`` (optional) from the
    command line and dispatches on the input: directories go to
    ``process_directory``, ``.jsonl`` files to ``process_jsonl`` and any
    other file to ``process_file``.

    Raises:
        SystemExit: With status 2 when the arguments are invalid or the
            input path does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Normalize Unicode text')
    parser.add_argument('input', help='Input file or directory')
    parser.add_argument('-o', '--output', help='Output file or directory')
    args = parser.parse_args()

    input_path = Path(args.input)
    # Report a missing input as a usage error instead of letting it
    # surface later as a raw FileNotFoundError traceback.
    if not input_path.exists():
        parser.error(f"input path does not exist: {args.input}")

    if input_path.is_dir():
        process_directory(args.input, args.output)
    elif input_path.suffix == '.jsonl':
        process_jsonl(args.input, args.output)
    else:
        process_file(args.input, args.output)
# Run the command-line interface only when this file is executed as a
# script; importing it as a module must not trigger any processing.
if __name__ == "__main__":
    main()