#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""Syllable breaking tool for Myanmar language.

Usage:
    python sylbreak.py --help
    cat test.txt | python sylbreak.py
    python sylbreak.py --input test.txt
    python ./sylbreak.py --input ./test.txt --print
    python sylbreak.py --input test.txt --output out.txt
    python ./sylbreak.py --input ./one_line.txt --separator " " --output one_line.syl

Date: 21 July 2016
Written by Ye Kyaw Thu, Visiting Researcher, Waseda University
HP: https://sites.google.com/site/yekyawthunlp/

Date: 29 Sep 2021
Add support for python3 by sengkyaut

Last Updated: 19 January 2024.
The code has been rewritten for easier readability. It now includes features
for removing the leading delimiter and replacing sequences of
'delimiter-space-delimiter' with a single space.
Updated by Ye Kyaw Thu.

Reference of Myanmar Unicode: http://unicode.org/charts/PDF/U1000.pdf
"""

import argparse
import re
import sys


def parse_arguments(argv=None):
    """Parse command line arguments for the script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what a plain ``parse_arguments()`` call relies on.

    Returns:
        argparse.Namespace with the attributes ``input``, ``output``,
        ``separator`` and ``print``.
    """
    parser = argparse.ArgumentParser(
        description='Syllable segmentation for Myanmar language')
    parser.add_argument(
        '-i', '--input', type=str,
        help='Input file (optional)')
    parser.add_argument(
        '-o', '--output', type=str,
        help='Output file (optional)')
    parser.add_argument(
        '-s', '--separator', type=str, default='|',
        help='Separator for syllable (e.g. -s "/"), default is "|"')
    parser.add_argument(
        '-p', '--print', action='store_true',
        help='Print both input and syllable segmented sentences')
    return parser.parse_args(argv)


# NOTE(review): the original single-line source continues with
# create_break_pattern(), but its re.compile(...) call is cut off in the
# reviewed text. The visible fragment is preserved verbatim below; restore the
# full definition from the upstream file instead of reconstructing the regex.
#
#     def create_break_pattern():
#         """Creates and returns the regular expression pattern for Myanmar syllable breaking."""
#         my_consonant = r"က-အ"
#         en_char = r"a-zA-Z0-9"
#         other_char = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။!-/:-@[-`{-~\s"
#         subscript_symbol = r'္'
#         a_that = r'်'
#         # Regular expression pattern for Myanmar syllable breaking
#         return re.compile( r"((?