| |
| |
|
|
| import argparse |
| import re |
| import sys |
|
|
| """ |
| Syllable breaking tool for Myanmar language. |
| |
| Usage: python sylbreak.py --help |
| cat test.txt | python sylbreak.py |
| python sylbreak.py --input test.txt |
| python ./sylbreak.py --input ./test.txt --print |
| python sylbreak.py --input test.txt --output out.txt |
| python ./sylbreak.py --input ./one_line.txt --separator " " --output one_line.syl |
| |
| Date: 21 July 2016 |
| Written by Ye Kyaw Thu, Visiting Researcher, Waseda University |
| HP: https://sites.google.com/site/yekyawthunlp/ |
| |
| Date: 29 Sep 2021 |
| Add support for python3 by sengkyaut |
| |
| Last Updated: 19 January 2024. |
| The code has been rewritten for easier readability. It now includes features for removing the leading delimiter and replacing sequences of 'delimiter-space-delimiter' with a single space. |
| Updated by Ye Kyaw Thu. |
| |
| Reference of Myanmar Unicode: http://unicode.org/charts/PDF/U1000.pdf |
| """ |
|
|
def parse_arguments():
    """Declare the command-line options and parse ``sys.argv``.

    Returns:
        argparse.Namespace: carries ``input`` and ``output`` (file paths, or
        ``None`` when not given), ``separator`` (defaults to ``"|"``) and
        ``print`` (``True`` only when ``-p``/``--print`` is passed).
    """
    cli = argparse.ArgumentParser(
        description='Syllable segmentation for Myanmar language'
    )

    # (flags, keyword settings) in the order they should appear in --help.
    option_specs = [
        (('-i', '--input'),
         dict(type=str, help='Input file (optional)')),
        (('-o', '--output'),
         dict(type=str, help='Output file (optional)')),
        (('-s', '--separator'),
         dict(type=str, default='|',
              help='Separator for syllable (e.g. -s "/"), default is "|"')),
        (('-p', '--print'),
         dict(action='store_true',
              help='Print both input and syllable segmented sentences')),
    ]
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
|
|
def create_break_pattern():
    """Create the compiled regular expression used for Myanmar syllable breaking.

    The pattern has a single capture group that matches one character at
    every position where a new syllable starts:

    * a Myanmar consonant (U+1000-U+1021) that is not preceded by the virama
      U+1039 (i.e. it is not a stacked/subscript consonant) and is not
      followed by asat U+103A or virama U+1039 (i.e. it is not the final
      consonant of the previous syllable), or
    * any single ASCII letter or digit, Myanmar independent vowel, Myanmar
      digit or punctuation mark, ASCII punctuation mark, or whitespace
      character.

    The Myanmar characters are written as code point escapes rather than as
    literal glyphs so the pattern does not depend on the encoding used to
    save or transfer this file.

    Returns:
        re.Pattern: compiled pattern with the break character in group 1.
    """
    my_consonant = "\u1000-\u1021"  # MYANMAR LETTER KA .. MYANMAR LETTER A
    en_char = r"a-zA-Z0-9"
    other_char = (
        "\u1023\u1024\u1025\u1026\u1027"  # independent vowels I, II, U, UU, E
        "\u1029\u102a"                    # independent vowels O, AU
        "\u103f"                          # GREAT SA
        "\u104c\u104d\u104f"              # symbols LOCATIVE, COMPLETED, GENITIVE
        "\u1040-\u1049"                   # Myanmar digits ZERO .. NINE
        "\u104a\u104b"                    # signs LITTLE SECTION, SECTION
        r"!-/:-@[-`{-~\s"                 # ASCII punctuation and whitespace
    )
    subscript_symbol = "\u1039"  # MYANMAR SIGN VIRAMA (stacks the next consonant)
    a_that = "\u103a"            # MYANMAR SIGN ASAT (kills the inherent vowel)

    return re.compile(
        "((?<!" + subscript_symbol + ")[" + my_consonant + "]"
        + "(?![" + a_that + subscript_symbol + "])"
        + "|[" + en_char + other_char + "])"
    )
|
|
def break_syllables(line, break_pattern, separator):
    """Apply the syllable breaking rules to a single line of text.

    Args:
        line: Text to segment. Surrounding whitespace is stripped and every
            internal run of whitespace is collapsed to one space first.
        break_pattern: Compiled pattern whose group 1 captures the character
            that starts each new syllable.
        separator: String inserted in front of every break character. It is
            inserted literally; backslashes in it are not treated as
            regular-expression replacement escapes.

    Returns:
        The segmented line without a leading separator, and with every
        "separator, space, separator" sequence reduced to a single space.
    """
    line = re.sub(r'\s+', ' ', line.strip())

    # A function replacement keeps the separator literal. Passing
    # ``separator + r"\1"`` as a template would let a backslash in the
    # separator swallow or rewrite the group reference.
    segmented_line = break_pattern.sub(
        lambda match: separator + match.group(1), line
    )

    # The very first syllable also received a separator; drop it.
    if segmented_line.startswith(separator):
        segmented_line = segmented_line[len(separator):]

    # A space is itself a break character, so it ends up wrapped in
    # separators; restore it to a plain space.
    double_delimiter = separator + " " + separator
    segmented_line = segmented_line.replace(double_delimiter, " ")

    return segmented_line
|
|
def process_input(input_stream, output_stream, separator, print_option,
                  break_pattern=None):
    """Read lines, segment them into syllables, and write the results.

    Args:
        input_stream: Iterable of text lines (e.g. an open file or stdin).
        output_stream: Object with a ``write`` method; receives one segmented
            line, terminated by a newline, per input line.
        separator: Separator string placed between syllables.
        print_option: When true, each input line and its segmented form are
            also printed to standard output.
        break_pattern: Optional compiled syllable-break pattern. When omitted
            it is built with ``create_break_pattern()``, so the function no
            longer depends on a module-level global that only exists when the
            file is run as a script.
    """
    if break_pattern is None:
        break_pattern = create_break_pattern()

    for line in input_stream:
        if print_option:
            print("Input:\t" + line.strip())
        segmented_line = break_syllables(line, break_pattern, separator)
        output_stream.write(segmented_line + '\n')
        if print_option:
            print("Sylbreaked:\t" + segmented_line)
|
|
if __name__ == "__main__":
    # Imported here because it is only needed when running as a script.
    import contextlib

    args = parse_arguments()
    # Kept at module level: process_input looks this name up as a global.
    break_pattern = create_break_pattern()

    # ExitStack closes whichever files were actually opened, including the
    # input file when opening the output file fails. stdin/stdout are never
    # registered, so they are left open.
    with contextlib.ExitStack() as stack:
        if args.input:
            input_stream = stack.enter_context(
                open(args.input, 'r', encoding='utf-8')
            )
        else:
            input_stream = sys.stdin

        if args.output:
            output_stream = stack.enter_context(
                open(args.output, 'w', encoding='utf-8')
            )
        else:
            output_stream = sys.stdout

        process_input(input_stream, output_stream, args.separator, args.print)
|
|