| # -*- coding: utf-8 -*- | |
| # | |
| # pyhwp : hwp file format parser in python | |
| # Copyright (C) 2010-2023 mete0r <https://github.com/mete0r> | |
| # | |
| # This program is free software: you can redistribute it and/or modify | |
| # it under the terms of the GNU Affero General Public License as published by | |
| # the Free Software Foundation, either version 3 of the License, or | |
| # (at your option) any later version. | |
| # | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # GNU Affero General Public License for more details. | |
| # | |
| # You should have received a copy of the GNU Affero General Public License | |
| # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| # | |
| from __future__ import absolute_import | |
| from __future__ import print_function | |
| from __future__ import unicode_literals | |
# Ordered (first, last, tag) code-point ranges consulted by get_unichr_lang().
# The first range that contains the character wins, so the order matters
# wherever two ranges touch or overlap.
_UNICHR_LANG_RANGES = (
    # -- ko --
    # Hangul Syllables (U+AC00..U+D7AF) + Hangul Jamo Extended-B
    # (U+D7B0..U+D7FF); checked first.
    (u'\uAC00', u'\uD7FF', 'ko'),
    # -- neutral --
    # Start of Basic Latin up to and including '@': control characters,
    # space, punctuation and digits carry no language of their own.
    (u'\u0000', u'\u0040', None),
    # Hangul Jamo
    (u'\u1100', u'\u11FF', 'ko'),
    # Hangul Compatibility Jamo
    (u'\u3130', u'\u318F', 'ko'),
    # Hangul Jamo Extended-A
    (u'\uA960', u'\uA97F', 'ko'),
    # -- en --
    # Basic Latin through Latin Extended-B.  U+0040 ('@') is nominally in
    # this range too, but the neutral range above claims it first.
    (u'\u0040', u'\u024F', 'en'),
    # -- cn --
    # CJK Unified Ideographs
    (u'\u4E00', u'\u9FFF', 'cn'),
    # CJK Radicals Supplement (U+2E80..U+2EFF) + Kangxi Radicals
    # (U+2F00..U+2FDF)
    (u'\u2E80', u'\u2FDF', 'cn'),
    # CJK Unified Ideographs Extension A
    (u'\u3400', u'\u4DBF', 'cn'),
    # CJK Compatibility Ideographs
    (u'\uF900', u'\uFAFF', 'cn'),
    # -- symbol --
    # CJK Symbols and Punctuation
    (u'\u3000', u'\u303F', 'symbol'),
    # -- jp --
    # Hiragana + Katakana
    (u'\u3040', u'\u30FF', 'jp'),
)


def get_unichr_lang(uch):
    """Classify a single unicode character by the script block it is in.

    :param uch: the character to classify (a one-character unicode string)
    :returns: ``'ko'``, ``'en'``, ``'cn'``, ``'symbol'`` or ``'jp'`` when
        the character falls in one of the known ranges; ``None`` for the
        language-neutral range U+0000..U+0040; ``'other'`` for anything
        that matches no range.
    """
    for first, last, tag in _UNICHR_LANG_RANGES:
        if first <= uch <= last:
            return tag
    return 'other'
def tokenize_unicode_by_lang(text):
    """Split *text* into consecutive runs of characters sharing a language.

    Each character is classified with ``get_unichr_lang()``.  Characters
    classified as ``None`` (language-neutral) never start a new run: they
    are appended to whichever run is currently being collected.

    :param text: an iterable of characters (normally a unicode string)
    :returns: a generator of ``(lang, chunk)`` pairs, in input order.  A
        run consisting only of neutral characters is reported as ``'ko'``.
        Nothing is yielded for empty input.
    """
    run_chars = []
    run_lang = None
    for uch in text:
        lang = get_unichr_lang(uch)
        # A run ends only when a character with a definite language
        # disagrees with the definite language of the current run.
        language_changed = (
            lang is not None
            and run_lang is not None
            and lang != run_lang
        )
        if language_changed:
            yield run_lang or 'ko', ''.join(run_chars)
            run_chars = []
        if lang is not None:
            # Either adopts the first definite language seen in this run
            # or records the language of the run that has just started.
            run_lang = lang
        run_chars.append(uch)
    # Flush the trailing run, if any characters were collected.
    if run_chars:
        yield run_lang or 'ko', ''.join(run_chars)