# -*- coding: utf-8 -*-
#
# pyhwp : hwp file format parser in python
# Copyright (C) 2010-2023 mete0r <https://github.com/mete0r>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
# Ordered (first, last, tag) character ranges consulted by get_unichr_lang().
# The first matching row wins, so the order is significant: U+0040 belongs to
# both the neutral row and the 'en' row, and the neutral row must claim it.
_UNICHR_LANG_RANGES = (
    # Hangul Syllables (U+AC00..U+D7AF) and
    # Hangul Jamo Extended-B (U+D7B0..U+D7FF)
    (u'\uAC00', u'\uD7FF', 'ko'),
    # Control characters, punctuation and digits at the start of Basic Latin;
    # these carry no language of their own.
    (u'\u0000', u'\u0040', None),
    # Hangul Jamo
    (u'\u1100', u'\u11FF', 'ko'),
    # Hangul Compatibility Jamo
    (u'\u3130', u'\u318F', 'ko'),
    # Hangul Jamo Extended-A
    (u'\uA960', u'\uA97F', 'ko'),
    # Rest of Basic Latin, Latin-1 Supplement, Latin Extended-A/B
    (u'\u0040', u'\u024F', 'en'),
    # CJK Unified Ideographs (U+4E00..U+9FFF)
    (u'\u4E00', u'\u9FFF', 'cn'),
    # CJK Radicals Supplement (U+2E80..U+2EFF) and
    # Kangxi Radicals (U+2F00..U+2FDF)
    (u'\u2E80', u'\u2FDF', 'cn'),
    # CJK Unified Ideographs Extension A (U+3400..U+4DBF)
    (u'\u3400', u'\u4DBF', 'cn'),
    # CJK Compatibility Ideographs (U+F900..U+FAFF)
    (u'\uF900', u'\uFAFF', 'cn'),
    # CJK Symbols and Punctuation (U+3000..U+303F)
    (u'\u3000', u'\u303F', 'symbol'),
    # Hiragana and Katakana (U+3040..U+30FF)
    (u'\u3040', u'\u30FF', 'jp'),
)


def get_unichr_lang(uch):
    """Classify a character by the script range it falls into.

    The character is compared against a fixed, ordered list of ranges and
    the tag of the first range containing it is returned.

    :param uch: the character to classify
    :returns: ``'ko'``, ``'en'``, ``'cn'``, ``'symbol'`` or ``'jp'`` for a
        character inside one of the known ranges; ``None`` for characters
        in U+0000..U+0040 (language-neutral); ``'other'`` when no range
        matches.
    """
    for first, last, tag in _UNICHR_LANG_RANGES:
        if first <= uch <= last:
            return tag
    return 'other'
def tokenize_unicode_by_lang(text):
    """Split text into runs of characters sharing one language tag.

    Each character is classified with get_unichr_lang(). Characters
    classified as ``None`` (language-neutral) never start a new run: they
    are attached to whichever run is currently being collected. A run ends
    when a character with a different, non-``None`` tag is seen.

    :param text: the iterable of characters to split
    :returns: a generator of ``(lang, substring)`` pairs, in input order;
        ``lang`` falls back to ``'ko'`` when the run holds only
        language-neutral characters. Nothing is yielded for empty input.
    """
    pending = []
    current = None
    for ch in text:
        detected = get_unichr_lang(ch)
        # A boundary exists only between two known, differing languages.
        starts_new_run = (
            detected is not None
            and current is not None
            and detected != current
        )
        if starts_new_run:
            yield current or 'ko', ''.join(pending)
            pending = []
        if detected is not None:
            current = detected
        pending.append(ch)
    # Flush whatever is left after the last character.
    if pending:
        yield current or 'ko', ''.join(pending)
|