File size: 2,841 Bytes
d94b56e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
#
#   pyhwp : hwp file format parser in python
#   Copyright (C) 2010-2023 mete0r <https://github.com/mete0r>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals


# Ordered (first, last, tag) rows; both ends of every range are inclusive.
# Lookup returns the tag of the first matching row, so U+0040 ('@') resolves
# to the neutral row, which precedes the Latin row sharing that boundary.
_UNICHR_LANG_RANGES = (
    # Hangul Syllables (U+AC00..U+D7AF) and
    # Hangul Jamo Extended-B (U+D7B0..U+D7FF)
    (u'\uAC00', u'\uD7FF', 'ko'),
    # Start of Basic Latin: control characters, space, punctuation and
    # digits, up to and including '@' -- carries no language of its own
    (u'\u0000', u'\u0040', None),
    # Hangul Jamo
    (u'\u1100', u'\u11FF', 'ko'),
    # Hangul Compatibility Jamo
    (u'\u3130', u'\u318F', 'ko'),
    # Hangul Jamo Extended-A
    (u'\uA960', u'\uA97F', 'ko'),
    # Rest of Basic Latin through Latin Extended-B
    (u'\u0040', u'\u024F', 'en'),
    # CJK Unified Ideographs (U+4E00..U+9FFF)
    (u'\u4E00', u'\u9FFF', 'cn'),
    # CJK Radicals Supplement (U+2E80..U+2EFF) and
    # Kangxi Radicals (U+2F00..U+2FDF)
    (u'\u2E80', u'\u2FDF', 'cn'),
    # CJK Unified Ideographs Extension A (U+3400..U+4DBF)
    (u'\u3400', u'\u4DBF', 'cn'),
    # CJK Compatibility Ideographs (U+F900..U+FAFF)
    (u'\uF900', u'\uFAFF', 'cn'),
    # CJK Symbols and Punctuation (U+3000..U+303F)
    (u'\u3000', u'\u303F', 'symbol'),
    # Hiragana and Katakana (U+3040..U+30FF)
    (u'\u3040', u'\u30FF', 'jp'),
)


def get_unichr_lang(uch):
    """Classify a unicode character by the script range it falls in.

    :param uch: the character to classify; it is compared against the
        range bounds with ordinary string comparison
    :returns: ``'ko'``, ``'en'``, ``'cn'``, ``'jp'`` or ``'symbol'`` when
        the character lies in one of the known ranges, ``None`` for
        U+0000..U+0040 (language-neutral), and ``'other'`` when no range
        matches
    """
    for first, last, tag in _UNICHR_LANG_RANGES:
        if first <= uch <= last:
            return tag
    return 'other'


def tokenize_unicode_by_lang(text):
    """Split text into consecutive runs that share one language tag.

    Every character is tagged with ``get_unichr_lang``.  Characters tagged
    ``None`` are neutral: they join whichever run is currently being
    collected and never start a new one.  A run ends when a character with
    a different (non-``None``) tag is seen.

    :param text: an iterable of characters
    :returns: a generator of ``(lang, chunk)`` pairs; joining the chunks in
        order gives back the input.  A run made only of neutral characters
        is reported as ``'ko'``.
    """
    pending = []
    current = None
    for uch in text:
        lang = get_unichr_lang(uch)
        switches_lang = (
            lang is not None and
            current is not None and
            current != lang
        )
        if switches_lang:
            # Flush the finished run before this character opens the next.
            yield current or 'ko', ''.join(pending)
            pending = []
        if lang is not None:
            current = lang
        pending.append(uch)
    if pending:
        # Trailing run; 'ko' is the fallback when no tagged character
        # was ever seen.
        yield current or 'ko', ''.join(pending)