Spaces:

Heartsync
/

cobiz

Running

App Files Files Community

cobiz / src /hwp5 /charsets.py

seawolf2357

Add src

d94b56e verified about 1 month ago

raw

history blame contribute delete

2.84 kB

	# -*- coding: utf-8 -*-
	#
	# pyhwp : hwp file format parser in python
	# Copyright (C) 2010-2023 mete0r <https://github.com/mete0r>
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU Affero General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU Affero General Public License for more details.
	#
	# You should have received a copy of the GNU Affero General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	#
	from __future__ import absolute_import
	from __future__ import print_function
	from __future__ import unicode_literals


	def get_unichr_lang(uch):
	    """Classify a character into a coarse language/script tag.

	    The candidate ranges are tested in a fixed order using ordinary string
	    comparison against the range bounds; the first range containing *uch*
	    decides the result.

	    Returns:
	        'ko', 'en', 'cn', 'jp' or 'symbol' for the matching range,
	        None for U+0000..U+0040 (treated as language-neutral),
	        'other' when no range matches.
	    """
	    # (first, last, tag) triples. Order matters: U+0040 appears both in the
	    # neutral range and in the Latin range, and the neutral range is listed
	    # first, so U+0040 is reported as None.
	    ranges = (
	        # Hangul Syllables (U+AC00..U+D7AF) and
	        # Hangul Jamo Extended-B (U+D7B0..U+D7FF)
	        (u'\uAC00', u'\uD7FF', 'ko'),
	        # U+0000..U+0040: control characters, punctuation, digits, '@'
	        (u'\u0000', u'\u0040', None),
	        # Hangul Jamo
	        (u'\u1100', u'\u11FF', 'ko'),
	        # Hangul Compatibility Jamo
	        (u'\u3130', u'\u318F', 'ko'),
	        # Hangul Jamo Extended-A
	        (u'\uA960', u'\uA97F', 'ko'),
	        # Basic Latin through Latin Extended-B
	        (u'\u0040', u'\u024F', 'en'),
	        # CJK Unified Ideographs
	        (u'\u4E00', u'\u9FFF', 'cn'),
	        # CJK Radicals Supplement (U+2E80..U+2EFF) and
	        # Kangxi Radicals (U+2F00..U+2FDF)
	        (u'\u2E80', u'\u2FDF', 'cn'),
	        # CJK Unified Ideographs Extension A
	        (u'\u3400', u'\u4DBF', 'cn'),
	        # CJK Compatibility Ideographs
	        (u'\uF900', u'\uFAFF', 'cn'),
	        # CJK Symbols and Punctuation
	        (u'\u3000', u'\u303F', 'symbol'),
	        # Hiragana + Katakana
	        (u'\u3040', u'\u30FF', 'jp'),
	    )
	    for first, last, tag in ranges:
	        if first <= uch <= last:
	            return tag
	    return 'other'


	def tokenize_unicode_by_lang(text):
	    """Split *text* into consecutive runs sharing one language tag.

	    Each character is classified with get_unichr_lang(). Characters for
	    which it returns None are appended to whatever run is currently being
	    collected and never start a new one. A new run begins when a
	    classified character's tag differs from the tag of the current run.

	    Yields:
	        (lang, chunk) tuples in text order. A run that contains only
	        unclassified characters is reported with the tag 'ko'.
	    """
	    pending = []
	    current = None
	    for ch in text:
	        detected = get_unichr_lang(ch)
	        run_changes = (
	            detected is not None
	            and current is not None
	            and current != detected
	        )
	        if run_changes:
	            # Emit the finished run before starting the next one with ch.
	            yield current or 'ko', ''.join(pending)
	            pending = []
	        pending.append(ch)
	        if detected is not None:
	            current = detected
	    if pending:
	        # Flush the trailing run; 'ko' is the fallback when no character
	        # in it was classified.
	        yield current or 'ko', ''.join(pending)