Huakang Chen committed on
Commit
e58ed1d
·
1 Parent(s): 37417b4
text/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .clean import clean_text
2
+ from .spliter import split_text
3
+
4
+ __all__ = ["clean_text", "split_text"]
text/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (269 Bytes). View file
 
text/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (258 Bytes). View file
 
text/__pycache__/clean.cpython-310.pyc ADDED
Binary file (1.34 kB). View file
 
text/__pycache__/clean.cpython-38.pyc ADDED
Binary file (1.22 kB). View file
 
text/__pycache__/spliter.cpython-310.pyc ADDED
Binary file (3.19 kB). View file
 
text/__pycache__/spliter.cpython-38.pyc ADDED
Binary file (3.21 kB). View file
 
text/chn_text_norm/.gitignore ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+ .pytest_cache/
49
+
50
+ # Translations
51
+ *.mo
52
+ *.pot
53
+
54
+ # Django stuff:
55
+ *.log
56
+ local_settings.py
57
+ db.sqlite3
58
+
59
+ # Flask stuff:
60
+ instance/
61
+ .webassets-cache
62
+
63
+ # Scrapy stuff:
64
+ .scrapy
65
+
66
+ # Sphinx documentation
67
+ docs/_build/
68
+
69
+ # PyBuilder
70
+ target/
71
+
72
+ # Jupyter Notebook
73
+ .ipynb_checkpoints
74
+
75
+ # pyenv
76
+ .python-version
77
+
78
+ # celery beat schedule file
79
+ celerybeat-schedule
80
+
81
+ # SageMath parsed files
82
+ *.sage.py
83
+
84
+ # Environments
85
+ .env
86
+ .venv
87
+ env/
88
+ venv/
89
+ ENV/
90
+ env.bak/
91
+ venv.bak/
92
+
93
+ # Spyder project settings
94
+ .spyderproject
95
+ .spyproject
96
+
97
+ # Rope project settings
98
+ .ropeproject
99
+
100
+ # mkdocs documentation
101
+ /site
102
+
103
+ # mypy
104
+ .mypy_cache/
105
+
106
+ # JetBrains PyCharm
107
+ .idea
108
+
109
+ # Customize
110
+ references
111
+ url.txt
112
+
113
+ # Git
114
+ .git
text/chn_text_norm/README.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This account is no longer in use, see [Atomicoo](https://github.com/atomicoo) for my latest works.
2
+
3
+ # Chn Text Norm
4
+
5
+ This is a repository for Chinese text normalization (no longer maintained).
6
+
7
+ ## Quick Start ##
8
+
9
+ ### Git Clone Repo ###
10
+
11
+ Clone this repo into the root directory of the project that needs to use it.
12
+
13
+ cd /path/to/proj
14
+ git clone https://github.com/Joee1995/chn-text-norm.git
15
+
16
+ After that, your directory tree should look like this:
17
+ ```
18
+ proj # root of your project
19
+ |--- chn_text_norm # this chn-text-norm tool
20
+ |--- text.py
21
+ |--- ...
22
+ |--- text_normalize.py # your text normalization code
23
+ |--- ...
24
+ ```
25
+
26
+ ### How to Use ? ###
27
+
28
+ # text_normalize.py
29
+ from chn_text_norm.text import *
30
+
31
+ raw_text = 'your raw text'
32
+ text = Text(raw_text=raw_text).normalize()
33
+
34
+ ### How to add quantifiers ###
35
+
36
+ 打开test.py,然后你就知道怎么做了。
text/chn_text_norm/__init__.py ADDED
File without changes
text/chn_text_norm/basic_class.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """基本类
3
+ 中文字符类
4
+ 中文数字/数位类
5
+ 中文数字类
6
+ 中文数位类
7
+ 中文数字系统类
8
+ 中文数学符号类
9
+ *中文其他符号类
10
+ """
11
+
12
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
13
+ __data__ = "2019-05-02"
14
+
15
+ from text.chn_text_norm.basic_constant import NUMBERING_TYPES
16
+
17
+
18
class ChineseChar(object):
    """
    A Chinese character that has a simplified and a traditional form,
    e.g. simplified = '负', traditional = '負'.

    Either form may be used when converting back to text.
    """

    def __init__(self, simplified, traditional):
        # NOTE: the original also did ``self.__repr__ = self.__str__``.  Python
        # looks special methods up on the type, so that instance attribute never
        # affected ``repr()``; it only stored a bound method (a reference cycle)
        # in the instance ``__dict__``.  The class-level ``__repr__`` below
        # already provides the intended behaviour.
        self.simplified = simplified
        self.traditional = traditional

    def __str__(self):
        # ``__str__`` must return a ``str``: returning None makes ``str(obj)``
        # raise TypeError, so fall back to an empty string when both forms are
        # missing (placeholder units are created with ``None`` for both).
        return self.simplified or self.traditional or ""

    def __repr__(self):
        # Delegates to ``__str__`` so subclasses that override ``__str__``
        # get a matching ``repr`` for free.
        return self.__str__()
36
+
37
+
38
class ChineseNumberUnit(ChineseChar):
    """
    A Chinese numeric unit character (a power of ten).

    Besides the simplified/traditional forms, each unit carries "big"
    (capital/financial) forms in ``big_s`` and ``big_t``.
    ``power`` is the base-10 exponent the unit stands for.
    """

    def __init__(self, power, simplified, traditional, big_s, big_t):
        super(ChineseNumberUnit, self).__init__(simplified, traditional)
        self.power = power
        self.big_s = big_s
        self.big_t = big_t

    def __str__(self):
        return "10^{}".format(self.power)

    @classmethod
    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
        """
        Build the unit at position ``index`` of a unit table.

        ``value`` is a pair indexed as (simplified, traditional).
        Small units get power ``index + 1`` and use ``value[1]`` for both big
        forms; larger units get a power that depends on ``numbering_type``
        and reuse ``value[0]`` / ``value[1]`` as their big forms.

        Raises ValueError when ``numbering_type`` is not one of
        NUMBERING_TYPES (only checked when ``small_unit`` is false).
        """
        if small_unit:
            exponent = index + 1
            big_s_slot = 1
        else:
            big_s_slot = 0
            if numbering_type == NUMBERING_TYPES[0]:
                exponent = index + 8
            elif numbering_type == NUMBERING_TYPES[1]:
                exponent = (index + 2) * 4
            elif numbering_type == NUMBERING_TYPES[2]:
                exponent = pow(2, index + 3)
            else:
                raise ValueError(
                    "Counting type should be in {0} ({1} provided).".format(
                        NUMBERING_TYPES, numbering_type
                    )
                )

        return ChineseNumberUnit(
            power=exponent,
            simplified=value[0],
            traditional=value[1],
            big_s=value[big_s_slot],
            big_t=value[1],
        )
95
+
96
+
97
class ChineseNumberDigit(ChineseChar):
    """
    A Chinese digit character.

    Holds the integer ``value`` together with the simplified/traditional
    forms, the big (capital) forms and optional alternative forms.
    """

    def __init__(
        self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None
    ):
        super(ChineseNumberDigit, self).__init__(simplified, traditional)
        self.value = value
        self.big_s, self.big_t = big_s, big_t
        self.alt_s, self.alt_t = alt_s, alt_t

    def __str__(self):
        return str(self.value)

    @classmethod
    def create(cls, i, v):
        """Build the digit with value ``i`` from the 4-item sequence ``v``
        (simplified, traditional, big simplified, big traditional)."""
        simplified, traditional = v[0], v[1]
        big_simplified, big_traditional = v[2], v[3]
        return ChineseNumberDigit(
            i, simplified, traditional, big_simplified, big_traditional
        )
118
+
119
+
120
class ChineseMath(ChineseChar):
    """
    Chinese math-symbol character.

    Pairs the simplified/traditional forms with an ASCII ``symbol`` and an
    optional ``expression`` (create_system passes a lambda here).
    """

    def __init__(self, simplified, traditional, symbol, expression=None):
        super(ChineseMath, self).__init__(simplified, traditional)
        self.symbol = symbol
        self.expression = expression
        # No separate "big" glyphs exist for math symbols; the plain forms are
        # reused so callers can read ``big_s`` / ``big_t`` uniformly.
        self.big_s = simplified
        self.big_t = traditional
131
+
132
+
133
+ CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
134
+
135
+
136
class NumberSystem(object):
    """
    Chinese number system.

    An empty attribute container: create_system() attaches ``units``,
    ``digits`` and ``math`` to an instance after constructing it.
    """

    pass
142
+
143
+
144
class MathSymbol(object):
    """
    Math symbols (simplified/traditional) used by the Chinese number system, e.g.
    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']

    Iterating an instance yields its attribute values in assignment order.
    """

    def __init__(self, positive, negative, point):
        self.positive = positive
        self.negative = negative
        self.point = point

    def __iter__(self):
        yield from self.__dict__.values()
160
+
161
+
162
+ # class OtherSymbol(object):
163
+ # """
164
+ # 其他符号
165
+ # """
166
+ #
167
+ # def __init__(self, sil):
168
+ # self.sil = sil
169
+ #
170
+ # def __iter__(self):
171
+ # for v in self.__dict__.values():
172
+ # yield v
text/chn_text_norm/basic_constant.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """基本常量
3
+ 中文数字/数位/符号字符常量
4
+ """
5
+
6
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
7
+ __data__ = "2019-05-02"
8
+
9
+ CHINESE_DIGIS = "零一二三四五六七八九"
10
+ BIG_CHINESE_DIGIS_SIMPLIFIED = "零壹贰叁肆伍陆柒捌玖"
11
+ BIG_CHINESE_DIGIS_TRADITIONAL = "零壹貳參肆伍陸柒捌玖"
12
+ SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = "十百千万"
13
+ SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = "拾佰仟萬"
14
+ LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "亿兆京垓秭穰沟涧正载"
15
+ LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = "億兆京垓秭穰溝澗正載"
16
+ SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "十百千万"
17
+ SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = "拾佰仟萬"
18
+
19
+ ZERO_ALT = "〇"
20
+ ONE_ALT = "幺"
21
+ TWO_ALTS = ["两", "兩"]
22
+
23
+ POSITIVE = ["正", "正"]
24
+ NEGATIVE = ["负", "負"]
25
+ POINT = ["点", "點"]
26
+ # PLUS = [u'加', u'加']
27
+ # SIL = [u'杠', u'槓']
28
+
29
+ # 中文数字系统类型
30
+ NUMBERING_TYPES = ["low", "mid", "high"]
text/chn_text_norm/basic_util.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """基本方法
3
+ 创建中文数字系统 方法
4
+ 中文字符串 <=> 数字串 方法
5
+ 数字串 <=> 中文字符串 方法
6
+ """
7
+
8
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
9
+ __data__ = "2019-05-02"
10
+
11
+ from text.chn_text_norm.basic_class import *
12
+ from text.chn_text_norm.basic_constant import *
13
+
14
+
15
def create_system(numbering_type=NUMBERING_TYPES[1]):
    """
    Create and return the number system for the given numbering type
    (default: mid).

    NUMBERING_TYPES = ['low', 'mid', 'high']:
        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
    """

    # Units of '亿' and above; their power depends on numbering_type.
    larger_units = []
    for idx, forms in enumerate(
        zip(
            LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
            LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL,
        )
    ):
        larger_units.append(CNU.create(idx, forms, numbering_type, False))

    # Units '十, 百, 千, 万'.
    smaller_units = []
    for idx, forms in enumerate(
        zip(
            SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
            SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL,
        )
    ):
        smaller_units.append(CNU.create(idx, forms, small_unit=True))

    # Digits 0-9, then the alternative glyphs for 0, 1 and 2.
    digits = []
    for idx, forms in enumerate(
        zip(
            CHINESE_DIGIS,
            CHINESE_DIGIS,
            BIG_CHINESE_DIGIS_SIMPLIFIED,
            BIG_CHINESE_DIGIS_TRADITIONAL,
        )
    ):
        digits.append(CND.create(idx, forms))
    zero, one, two = digits[0], digits[1], digits[2]
    zero.alt_s = ZERO_ALT
    zero.alt_t = ZERO_ALT
    one.alt_s = ONE_ALT
    one.alt_t = ONE_ALT
    two.alt_s = TWO_ALTS[0]
    two.alt_t = TWO_ALTS[1]

    # Sign and decimal-point symbols.
    math_symbols = MathSymbol(
        CM(POSITIVE[0], POSITIVE[1], "+", lambda x: x),
        CM(NEGATIVE[0], NEGATIVE[1], "-", lambda x: -x),
        CM(POINT[0], POINT[1], ".", lambda x, y: float(str(x) + "." + str(y))),
    )

    system = NumberSystem()
    system.units = smaller_units + larger_units
    system.digits = digits
    system.math = math_symbols
    return system
64
+
65
+
66
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
    """
    Convert a Chinese numeral string to a string of Arabic digits.

    The input is split at the decimal-point character ('点' / '點'); the
    integer part is evaluated using digits and units, the decimal part is
    read digit by digit.  Returns "<int>.<dec>" when a decimal part exists,
    otherwise just "<int>".

    NOTE(review): characters that are not part of the number system map to
    None in get_symbol, and more than one point character makes the
    two-value unpacking in string2symbols raise ValueError.
    """

    def get_symbol(char, system):
        # Look the character up among units, then digits, then math symbols.
        # Falls through (returns None) when nothing matches.
        for u in system.units:
            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
                return u
        for d in system.digits:
            if char in [
                d.traditional,
                d.simplified,
                d.big_s,
                d.big_t,
                d.alt_s,
                d.alt_t,
            ]:
                return d
        for m in system.math:
            if char in [m.traditional, m.simplified]:
                return m

    def string2symbols(chinese_string, system):
        # Split at the first point variant present in the string and map each
        # character of both halves to its symbol object.
        int_string, dec_string = chinese_string, ""
        for p in [system.math.point.simplified, system.math.point.traditional]:
            if p in chinese_string:
                int_string, dec_string = chinese_string.split(p)
                break
        return [get_symbol(c, system) for c in int_string], [
            get_symbol(c, system) for c in dec_string
        ]

    def correct_symbols(integer_symbols, system):
        """
        Make implicit units explicit, e.g.
        一百八 -> 一百八十
        一亿一千三百万 -> 一亿 一千万 三百万
        """

        # A leading unit of power 1 gets the digit 1 prepended.
        if integer_symbols and isinstance(integer_symbols[0], CNU):
            if integer_symbols[0].power == 1:
                integer_symbols = [system.digits[1]] + integer_symbols

        # A trailing bare digit right after a unit implies the next lower unit.
        if len(integer_symbols) > 1:
            if isinstance(integer_symbols[-1], CND) and isinstance(
                integer_symbols[-2], CNU
            ):
                integer_symbols.append(
                    CNU(integer_symbols[-2].power - 1, None, None, None, None)
                )

        result = []
        unit_count = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                result.append(s)
                unit_count = 0
            elif isinstance(s, CNU):
                # Work on a copy that carries only the power.
                current_unit = CNU(s.power, None, None, None, None)
                unit_count += 1

            if unit_count == 1:
                result.append(current_unit)
            elif unit_count > 1:
                # Consecutive units: fold this unit's power into every earlier,
                # smaller unit instead of appending it.
                for i in range(len(result)):
                    if (
                        isinstance(result[-i - 1], CNU)
                        and result[-i - 1].power < current_unit.power
                    ):
                        result[-i - 1] = CNU(
                            result[-i - 1].power + current_unit.power,
                            None,
                            None,
                            None,
                            None,
                        )
        return result

    def compute_value(integer_symbols):
        """
        Compute the value.
        When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
        """
        # ``value`` is a list of partial sums; the last slot is the group
        # currently being built.
        value = [0]
        last_power = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                value[-1] = s.value
            elif isinstance(s, CNU):
                value[-1] *= pow(10, s.power)
                if s.power > last_power:
                    value[:-1] = list(map(lambda v: v * pow(10, s.power), value[:-1]))
                    last_power = s.power
                value.append(0)
        return sum(value)

    system = create_system(numbering_type)
    int_part, dec_part = string2symbols(chinese_string, system)
    int_part = correct_symbols(int_part, system)
    int_str = str(compute_value(int_part))
    dec_str = "".join([str(d.value) for d in dec_part])
    if dec_part:
        return "{0}.{1}".format(int_str, dec_str)
    else:
        return int_str
169
+
170
+
171
def num2chn(
    number_string,
    numbering_type=NUMBERING_TYPES[1],
    big=False,
    traditional=False,
    alt_zero=False,
    alt_one=False,
    alt_two=True,
    use_zeros=True,
    use_units=True,
):
    """
    Convert a string of Arabic digits (optionally with one '.') to Chinese.

    Parameters:
        number_string: digits with at most one '.'.
        numbering_type: one of NUMBERING_TYPES, passed to create_system.
        big: render with the ``big_s`` / ``big_t`` glyphs.
        traditional: pick the traditional instead of the simplified glyphs.
        alt_zero: replace the zero glyph with the alternative zero (alt_s).
        alt_one: replace the one glyph with the alternative one (alt_s).
        alt_two: use the alternative "two" in front of units (see below).
        use_zeros: NOTE(review): accepted but never read in this function;
            the inner get_value always runs with its own default.
        use_units: when false (or when the integer part is one character)
            the integer part is read out digit by digit.

    Raises ValueError if ``number_string`` contains more than one '.'.
    """

    def get_value(value_string, use_zeros=True):
        # Recursively turn a digit string into a list of digit/unit symbols.

        striped_string = value_string.lstrip("0")

        # record nothing if all zeros
        if not striped_string:
            return []

        # record one digits
        elif len(striped_string) == 1:
            # Leading zeros were stripped: emit a single explicit zero first.
            if use_zeros and len(value_string) != len(striped_string):
                return [system.digits[0], system.digits[int(striped_string)]]
            else:
                return [system.digits[int(striped_string)]]

        # recursively record multiple digits
        else:
            # Largest unit whose power is smaller than the number of digits.
            result_unit = next(
                u for u in reversed(system.units) if u.power < len(striped_string)
            )
            result_string = value_string[: -result_unit.power]
            return (
                get_value(result_string)
                + [result_unit]
                + get_value(striped_string[-result_unit.power :])
            )

    system = create_system(numbering_type)

    int_dec = number_string.split(".")
    if len(int_dec) == 1:
        int_string = int_dec[0]
        dec_string = ""
    elif len(int_dec) == 2:
        int_string = int_dec[0]
        dec_string = int_dec[1]
    else:
        raise ValueError(
            "invalid input num string with more than one dot: {}".format(number_string)
        )

    if use_units and len(int_string) > 1:
        result_symbols = get_value(int_string)
    else:
        result_symbols = [system.digits[int(c)] for c in int_string]
    # The decimal part is always read digit by digit.
    dec_symbols = [system.digits[int(c)] for c in dec_string]
    if dec_string:
        result_symbols += [system.math.point] + dec_symbols

    if alt_two:
        # A "two" whose plain forms are the alternative glyphs; its big forms
        # stay the regular big glyphs.
        liang = CND(
            2,
            system.digits[2].alt_s,
            system.digits[2].alt_t,
            system.digits[2].big_s,
            system.digits[2].big_t,
        )
        for i, v in enumerate(result_symbols):
            if isinstance(v, CND) and v.value == 2:
                next_symbol = (
                    result_symbols[i + 1] if i < len(result_symbols) - 1 else None
                )
                previous_symbol = result_symbols[i - 1] if i > 0 else None
                # Swap only when followed by a unit other than power 1 and
                # preceded by nothing or by a unit other than power 1.
                if isinstance(next_symbol, CNU) and isinstance(
                    previous_symbol, (CNU, type(None))
                ):
                    if next_symbol.power != 1 and (
                        (previous_symbol is None) or (previous_symbol.power != 1)
                    ):
                        result_symbols[i] = liang

    # if big is True, '两' will not be used and `alt_two` has no impact on output
    if big:
        attr_name = "big_"
        if traditional:
            attr_name += "t"
        else:
            attr_name += "s"
    else:
        if traditional:
            attr_name = "traditional"
        else:
            attr_name = "simplified"

    result = "".join([getattr(s, attr_name) for s in result_symbols])

    # if not use_zeros:
    #     result = result.strip(getattr(system.digits[0], attr_name))

    if alt_zero:
        result = result.replace(
            getattr(system.digits[0], attr_name), system.digits[0].alt_s
        )

    if alt_one:
        result = result.replace(
            getattr(system.digits[1], attr_name), system.digits[1].alt_s
        )

    # A result that starts with the point character gets a leading zero.
    for i, p in enumerate(POINT):
        if result.startswith(p):
            return CHINESE_DIGIS[0] + result

    # ^10, 11, .., 19
    # Drop a leading "one" when it is directly followed by the "ten" unit.
    if (
        len(result) >= 2
        and result[1]
        in [
            SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
            SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0],
        ]
        and result[0]
        in [
            CHINESE_DIGIS[1],
            BIG_CHINESE_DIGIS_SIMPLIFIED[1],
            BIG_CHINESE_DIGIS_TRADITIONAL[1],
        ]
    ):
        result = result[1:]

    return result
304
+
305
+
306
if __name__ == "__main__":

    # Manual smoke test for chn2num / num2chn.
    all_chinese_number_string = (
        CHINESE_DIGIS
        + BIG_CHINESE_DIGIS_SIMPLIFIED
        + BIG_CHINESE_DIGIS_TRADITIONAL
        + LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED
        + LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL
        + SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED
        + SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL
        + ZERO_ALT
        + ONE_ALT
        + "".join(TWO_ALTS + POSITIVE + NEGATIVE + POINT)
    )

    print("num:", chn2num("一万零四百零三点八零五"))
    print("num:", chn2num("一亿六点三"))
    print("num:", chn2num("一亿零六点三"))
    print("num:", chn2num("两千零一亿六点三"))
    # print('num:', chn2num('一零零八六'))
    print("txt:", num2chn("10260.03", alt_zero=True))
    print("txt:", num2chn("20037.090", numbering_type="low", traditional=True))
    print("txt:", num2chn("100860001.77", numbering_type="high", big=True))
    # num2chn has no ``use_lzeros`` / ``use_rzeros`` parameters; passing them
    # raised TypeError.  The only zero-related keyword it accepts is
    # ``use_zeros``.
    print(
        "txt:",
        num2chn(
            "059523810880",
            alt_one=True,
            alt_two=False,
            use_zeros=True,
            use_units=False,
        ),
    )

    print(all_chinese_number_string)
text/chn_text_norm/cardinal.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """CARDINAL类 (包含小数DECIMAL类)
3
+ 纯数 <=> 中文字符串 方法
4
+ 中文字符串 <=> 纯数 方法
5
+ """
6
+
7
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
8
+ __data__ = "2019-05-03"
9
+
10
+ from text.chn_text_norm.basic_util import *
11
+
12
+
13
class Cardinal:
    """
    Cardinal number (decimals included).

    Holds the numeric string in ``cardinal`` and the Chinese reading in
    ``chntext``; each method converts one into the other and returns it.
    """

    def __init__(self, cardinal=None, chntext=None):
        self.cardinal = cardinal
        self.chntext = chntext

    def chntext2cardinal(self):
        """Return the numeric string for ``self.chntext``."""
        spoken = self.chntext
        return chn2num(spoken)

    def cardinal2chntext(self):
        """Return the Chinese reading of ``self.cardinal``."""
        number = self.cardinal
        return num2chn(number)
27
+
28
+
29
+ if __name__ == "__main__":
30
+
31
+ # 测试程序
32
+ print(Cardinal(cardinal="21357.230").cardinal2chntext())
text/chn_text_norm/date.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """DATE类
3
+ 日期 <=> 中文字符串 方法
4
+ 中文字符串 <=> 日期 方法
5
+ """
6
+
7
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
8
+ __data__ = "2019-05-07"
9
+
10
+ from text.chn_text_norm.cardinal import Cardinal
11
+ from text.chn_text_norm.digit import Digit
12
+
13
+
14
class Date:
    """
    Date expression such as "1999年2月20日".

    ``date`` holds the digit form, ``chntext`` the Chinese reading produced
    by date2chntext().
    """

    def __init__(self, date=None, chntext=None):
        self.date = date
        self.chntext = chntext

    # NOTE: the reverse conversion (chntext -> date) is not implemented.

    def date2chntext(self):
        """
        Convert ``self.date`` to its Chinese reading, store it in
        ``self.chntext`` and return it.

        The year (text before '年') is read digit by digit, the month (text
        before '月') and the day are read as cardinals; the final character of
        the day part (e.g. '日' / '号') is kept as is.
        """
        date = self.date
        try:
            year, other = date.strip().split("年", maxsplit=1)
            year = Digit(digit=year).digit2chntext() + "年"
        except ValueError:
            # No '年' (unpacking failed) or a non-numeric year: treat the
            # whole string as the month/day part.
            other = date
            year = ""
        if other:
            try:
                month, day = other.strip().split("月", maxsplit=1)
                month = Cardinal(cardinal=month).cardinal2chntext() + "月"
            except ValueError:
                # No month: what is left after the year is the day part.  The
                # original used ``date`` here, which fed an already-consumed
                # year (e.g. "09年16日") back into the day conversion.
                day = other
                month = ""
            if day:
                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
        else:
            month = ""
            day = ""
        chntext = year + month + day
        self.chntext = chntext
        return self.chntext
70
+
71
+
72
+ if __name__ == "__main__":
73
+
74
+ # 测试
75
+ print(Date(date="09年3月16日").date2chntext())
text/chn_text_norm/digit.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """DIGIT类
3
+ 数字串 <=> 中文字符串 方法
4
+ 中文字符串 <=> 数字串 方法
5
+ """
6
+
7
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
8
+ __data__ = "2019-05-03"
9
+
10
+ from text.chn_text_norm.basic_util import *
11
+
12
+
13
class Digit:
    """
    Digit string that is read out one digit at a time.

    ``digit`` holds the digit string, ``chntext`` an optional Chinese
    reading.  Only the digit -> Chinese direction is implemented.
    """

    def __init__(self, digit=None, chntext=None):
        self.digit = digit
        self.chntext = chntext

    def digit2chntext(self):
        """Return ``self.digit`` read digit by digit (no units, plain two)."""
        options = {"alt_two": False, "use_units": False}
        return num2chn(self.digit, **options)
27
+
28
+
29
+ if __name__ == "__main__":
30
+
31
+ # 测试程序
32
+ print(Digit(digit="2016").digit2chntext())
text/chn_text_norm/fraction.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """FRACTION类
3
+ 分数 <=> 中文字符串 方法
4
+ 中文字符串 <=> 分数 方法
5
+ """
6
+
7
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
8
+ __data__ = "2019-05-03"
9
+
10
+ from text.chn_text_norm.basic_util import *
11
+
12
+
13
class Fraction:
    """
    Fraction written as "numerator/denominator" (``fraction``) or in Chinese
    as "<denominator>分之<numerator>" (``chntext``).
    """

    def __init__(self, fraction=None, chntext=None):
        self.fraction = fraction
        self.chntext = chntext

    def chntext2fraction(self):
        """Return "numerator/denominator" for ``self.chntext``."""
        # In Chinese the denominator is spoken first.
        parts = self.chntext.split("分之")
        denominator, numerator = parts
        top = chn2num(numerator)
        bottom = chn2num(denominator)
        return top + "/" + bottom

    def fraction2chntext(self):
        """Return the Chinese reading of ``self.fraction``."""
        parts = self.fraction.split("/")
        numerator, denominator = parts
        bottom = num2chn(denominator)
        top = num2chn(numerator)
        return bottom + "分之" + top
29
+
30
+
31
+ if __name__ == "__main__":
32
+
33
+ # 测试程序
34
+ print(Fraction(fraction="2135/7230").fraction2chntext())
35
+ print(Fraction(chntext="五百八十一分之三百六十九").chntext2fraction())
text/chn_text_norm/money.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """MONEY类
3
+ 金钱 <=> 中文字符串 方法
4
+ 中文字符串 <=> 金钱 方法
5
+ """
6
+ import re
7
+
8
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
9
+ __data__ = "2019-05-08"
10
+
11
+ from text.chn_text_norm.cardinal import Cardinal
12
+
13
+
14
class Money:
    """
    Money expression such as "21.5万元" or "230块5毛".

    ``money`` holds the raw text, ``chntext`` the Chinese reading produced
    by money2chntext().  Only the money -> Chinese direction is implemented.
    """

    def __init__(self, money=None, chntext=None):
        self.money = money
        self.chntext = chntext

    def money2chntext(self):
        """
        Replace every number in ``self.money`` with its Chinese reading,
        store the result in ``self.chntext`` and return it.

        Each match is substituted at its own position in a single pass.  The
        original collected the numbers first and then called ``str.replace``
        for each one, which replaced every occurrence of the digits anywhere
        in the text, so an earlier short number could corrupt a later one
        (e.g. "2块12" became "二块1二").
        """
        pattern = re.compile(r"(\d+(\.\d+)?)")
        self.chntext = pattern.sub(
            lambda match: Cardinal(cardinal=match.group(1)).cardinal2chntext(),
            self.money,
        )
        return self.chntext
37
+
38
+
39
+ if __name__ == "__main__":
40
+
41
+ # 测试
42
+ print(Money(money="21.5万元").money2chntext())
43
+ print(Money(money="230块5毛").money2chntext())
text/chn_text_norm/percentage.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """PERCENTAGE类
3
+ 百分数 <=> 中文字符串 方法
4
+ 中文字符串 <=> 百分数 方法
5
+ """
6
+
7
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
8
+ __data__ = "2019-05-06"
9
+
10
+ from text.chn_text_norm.basic_util import *
11
+
12
+
13
class Percentage:
    """
    Percentage written as "65.3%" (``percentage``) or in Chinese as
    "百分之..." (``chntext``).
    """

    def __init__(self, percentage=None, chntext=None):
        self.percentage = percentage
        self.chntext = chntext

    def chntext2percentage(self):
        """Return the "<number>%" form of ``self.chntext``."""
        text = self.chntext.strip()
        # Remove the "百分之" prefix as a whole.  ``str.strip("百分之")`` treats
        # its argument as a set of characters and strips them from both ends,
        # so "百分之一百" was reduced to "一" (1%) instead of "一百" (100%).
        prefix = "百分之"
        if text.startswith(prefix):
            text = text[len(prefix):]
        return chn2num(text) + "%"

    def percentage2chntext(self):
        """Return the Chinese reading of ``self.percentage``."""
        return "百分之" + num2chn(self.percentage.strip().strip("%"))
27
+
28
+
29
+ if __name__ == "__main__":
30
+
31
+ # 测试程序
32
+ print(Percentage(chntext="百分之五十六点零三").chntext2percentage())
33
+ print(Percentage(percentage="65.3%").percentage2chntext())
text/chn_text_norm/telephone.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """TELEPHONE类
3
+ 电话号码 <=> 中文字符串 方法
4
+ 中文字符串 <=> 电话号码 方法
5
+ """
6
+
7
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
8
+ __data__ = "2019-05-03"
9
+
10
+ from text.chn_text_norm.basic_util import *
11
+
12
+
13
class TelePhone:
    """
    Telephone number (mobile or fixed line).

    ``telephone`` holds the digit form, ``raw_chntext`` the Chinese reading
    with pause markers between groups, ``chntext`` the reading without them.
    Only the telephone -> Chinese direction is implemented.
    """

    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
        self.telephone = telephone
        self.raw_chntext = raw_chntext
        self.chntext = chntext

    def telephone2chntext(self, fixed=False):
        """
        Read ``self.telephone`` digit by digit and return the result.

        With ``fixed`` the number is split at '-' and groups are joined with
        "<SIL>"; otherwise '+' is stripped from both ends, the number is split
        at whitespace and groups are joined with "<SP>".  The marked form goes
        to ``self.raw_chntext``, the marker-free form to ``self.chntext``.
        """
        if fixed:
            marker = "<SIL>"
            groups = self.telephone.split("-")
        else:
            marker = "<SP>"
            groups = self.telephone.strip("+").split()

        spoken_groups = []
        for group in groups:
            spoken_groups.append(num2chn(group, alt_two=False, use_units=False))

        self.raw_chntext = marker.join(spoken_groups)
        self.chntext = self.raw_chntext.replace(marker, "")
        return self.chntext
45
+
46
+
47
+ if __name__ == "__main__":
48
+
49
+ # 测试程序
50
+ print(TelePhone(telephone="0595-23980880").telephone2chntext())
51
+ # print(TelePhone(raw_chntext='零五九五杠二三八六五零九八').chntext2telephone())
text/chn_text_norm/text.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ TEXT类
4
+ """
5
+
6
+ __author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
7
+ __data__ = "2019-05-03"
8
+
9
+ import re
10
+ from text.chn_text_norm.cardinal import Cardinal
11
+ from text.chn_text_norm.date import Date
12
+ from text.chn_text_norm.digit import Digit
13
+ from text.chn_text_norm.fraction import Fraction
14
+ from text.chn_text_norm.money import Money
15
+ from text.chn_text_norm.percentage import Percentage
16
+ from text.chn_text_norm.telephone import TelePhone
17
+
18
+
19
+ CURRENCY_NAMES = (
20
+ "(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|"
21
+ "里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)"
22
+ )
23
+ CURRENCY_UNITS = "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)"
24
+ COM_QUANTIFIERS = (
25
+ "(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|"
26
+ "砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|"
27
+ "针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|"
28
+ "毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|"
29
+ "盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|"
30
+ "纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|人|抽)"
31
+ )
32
+
33
+
34
class Text:
    """
    A piece of raw text plus its normalized form.

    normalize() rewrites numeric expressions as Chinese words, applying the
    passes in this order: dates, money, mobile numbers, fixed-line numbers,
    fractions, percentages, number + quantifier, long digit strings, and
    finally any remaining plain numbers.
    """

    def __init__(self, raw_text, norm_text=None):
        # "^" / "$" sentinels wrap the text; presumably so that patterns
        # anchored on a neighbouring non-digit (\D) can also match at the very
        # start or end.  normalize() strips them again before returning.
        self.raw_text = "^" + raw_text + "$"
        self.norm_text = norm_text

    def _particular(self):
        """Turn '二' between two runs of ASCII letters back into '2'
        (e.g. the 'O2O' / 'B2C' samples in this module's self-test)."""
        text = self.norm_text
        pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
        matchers = pattern.findall(text)
        if matchers:
            # print('particular')
            for matcher in matchers:
                text = text.replace(matcher[0], matcher[1] + "2" + matcher[2], 1)
        self.norm_text = text
        return self.norm_text

    def normalize(self):
        """Normalize ``self.raw_text``, store the result in ``self.norm_text``
        and return it without the sentinel characters."""
        text = self.raw_text

        # Dates
        pattern = re.compile(
            r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)"
        )
        matchers = pattern.findall(text)
        if matchers:
            # print('date')
            for matcher in matchers:
                text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)

        # Money
        # Every fragment is a raw string: "\d" in a normal string literal is an
        # invalid escape sequence (a warning today, an error in the future).
        pattern = re.compile(
            r"\D+((\d+(\.\d+)?)[多余几]?"
            + CURRENCY_UNITS
            + r"(\d"
            + CURRENCY_UNITS
            + r"?)?)"
        )
        matchers = pattern.findall(text)
        if matchers:
            # print('money')
            for matcher in matchers:
                text = text.replace(
                    matcher[0], Money(money=matcher[0]).money2chntext(), 1
                )

        # Fixed-line / mobile phone numbers
        # Mobile
        # http://www.jihaoba.com/news/show/13680
        # China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
        # China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
        # China Telecom: 133, 153, 189, 180, 181, 177
        pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
        matchers = pattern.findall(text)
        if matchers:
            # print('telephone')
            for matcher in matchers:
                text = text.replace(
                    matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1
                )
        # Fixed line
        pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
        matchers = pattern.findall(text)
        if matchers:
            # print('fixed telephone')
            for matcher in matchers:
                text = text.replace(
                    matcher[0],
                    TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True),
                    1,
                )

        # Fractions
        pattern = re.compile(r"(\d+/\d+)")
        matchers = pattern.findall(text)
        if matchers:
            # print('fraction')
            for matcher in matchers:
                text = text.replace(
                    matcher, Fraction(fraction=matcher).fraction2chntext(), 1
                )

        # Percentages
        # Fold the full-width percent sign into the ASCII one first.  The call
        # previously replaced "%" with "%", a no-op; the full-width character
        # appears to have been lost when the source was normalized.
        text = text.replace("%", "%")
        pattern = re.compile(r"(\d+(\.\d+)?%)")
        matchers = pattern.findall(text)
        if matchers:
            # print('percentage')
            for matcher in matchers:
                text = text.replace(
                    matcher[0],
                    Percentage(percentage=matcher[0]).percentage2chntext(),
                    1,
                )

        # Plain number followed by a quantifier
        pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
        matchers = pattern.findall(text)
        if matchers:
            # print('cardinal+quantifier')
            for matcher in matchers:
                text = text.replace(
                    matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1
                )

        # Digit strings (IDs / serial numbers), read digit by digit
        pattern = re.compile(r"(\d{4,32})")
        matchers = pattern.findall(text)
        if matchers:
            # print('digit')
            for matcher in matchers:
                text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)

        # Remaining plain numbers
        pattern = re.compile(r"(\d+(\.\d+)?)")
        matchers = pattern.findall(text)
        if matchers:
            # print('cardinal')
            for matcher in matchers:
                text = text.replace(
                    matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1
                )

        self.norm_text = text
        self._particular()

        return self.norm_text.lstrip("^").rstrip("$")
164
+
165
+
166
+ if __name__ == "__main__":
167
+
168
+ # 测试程序
169
+ print(Text(raw_text="固话:0595-23865596或23880880。").normalize())
170
+ print(Text(raw_text="手机:+86 19859213959或15659451527。").normalize())
171
+ print(Text(raw_text="分数:32477/76391。").normalize())
172
+ print(Text(raw_text="百分数:80.03%。").normalize())
173
+ print(Text(raw_text="编号:31520181154418。").normalize())
174
+ print(Text(raw_text="纯数:2983.07克或12345.60米。").normalize())
175
+ print(Text(raw_text="日期:1999年2月20日或09年3月15号。").normalize())
176
+ print(Text(raw_text="金钱:12块5,34.5元,20.1万").normalize())
177
+ print(Text(raw_text="特殊:O2O或B2C。").normalize())
text/clean.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Punctuation to normalize before splitting: each key is replaced by its value,
# and an empty value removes the symbol altogether.
#
# Full-width variants (U+FFxx) are written as escapes on purpose. Spelled as
# literals they look identical to their ASCII twins, and once a tool
# width-normalizes the file the pairs collapse into duplicate dict keys, which
# Python accepts silently while dropping the full-width mapping.
SYMBOLS_MAPPING = {
    "\n": "",
    "…": ".",
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "(": "",
    ")": "",
    "\uff08": "",  # fullwidth left parenthesis
    "\uff09": "",  # fullwidth right parenthesis
    "・": "",
    "·": "",
    "「": "'",
    "」": "'",
    "《": "'",
    "》": "'",
    "—": "",
    "~": "",
    "\uff5e": "",  # fullwidth tilde
    ":": ",",
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    "\uff1a": ",",  # fullwidth colon
}

# Alternation of every (escaped) key above; used to find the symbols to replace.
REPLACE_SYMBOL_REGEX = re.compile("|".join(re.escape(p) for p in SYMBOLS_MAPPING))


# Matches runs of characters from four emoji-related Unicode ranges.
EMOJI_REGEX = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
47
+
48
+
49
def clean_text(text):
    """Return *text* with punctuation normalized and emojis removed.

    Steps, in order: strip surrounding whitespace, substitute every symbol
    listed in ``SYMBOLS_MAPPING``, delete characters matched by
    ``EMOJI_REGEX``, then collapse any run of two or more "." / ","
    characters down to the first character of that run.
    """

    def _mapped_symbol(match):
        return SYMBOLS_MAPPING[match.group()]

    def _first_of_run(match):
        return match.group()[0]

    trimmed = text.strip()
    normalized = REPLACE_SYMBOL_REGEX.sub(_mapped_symbol, trimmed)
    without_emoji = EMOJI_REGEX.sub(r"", normalized)
    return re.sub(r"[.,]{2,}", _first_of_run, without_emoji)
text/spliter.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+
4
+ from text.clean import clean_text
5
+
6
+
7
def utf_8_len(text):
    """Return the number of bytes *text* occupies when encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return len(encoded)
9
+
10
+
11
def break_text(texts, length, splits: set):
    """Lazily cut over-long pieces right after each delimiter character.

    Args:
        texts: iterable of strings to process.
        length: size limit in UTF-8 bytes; pieces within it pass through
            untouched.
        splits: characters after which an over-long piece is cut. The
            delimiter stays attached to the end of the piece before the cut.

    Yields:
        The pieces in order. Sub-pieces are not re-checked against *length*.
    """
    for piece in texts:
        if utf_8_len(piece) > length:
            start = 0
            for idx, char in enumerate(piece):
                if char in splits:
                    # Cut just past the delimiter so it closes this sub-piece.
                    yield piece[start : idx + 1]
                    start = idx + 1

            tail = piece[start:]
            if tail:
                yield tail
        else:
            yield piece
27
+
28
+
29
def break_text_by_length(texts, length):
    """Lazily cut over-long pieces into chunks of at most *length* UTF-8 bytes.

    Pieces that already fit are yielded unchanged. Longer pieces are cut
    between characters, never inside one. The size check happens *before* a
    character is added, so a multi-byte character can no longer push a chunk
    past the limit. The only chunk that may exceed *length* is a single
    character that is itself wider than *length*, because it cannot be split.

    Args:
        texts: iterable of strings to process.
        length: chunk size limit in UTF-8 bytes.

    Yields:
        The chunks, in order.
    """
    for text in texts:
        if utf_8_len(text) <= length:
            yield text
            continue

        chunk = []
        chunk_bytes = 0
        for char in text:
            char_bytes = utf_8_len(char)

            # Close the current chunk first if this character would overflow it.
            if chunk and chunk_bytes + char_bytes > length:
                yield "".join(chunk)
                chunk = []
                chunk_bytes = 0

            chunk.append(char)
            # Track the running size instead of re-encoding the whole chunk.
            chunk_bytes += char_bytes

        if chunk:
            yield "".join(chunk)
45
+
46
+
47
def add_cleaned(curr, segments):
    """Append the stripped *curr* to *segments* if it carries real content.

    A candidate is skipped when, after stripping, it is empty or consists
    solely of whitespace and ASCII punctuation (``string.punctuation``).
    *segments* is modified in place; nothing is returned.
    """
    candidate = curr.strip()
    has_content = any(
        not (char.isspace() or char in string.punctuation) for char in candidate
    )
    if has_content:
        segments.append(candidate)
51
+
52
+
53
def protect_float(text):
    """Rewrite decimals such as ``3.14`` as ``<3_f_14>``.

    The placeholder has no "." in it, so a later split on "." leaves the
    number intact.
    """
    decimal_pattern = re.compile(r"(\d+)\.(\d+)")
    placeholder = r"<\1_f_\2>"
    return decimal_pattern.sub(placeholder, text)
56
+
57
+
58
def unprotect_float(text):
    """Turn ``<3_f_14>`` placeholders back into decimals such as ``3.14``."""
    placeholder_pattern = re.compile(r"<(\d+)_f_(\d+)>")
    decimal_template = r"\1.\2"
    return placeholder_pattern.sub(decimal_template, text)
61
+
62
+
63
def split_text(text, length):
    """Clean *text* and split it into segments of roughly *length* UTF-8 bytes.

    The text is first broken into small pieces, using ever finer delimiters
    only for pieces that are still too long:

    1. sentence terminators (a "." inside a decimal number is protected);
    2. commas;
    3. spaces;
    4. a hard cut between arbitrary characters.

    Consecutive pieces are then merged greedily while the merged size stays
    within *length*. Every segment is stripped, and segments made up only of
    whitespace/ASCII punctuation are dropped (see ``add_cleaned``).

    Args:
        text: raw input text.
        length: target segment size, measured in UTF-8 bytes.

    Returns:
        List of segment strings, in reading order.
    """
    # Full-width marks are spelled as escapes so they remain distinct from
    # their ASCII look-alikes; as literals they silently turn into duplicate
    # set members if the file is ever width-normalized.
    sentence_ends = {".", "!", "?", "。", "\uff01", "\uff1f"}  # 。 plus fullwidth ! ?
    clause_ends = {",", "\uff0c"}  # ASCII and fullwidth comma

    text = clean_text(text)

    # Lazy pipeline: each stage only touches pieces still longer than `length`.
    texts = [text]
    texts = map(protect_float, texts)
    texts = break_text(texts, length, sentence_ends)
    texts = map(unprotect_float, texts)
    texts = break_text(texts, length, clause_ends)
    texts = break_text(texts, length, {" "})
    texts = break_text_by_length(texts, length)

    # Greedily merge neighbouring pieces back together up to the byte budget.
    segments = []
    curr = ""

    for text in texts:
        if utf_8_len(curr) + utf_8_len(text) <= length:
            curr += text
        else:
            add_cleaned(curr, segments)
            curr = text

    if curr:
        add_cleaned(curr, segments)

    return segments
95
+
96
+
97
if __name__ == "__main__":
    # Self-test for split_text.
    # Each case is (input text, byte budget, expected segments).
    cases = [
        (
            "This is a test sentence. This is another test sentence. And a third one.",
            50,
            [
                "This is a test sentence.",
                "This is another test sentence. And a third one.",
            ],
        ),
        # A decimal point must not be treated as a sentence end.
        ("a,aaaaaa3.14", 10, ["a,", "aaaaaa3.14"]),
        (" ", 10, []),
        ("a", 10, ["a"]),
        (
            "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines.",
            50,
            [
                "This is a test sentence with only commas,",
                "and no dots, and no exclamation marks,",
                "and no question marks, and no newlines.",
            ],
        ),
        # First half split at " ", second half split at ","
        (
            "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence.",
            50,
            [
                "This is a test sentence This is a test sentence",
                "This is a test sentence. This is a test sentence,",
                "This is a test sentence, This is a test sentence.",
            ],
        ),
        # clean_text has no mapping for "。", so the ideographic full stop
        # survives in the output; the expectation used to end with "." and
        # made this self-test fail.
        (
            "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。",
            50,
            [
                "这是一段很长的中文文本,",
                "而且没有句号,也没有感叹号,",
                "也没有问号,也没有换行符。",
            ],
        ),
    ]

    for text, length, expected in cases:
        actual = split_text(text, length)
        assert actual == expected, (text, length, actual)