Spaces:
Running
on
Zero
Running
on
Zero
Huakang Chen
commited on
Commit
·
e58ed1d
1
Parent(s):
37417b4
add text
Browse files- text/__init__.py +4 -0
- text/__pycache__/__init__.cpython-310.pyc +0 -0
- text/__pycache__/__init__.cpython-38.pyc +0 -0
- text/__pycache__/clean.cpython-310.pyc +0 -0
- text/__pycache__/clean.cpython-38.pyc +0 -0
- text/__pycache__/spliter.cpython-310.pyc +0 -0
- text/__pycache__/spliter.cpython-38.pyc +0 -0
- text/chn_text_norm/.gitignore +114 -0
- text/chn_text_norm/README.md +36 -0
- text/chn_text_norm/__init__.py +0 -0
- text/chn_text_norm/basic_class.py +172 -0
- text/chn_text_norm/basic_constant.py +30 -0
- text/chn_text_norm/basic_util.py +342 -0
- text/chn_text_norm/cardinal.py +32 -0
- text/chn_text_norm/date.py +75 -0
- text/chn_text_norm/digit.py +32 -0
- text/chn_text_norm/fraction.py +35 -0
- text/chn_text_norm/money.py +43 -0
- text/chn_text_norm/percentage.py +33 -0
- text/chn_text_norm/telephone.py +51 -0
- text/chn_text_norm/text.py +177 -0
- text/clean.py +62 -0
- text/spliter.py +130 -0
text/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .clean import clean_text
|
| 2 |
+
from .spliter import split_text
|
| 3 |
+
|
| 4 |
+
__all__ = ["clean_text", "split_text"]
|
text/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (269 Bytes). View file
|
|
|
text/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (258 Bytes). View file
|
|
|
text/__pycache__/clean.cpython-310.pyc
ADDED
|
Binary file (1.34 kB). View file
|
|
|
text/__pycache__/clean.cpython-38.pyc
ADDED
|
Binary file (1.22 kB). View file
|
|
|
text/__pycache__/spliter.cpython-310.pyc
ADDED
|
Binary file (3.19 kB). View file
|
|
|
text/__pycache__/spliter.cpython-38.pyc
ADDED
|
Binary file (3.21 kB). View file
|
|
|
text/chn_text_norm/.gitignore
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
*.egg-info/
|
| 24 |
+
.installed.cfg
|
| 25 |
+
*.egg
|
| 26 |
+
MANIFEST
|
| 27 |
+
|
| 28 |
+
# PyInstaller
|
| 29 |
+
# Usually these files are written by a python script from a template
|
| 30 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 31 |
+
*.manifest
|
| 32 |
+
*.spec
|
| 33 |
+
|
| 34 |
+
# Installer logs
|
| 35 |
+
pip-log.txt
|
| 36 |
+
pip-delete-this-directory.txt
|
| 37 |
+
|
| 38 |
+
# Unit test / coverage reports
|
| 39 |
+
htmlcov/
|
| 40 |
+
.tox/
|
| 41 |
+
.coverage
|
| 42 |
+
.coverage.*
|
| 43 |
+
.cache
|
| 44 |
+
nosetests.xml
|
| 45 |
+
coverage.xml
|
| 46 |
+
*.cover
|
| 47 |
+
.hypothesis/
|
| 48 |
+
.pytest_cache/
|
| 49 |
+
|
| 50 |
+
# Translations
|
| 51 |
+
*.mo
|
| 52 |
+
*.pot
|
| 53 |
+
|
| 54 |
+
# Django stuff:
|
| 55 |
+
*.log
|
| 56 |
+
local_settings.py
|
| 57 |
+
db.sqlite3
|
| 58 |
+
|
| 59 |
+
# Flask stuff:
|
| 60 |
+
instance/
|
| 61 |
+
.webassets-cache
|
| 62 |
+
|
| 63 |
+
# Scrapy stuff:
|
| 64 |
+
.scrapy
|
| 65 |
+
|
| 66 |
+
# Sphinx documentation
|
| 67 |
+
docs/_build/
|
| 68 |
+
|
| 69 |
+
# PyBuilder
|
| 70 |
+
target/
|
| 71 |
+
|
| 72 |
+
# Jupyter Notebook
|
| 73 |
+
.ipynb_checkpoints
|
| 74 |
+
|
| 75 |
+
# pyenv
|
| 76 |
+
.python-version
|
| 77 |
+
|
| 78 |
+
# celery beat schedule file
|
| 79 |
+
celerybeat-schedule
|
| 80 |
+
|
| 81 |
+
# SageMath parsed files
|
| 82 |
+
*.sage.py
|
| 83 |
+
|
| 84 |
+
# Environments
|
| 85 |
+
.env
|
| 86 |
+
.venv
|
| 87 |
+
env/
|
| 88 |
+
venv/
|
| 89 |
+
ENV/
|
| 90 |
+
env.bak/
|
| 91 |
+
venv.bak/
|
| 92 |
+
|
| 93 |
+
# Spyder project settings
|
| 94 |
+
.spyderproject
|
| 95 |
+
.spyproject
|
| 96 |
+
|
| 97 |
+
# Rope project settings
|
| 98 |
+
.ropeproject
|
| 99 |
+
|
| 100 |
+
# mkdocs documentation
|
| 101 |
+
/site
|
| 102 |
+
|
| 103 |
+
# mypy
|
| 104 |
+
.mypy_cache/
|
| 105 |
+
|
| 106 |
+
# JetBrains PyCharm
|
| 107 |
+
.idea
|
| 108 |
+
|
| 109 |
+
# Customize
|
| 110 |
+
references
|
| 111 |
+
url.txt
|
| 112 |
+
|
| 113 |
+
# Git
|
| 114 |
+
.git
|
text/chn_text_norm/README.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This account is no longer in use; see [Atomicoo](https://github.com/atomicoo) for my latest work.
|
| 2 |
+
|
| 3 |
+
# Chn Text Norm
|
| 4 |
+
|
| 5 |
+
This is a repository for Chinese text normalization (no longer maintained).
|
| 6 |
+
|
| 7 |
+
## Quick Start ##
|
| 8 |
+
|
| 9 |
+
### Git Clone Repo ###
|
| 10 |
+
|
| 11 |
+
Clone this repo into the root directory of the project that needs to use it.
|
| 12 |
+
|
| 13 |
+
cd /path/to/proj
|
| 14 |
+
git clone https://github.com/Joee1995/chn-text-norm.git
|
| 15 |
+
|
| 16 |
+
After that, your directory tree should look like this:
|
| 17 |
+
```
|
| 18 |
+
proj # root of your project
|
| 19 |
+
|--- chn_text_norm # this chn-text-norm tool
|
| 20 |
+
|--- text.py
|
| 21 |
+
|--- ...
|
| 22 |
+
|--- text_normalize.py # your text normalization code
|
| 23 |
+
|--- ...
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### How to Use ? ###
|
| 27 |
+
|
| 28 |
+
# text_normalize.py
|
| 29 |
+
from chn_text_norm.text import *
|
| 30 |
+
|
| 31 |
+
raw_text = 'your raw text'
|
| 32 |
+
text = Text(raw_text=raw_text).normalize()
|
| 33 |
+
|
| 34 |
+
### How to add quantifiers (measure words) ###
|
| 35 |
+
|
| 36 |
+
打开test.py,然后你就知道怎么做了。
|
text/chn_text_norm/__init__.py
ADDED
|
File without changes
|
text/chn_text_norm/basic_class.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""基本类
|
| 3 |
+
中文字符类
|
| 4 |
+
中文数字/数位类
|
| 5 |
+
中文数字类
|
| 6 |
+
中文数位类
|
| 7 |
+
中文数字系统类
|
| 8 |
+
中文数学符号类
|
| 9 |
+
*中文其他符号类
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 13 |
+
__data__ = "2019-05-02"
|
| 14 |
+
|
| 15 |
+
from text.chn_text_norm.basic_constant import NUMBERING_TYPES
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ChineseChar(object):
    """A Chinese character that has a simplified and a traditional form.

    e.g. simplified = '负', traditional = '負'.
    Either form can be selected when rendering text.

    Args:
        simplified: the simplified form (may be None).
        traditional: the traditional form (may be None).
    """

    def __init__(self, simplified, traditional):
        self.simplified = simplified
        self.traditional = traditional
        # NOTE: no per-instance ``__repr__`` is assigned here. Special methods
        # are looked up on the type, so an instance attribute would never be
        # used by repr(); the class-level __repr__ below already delegates to
        # __str__ (including overrides in subclasses).

    def __str__(self):
        # __str__ must return a str; fall back to "" (not None) when neither
        # form is set, otherwise str()/print() would raise TypeError.
        return self.simplified or self.traditional or ""

    def __repr__(self):
        return self.__str__()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ChineseNumberUnit(ChineseChar):
    """A Chinese numeric unit (place-value) character such as '十' or '亿'.

    Besides the simplified/traditional forms, every unit also carries a
    capital (大写) form in both scripts (``big_s`` / ``big_t``), and the
    power of ten it stands for (``power``).
    """

    def __init__(self, power, simplified, traditional, big_s, big_t):
        super(ChineseNumberUnit, self).__init__(simplified, traditional)
        self.power = power
        self.big_s = big_s
        self.big_t = big_t

    def __str__(self):
        return "10^{}".format(self.power)

    @classmethod
    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
        """Build the unit found at position ``index`` of a unit table.

        Args:
            index: zero-based position of the unit inside its table.
            value: pair ``(simplified, traditional)`` for the unit.
            numbering_type: one of ``NUMBERING_TYPES``; ignored when
                ``small_unit`` is true.
            small_unit: true for the small units, whose power is
                ``index + 1``; their second form is used as both capital forms.

        Returns:
            A new instance of ``cls``.

        Raises:
            ValueError: if ``numbering_type`` is not in ``NUMBERING_TYPES``
                (only checked for large units).
        """
        if small_unit:
            return cls(
                power=index + 1,
                simplified=value[0],
                traditional=value[1],
                big_s=value[1],
                big_t=value[1],
            )

        # Large units: only the power depends on the numbering type.
        if numbering_type == NUMBERING_TYPES[0]:
            power = index + 8  # each unit is ten times the previous one
        elif numbering_type == NUMBERING_TYPES[1]:
            power = (index + 2) * 4  # each unit is 10^4 times the previous one
        elif numbering_type == NUMBERING_TYPES[2]:
            power = pow(2, index + 3)  # each unit is the square of the previous one
        else:
            raise ValueError(
                "Counting type should be in {0} ({1} provided).".format(
                    NUMBERING_TYPES, numbering_type
                )
            )
        return cls(
            power=power,
            simplified=value[0],
            traditional=value[1],
            big_s=value[0],
            big_t=value[1],
        )
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class ChineseNumberDigit(ChineseChar):
    """A Chinese digit character.

    Stores the integer ``value`` of the digit, its simplified/traditional
    forms, its capital (大写) forms ``big_s`` / ``big_t`` and optional
    alternative forms ``alt_s`` / ``alt_t``.
    """

    def __init__(
        self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None
    ):
        super(ChineseNumberDigit, self).__init__(simplified, traditional)
        self.value = value
        self.big_s = big_s
        self.big_t = big_t
        self.alt_s = alt_s
        self.alt_t = alt_t

    def __str__(self):
        return str(self.value)

    @classmethod
    def create(cls, i, v):
        """Build the digit with value ``i`` from the 4-item sequence ``v``.

        ``v`` is ``(simplified, traditional, big_s, big_t)``; the alternative
        forms are left as None. Uses ``cls`` so subclasses get instances of
        their own type.
        """
        return cls(i, v[0], v[1], v[2], v[3])
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class ChineseMath(ChineseChar):
    """A Chinese math-symbol character.

    Pairs the simplified/traditional characters with the ASCII ``symbol``
    they stand for and an optional callable ``expression``.
    """

    def __init__(self, simplified, traditional, symbol, expression=None):
        super(ChineseMath, self).__init__(simplified, traditional)
        self.symbol, self.expression = symbol, expression
        # Math symbols have no separate capital form: reuse the plain ones.
        self.big_s, self.big_t = simplified, traditional
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
class NumberSystem(object):
    """Chinese number system.

    A plain attribute container; it defines no attributes or methods itself.
    """
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class MathSymbol(object):
    """Math symbols (simplified/traditional) used by the Chinese number system, e.g.

    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']
    """

    def __init__(self, positive, negative, point):
        self.positive = positive
        self.negative = negative
        self.point = point

    def __iter__(self):
        # Walk every instance attribute value, in assignment order.
        yield from vars(self).values()
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# class OtherSymbol(object):
|
| 163 |
+
# """
|
| 164 |
+
# 其他符号
|
| 165 |
+
# """
|
| 166 |
+
#
|
| 167 |
+
# def __init__(self, sil):
|
| 168 |
+
# self.sil = sil
|
| 169 |
+
#
|
| 170 |
+
# def __iter__(self):
|
| 171 |
+
# for v in self.__dict__.values():
|
| 172 |
+
# yield v
|
text/chn_text_norm/basic_constant.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""基本常量
|
| 3 |
+
中文数字/数位/符号字符常量
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 7 |
+
__data__ = "2019-05-02"
|
| 8 |
+
|
| 9 |
+
CHINESE_DIGIS = "零一二三四五六七八九"
|
| 10 |
+
BIG_CHINESE_DIGIS_SIMPLIFIED = "零壹贰叁肆伍陆柒捌玖"
|
| 11 |
+
BIG_CHINESE_DIGIS_TRADITIONAL = "零壹貳參肆伍陸柒捌玖"
|
| 12 |
+
SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = "十百千万"
|
| 13 |
+
SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = "拾佰仟萬"
|
| 14 |
+
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "亿兆京垓秭穰沟涧正载"
|
| 15 |
+
LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = "億兆京垓秭穰溝澗正載"
|
| 16 |
+
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "十百千万"
|
| 17 |
+
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = "拾佰仟萬"
|
| 18 |
+
|
| 19 |
+
ZERO_ALT = "〇"
|
| 20 |
+
ONE_ALT = "幺"
|
| 21 |
+
TWO_ALTS = ["两", "兩"]
|
| 22 |
+
|
| 23 |
+
POSITIVE = ["正", "正"]
|
| 24 |
+
NEGATIVE = ["负", "負"]
|
| 25 |
+
POINT = ["点", "點"]
|
| 26 |
+
# PLUS = [u'加', u'加']
|
| 27 |
+
# SIL = [u'杠', u'槓']
|
| 28 |
+
|
| 29 |
+
# 中文数字系统类型
|
| 30 |
+
NUMBERING_TYPES = ["low", "mid", "high"]
|
text/chn_text_norm/basic_util.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""基本方法
|
| 3 |
+
创建中文数字系统 方法
|
| 4 |
+
中文字符串 <=> 数字串 方法
|
| 5 |
+
数字串 <=> 中文字符串 方法
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 9 |
+
__data__ = "2019-05-02"
|
| 10 |
+
|
| 11 |
+
from text.chn_text_norm.basic_class import *
|
| 12 |
+
from text.chn_text_norm.basic_constant import *
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def create_system(numbering_type=NUMBERING_TYPES[1]):
    """Create the number system for the given numbering type (default: mid).

    NUMBERING_TYPES = ['low', 'mid', 'high']: Chinese numbering system types
        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.

    Returns the populated NumberSystem (with ``units``, ``digits``, ``math``).
    """

    # units '十, 百, 千, 万'
    small_pairs = zip(
        SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
        SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL,
    )
    smaller_units = [
        CNU.create(idx, pair, small_unit=True) for idx, pair in enumerate(small_pairs)
    ]

    # units '亿' and larger
    large_pairs = zip(
        LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
        LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL,
    )
    larger_units = [
        CNU.create(idx, pair, numbering_type, False)
        for idx, pair in enumerate(large_pairs)
    ]

    # digits: (simplified, traditional, big simplified, big traditional)
    digit_forms = zip(
        CHINESE_DIGIS,
        CHINESE_DIGIS,
        BIG_CHINESE_DIGIS_SIMPLIFIED,
        BIG_CHINESE_DIGIS_TRADITIONAL,
    )
    digits = [CND.create(number, forms) for number, forms in enumerate(digit_forms)]

    # alternative forms exist only for 0, 1 and 2
    zero, one, two = digits[0], digits[1], digits[2]
    zero.alt_s = zero.alt_t = ZERO_ALT
    one.alt_s = one.alt_t = ONE_ALT
    two.alt_s = TWO_ALTS[0]
    two.alt_t = TWO_ALTS[1]

    # math symbols
    math_symbols = MathSymbol(
        CM(POSITIVE[0], POSITIVE[1], "+", lambda x: x),
        CM(NEGATIVE[0], NEGATIVE[1], "-", lambda x: -x),
        CM(POINT[0], POINT[1], ".", lambda x, y: float(str(x) + "." + str(y))),
    )

    system = NumberSystem()
    system.units = smaller_units + larger_units
    system.digits = digits
    system.math = math_symbols
    return system
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
    """Convert a Chinese numeral string into a string of Arabic digits.

    Args:
        chinese_string: the Chinese numeral text, optionally containing one
            decimal point character ('点' / '點').
        numbering_type: one of NUMBERING_TYPES, forwarded to create_system.

    Returns:
        The integer part as a decimal string, followed by "." and the
        decimal digits when a decimal part is present.
    """

    def get_symbol(char, system):
        # Look the character up among units, then digits, then math symbols.
        # Falls through (returns None) when nothing matches.
        for u in system.units:
            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
                return u
        for d in system.digits:
            if char in [
                d.traditional,
                d.simplified,
                d.big_s,
                d.big_t,
                d.alt_s,
                d.alt_t,
            ]:
                return d
        for m in system.math:
            if char in [m.traditional, m.simplified]:
                return m

    def string2symbols(chinese_string, system):
        # Split at the first point character that occurs (simplified form is
        # tried first) and map every character to its symbol.
        # NOTE(review): the two-name unpacking of split(p) raises ValueError
        # if the point character occurs more than once.
        int_string, dec_string = chinese_string, ""
        for p in [system.math.point.simplified, system.math.point.traditional]:
            if p in chinese_string:
                int_string, dec_string = chinese_string.split(p)
                break
        return [get_symbol(c, system) for c in int_string], [
            get_symbol(c, system) for c in dec_string
        ]

    def correct_symbols(integer_symbols, system):
        """
        Normalise abbreviated and stacked units, e.g.
        一百八 to 一百八十
        一亿一千三百万 to 一亿 一千万 三百万
        """

        # A leading unit of power 1 gets an explicit digit 1 in front of it.
        if integer_symbols and isinstance(integer_symbols[0], CNU):
            if integer_symbols[0].power == 1:
                integer_symbols = [system.digits[1]] + integer_symbols

        # A trailing digit directly after a unit gets an implied unit one
        # power below that unit.
        if len(integer_symbols) > 1:
            if isinstance(integer_symbols[-1], CND) and isinstance(
                integer_symbols[-2], CNU
            ):
                integer_symbols.append(
                    CNU(integer_symbols[-2].power - 1, None, None, None, None)
                )

        result = []
        unit_count = 0  # number of units seen since the last digit
        for s in integer_symbols:
            if isinstance(s, CND):
                result.append(s)
                unit_count = 0
            elif isinstance(s, CNU):
                current_unit = CNU(s.power, None, None, None, None)
                unit_count += 1

            if unit_count == 1:
                # first unit after a digit: keep it as is
                result.append(current_unit)
            elif unit_count > 1:
                # a further consecutive unit is not appended; instead its
                # power is added to every smaller unit already collected
                for i in range(len(result)):
                    if (
                        isinstance(result[-i - 1], CNU)
                        and result[-i - 1].power < current_unit.power
                    ):
                        result[-i - 1] = CNU(
                            result[-i - 1].power + current_unit.power,
                            None,
                            None,
                            None,
                            None,
                        )
        return result

    def compute_value(integer_symbols):
        """
        Compute the value.
        When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
        """
        value = [0]  # one partial sum per digit/unit group; the last is open
        last_power = 0  # largest unit power applied so far
        for s in integer_symbols:
            if isinstance(s, CND):
                value[-1] = s.value
            elif isinstance(s, CNU):
                value[-1] *= pow(10, s.power)
                if s.power > last_power:
                    # a new largest unit also scales every earlier group
                    value[:-1] = list(map(lambda v: v * pow(10, s.power), value[:-1]))
                    last_power = s.power
                value.append(0)
        return sum(value)

    system = create_system(numbering_type)
    int_part, dec_part = string2symbols(chinese_string, system)
    int_part = correct_symbols(int_part, system)
    int_str = str(compute_value(int_part))
    # decimal digits are emitted one by one, no unit handling
    dec_str = "".join([str(d.value) for d in dec_part])
    if dec_part:
        return "{0}.{1}".format(int_str, dec_str)
    else:
        return int_str
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def num2chn(
    number_string,
    numbering_type=NUMBERING_TYPES[1],
    big=False,
    traditional=False,
    alt_zero=False,
    alt_one=False,
    alt_two=True,
    use_zeros=True,
    use_units=True,
):
    """Convert a string of Arabic digits into Chinese numeral text.

    Args:
        number_string: digits with at most one "." separating the decimals.
        numbering_type: one of NUMBERING_TYPES, forwarded to create_system.
        big: use the capital forms (``big_s`` / ``big_t``) of each symbol.
        traditional: use traditional instead of simplified forms.
        alt_zero: replace the zero character with its alternative form.
        alt_one: replace the one character with its alternative form.
        alt_two: use the alternative form of two in front of units
            (see the conditions in the body).
        use_zeros: NOTE(review): not forwarded to ``get_value`` below, so it
            currently has no effect - confirm whether that is intended.
        use_units: when false (or the integer part is a single character),
            digits are rendered one by one without units.

    Returns:
        The Chinese numeral string.

    Raises:
        ValueError: if ``number_string`` contains more than one ".".
    """

    def get_value(value_string, use_zeros=True):
        # Recursively turn a digit string into a list of digit/unit symbols.

        striped_string = value_string.lstrip("0")

        # record nothing if all zeros
        if not striped_string:
            return []

        # record one digits
        elif len(striped_string) == 1:
            # leading zeros were stripped: emit one zero before the digit
            if use_zeros and len(value_string) != len(striped_string):
                return [system.digits[0], system.digits[int(striped_string)]]
            else:
                return [system.digits[int(striped_string)]]

        # recursively record multiple digits
        else:
            # first unit, scanning system.units in reverse, whose power is
            # below the number of significant digits
            result_unit = next(
                u for u in reversed(system.units) if u.power < len(striped_string)
            )
            result_string = value_string[: -result_unit.power]
            return (
                get_value(result_string)
                + [result_unit]
                + get_value(striped_string[-result_unit.power :])
            )

    system = create_system(numbering_type)

    int_dec = number_string.split(".")
    if len(int_dec) == 1:
        int_string = int_dec[0]
        dec_string = ""
    elif len(int_dec) == 2:
        int_string = int_dec[0]
        dec_string = int_dec[1]
    else:
        raise ValueError(
            "invalid input num string with more than one dot: {}".format(number_string)
        )

    if use_units and len(int_string) > 1:
        result_symbols = get_value(int_string)
    else:
        result_symbols = [system.digits[int(c)] for c in int_string]
    dec_symbols = [system.digits[int(c)] for c in dec_string]
    if dec_string:
        result_symbols += [system.math.point] + dec_symbols

    if alt_two:
        # a "two" whose plain forms are the alternative characters and whose
        # capital forms are the regular capital two
        liang = CND(
            2,
            system.digits[2].alt_s,
            system.digits[2].alt_t,
            system.digits[2].big_s,
            system.digits[2].big_t,
        )
        for i, v in enumerate(result_symbols):
            if isinstance(v, CND) and v.value == 2:
                next_symbol = (
                    result_symbols[i + 1] if i < len(result_symbols) - 1 else None
                )
                previous_symbol = result_symbols[i - 1] if i > 0 else None
                # swap only when followed by a unit and either first or
                # preceded by a unit ...
                if isinstance(next_symbol, CNU) and isinstance(
                    previous_symbol, (CNU, type(None))
                ):
                    # ... and neither neighbouring unit has power 1
                    if next_symbol.power != 1 and (
                        (previous_symbol is None) or (previous_symbol.power != 1)
                    ):
                        result_symbols[i] = liang

    # if big is True, '两' will not be used and `alt_two` has no impact on output
    if big:
        attr_name = "big_"
        if traditional:
            attr_name += "t"
        else:
            attr_name += "s"
    else:
        if traditional:
            attr_name = "traditional"
        else:
            attr_name = "simplified"

    result = "".join([getattr(s, attr_name) for s in result_symbols])

    # if not use_zeros:
    #     result = result.strip(getattr(system.digits[0], attr_name))

    if alt_zero:
        result = result.replace(
            getattr(system.digits[0], attr_name), system.digits[0].alt_s
        )

    if alt_one:
        result = result.replace(
            getattr(system.digits[1], attr_name), system.digits[1].alt_s
        )

    # text starting with the point character gets a leading zero character
    # and is returned right away
    for i, p in enumerate(POINT):
        if result.startswith(p):
            return CHINESE_DIGIS[0] + result

    # ^10, 11, .., 19
    # drop a leading "one" when it is directly followed by the ten unit
    if (
        len(result) >= 2
        and result[1]
        in [
            SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
            SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0],
        ]
        and result[0]
        in [
            CHINESE_DIGIS[1],
            BIG_CHINESE_DIGIS_SIMPLIFIED[1],
            BIG_CHINESE_DIGIS_TRADITIONAL[1],
        ]
    ):
        result = result[1:]

    return result
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
if __name__ == "__main__":

    # Smoke test: exercise both conversion directions and print the results.
    all_chinese_number_string = (
        CHINESE_DIGIS
        + BIG_CHINESE_DIGIS_SIMPLIFIED
        + BIG_CHINESE_DIGIS_TRADITIONAL
        + LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED
        + LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL
        + SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED
        + SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL
        + ZERO_ALT
        + ONE_ALT
        + "".join(TWO_ALTS + POSITIVE + NEGATIVE + POINT)
    )

    print("num:", chn2num("一万零四百零三点八零五"))
    print("num:", chn2num("一亿六点三"))
    print("num:", chn2num("一亿零六点三"))
    print("num:", chn2num("两千零一亿六点三"))
    # print('num:', chn2num('一零零八六'))
    print("txt:", num2chn("10260.03", alt_zero=True))
    print("txt:", num2chn("20037.090", numbering_type="low", traditional=True))
    print("txt:", num2chn("100860001.77", numbering_type="high", big=True))
    print(
        "txt:",
        num2chn(
            "059523810880",
            alt_one=True,
            alt_two=False,
            # num2chn has no use_lzeros / use_rzeros parameters (passing them
            # raised TypeError); use_zeros is the keyword it accepts.
            use_zeros=True,
            use_units=False,
        ),
    )

    print(all_chinese_number_string)
|
text/chn_text_norm/cardinal.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""CARDINAL类 (包含小数DECIMAL类)
|
| 3 |
+
纯数 <=> 中文字符串 方法
|
| 4 |
+
中文字符串 <=> 纯数 方法
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 8 |
+
__data__ = "2019-05-03"
|
| 9 |
+
|
| 10 |
+
from text.chn_text_norm.basic_util import *
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Cardinal:
    """CARDINAL class.

    Holds a cardinal number string and/or its Chinese text and converts
    between the two with ``num2chn`` / ``chn2num`` (default options).
    """

    def __init__(self, cardinal=None, chntext=None):
        self.cardinal, self.chntext = cardinal, chntext

    def chntext2cardinal(self):
        """Return ``self.chntext`` converted to a number string."""
        number_string = chn2num(self.chntext)
        return number_string

    def cardinal2chntext(self):
        """Return ``self.cardinal`` converted to Chinese text."""
        chinese_text = num2chn(self.cardinal)
        return chinese_text
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
|
| 30 |
+
|
| 31 |
+
# 测试程序
|
| 32 |
+
print(Cardinal(cardinal="21357.230").cardinal2chntext())
|
text/chn_text_norm/date.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""DATE类
|
| 3 |
+
日期 <=> 中文字符串 方法
|
| 4 |
+
中文字符串 <=> 日期 方法
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 8 |
+
__data__ = "2019-05-07"
|
| 9 |
+
|
| 10 |
+
from text.chn_text_norm.cardinal import Cardinal
|
| 11 |
+
from text.chn_text_norm.digit import Digit
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Date:
    """DATE class.

    Converts a date written with Arabic digits and the markers '年' / '月'
    (e.g. '09年3月16日') into Chinese text.
    """

    def __init__(self, date=None, chntext=None):
        self.date = date
        self.chntext = chntext

    # def chntext2date(self):
    #     chntext = self.chntext
    #     try:
    #         year, other = chntext.strip().split('年', maxsplit=1)
    #         year = Digit(chntext=year).digit2chntext() + '年'
    #     except ValueError:
    #         other = chntext
    #         year = ''
    #     if other:
    #         try:
    #             month, day = other.strip().split('月', maxsplit=1)
    #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
    #         except ValueError:
    #             day = chntext
    #             month = ''
    #         if day:
    #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
    #     else:
    #         month = ''
    #         day = ''
    #     date = year + month + day
    #     self.date = date
    #     return self.date

    def date2chntext(self):
        """Convert ``self.date`` to Chinese text, store it in ``self.chntext`` and return it.

        The year (text before '年') is converted with ``Digit``; the month
        (text before '月') and the day number are converted with
        ``Cardinal``. The last character of the day part is kept unchanged.
        Each of the three parts is optional.
        """
        date = self.date
        try:
            year, other = date.strip().split("年", maxsplit=1)
            year = Digit(digit=year).digit2chntext() + "年"
        except ValueError:
            # no '年' marker: the whole string is month/day
            other = date
            year = ""
        if other:
            try:
                month, day = other.strip().split("月", maxsplit=1)
                month = Cardinal(cardinal=month).cardinal2chntext() + "月"
            except ValueError:
                # No '月' marker: what is left after the year is the day.
                # (Using the full date here would feed the year and '年'
                # into the day conversion, e.g. for '2009年5日'.)
                day = other.strip()
                month = ""
            if day:
                # convert the number, keep the trailing marker character
                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
        else:
            month = ""
            day = ""
        chntext = year + month + day
        self.chntext = chntext
        return self.chntext
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
|
| 74 |
+
# 测试
|
| 75 |
+
print(Date(date="09年3月16日").date2chntext())
|
text/chn_text_norm/digit.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""DIGIT类
|
| 3 |
+
数字串 <=> 中文字符串 方法
|
| 4 |
+
中文字符串 <=> 数字串 方法
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 8 |
+
__data__ = "2019-05-03"
|
| 9 |
+
|
| 10 |
+
from text.chn_text_norm.basic_util import *
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Digit:
    """Digit string paired with its Chinese-text reading.

    Only the digit -> Chinese direction is implemented.
    """

    def __init__(self, digit=None, chntext=None):
        self.digit, self.chntext = digit, chntext

    def digit2chntext(self):
        """Return ``self.digit`` converted by ``num2chn`` without units and without the alternative "two"."""
        spoken = num2chn(self.digit, alt_two=False, use_units=False)
        return spoken
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
    # Smoke test: print the Chinese reading of a sample digit string.
    sample = Digit(digit="2016")
    print(sample.digit2chntext())
|
text/chn_text_norm/fraction.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""FRACTION类
|
| 3 |
+
分数 <=> 中文字符串 方法
|
| 4 |
+
中文字符串 <=> 分数 方法
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 8 |
+
__data__ = "2019-05-03"
|
| 9 |
+
|
| 10 |
+
from text.chn_text_norm.basic_util import *
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Fraction:
    """Fraction in ``numerator/denominator`` notation paired with its Chinese text."""

    def __init__(self, fraction=None, chntext=None):
        self.fraction, self.chntext = fraction, chntext

    def chntext2fraction(self):
        """Turn ``"<denominator>分之<numerator>"`` into ``"<numerator>/<denominator>"``."""
        parts = self.chntext.split("分之")
        below, above = parts
        top = chn2num(above)
        bottom = chn2num(below)
        return top + "/" + bottom

    def fraction2chntext(self):
        """Turn ``"<numerator>/<denominator>"`` into ``"<denominator>分之<numerator>"``."""
        parts = self.fraction.split("/")
        above, below = parts
        bottom = num2chn(below)
        top = num2chn(above)
        return bottom + "分之" + top
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
if __name__ == "__main__":
    # Smoke test: exercise both conversion directions.
    as_text = Fraction(fraction="2135/7230").fraction2chntext()
    print(as_text)
    as_fraction = Fraction(chntext="五百八十一分之三百六十九").chntext2fraction()
    print(as_fraction)
|
text/chn_text_norm/money.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""MONEY类
|
| 3 |
+
金钱 <=> 中文字符串 方法
|
| 4 |
+
中文字符串 <=> 金钱 方法
|
| 5 |
+
"""
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 9 |
+
__data__ = "2019-05-08"
|
| 10 |
+
|
| 11 |
+
from text.chn_text_norm.cardinal import Cardinal
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Money:
    """Money expression paired with its Chinese-text reading.

    Only the money -> Chinese direction is implemented.
    """

    # An integer or decimal number embedded in the money expression.
    _NUMBER_PATTERN = re.compile(r"\d+(\.\d+)?")

    def __init__(self, money=None, chntext=None):
        self.money = money
        self.chntext = chntext

    def money2chntext(self):
        """Replace every number in ``self.money`` with its cardinal reading.

        Non-numeric text (currency units and the like) is left untouched.
        The result is stored in ``self.chntext`` and returned.
        """
        # Substitute each match at its own position. Replacing by value with
        # ``str.replace`` would also rewrite digits that belong to a different,
        # longer number (e.g. the "2" inside "12").
        self.chntext = self._NUMBER_PATTERN.sub(
            lambda found: Cardinal(cardinal=found.group(0)).cardinal2chntext(),
            self.money,
        )
        return self.chntext
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
if __name__ == "__main__":
    # Smoke test: print the Chinese reading of a few money expressions.
    for sample in ("21.5万元", "230块5毛"):
        print(Money(money=sample).money2chntext())
|
text/chn_text_norm/percentage.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""PERCENTAGE类
|
| 3 |
+
百分数 <=> 中文字符串 方法
|
| 4 |
+
中文字符串 <=> 百分数 方法
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 8 |
+
__data__ = "2019-05-06"
|
| 9 |
+
|
| 10 |
+
from text.chn_text_norm.basic_util import *
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Percentage:
    """Percentage (``"65.3%"``) paired with its Chinese text (``"百分之..."``)."""

    # Prefix that introduces a percentage in Chinese text.
    _PREFIX = "百分之"

    def __init__(self, percentage=None, chntext=None):
        self.percentage = percentage
        self.chntext = chntext

    def chntext2percentage(self):
        """Convert ``self.chntext`` (``"百分之<number>"``) to ``"<number>%"``."""
        body = self.chntext.strip()
        # Drop the prefix as a whole. ``str.strip("百分之")`` treats its argument
        # as a character set and would also eat a trailing "百", turning
        # "百分之一百" into "一".
        if body.startswith(self._PREFIX):
            body = body[len(self._PREFIX):]
        return chn2num(body) + "%"

    def percentage2chntext(self):
        """Convert ``self.percentage`` (``"<number>%"``) to ``"百分之<number>"``."""
        return self._PREFIX + num2chn(self.percentage.strip().strip("%"))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
    # Smoke test: exercise both conversion directions.
    from_text = Percentage(chntext="百分之五十六点零三")
    print(from_text.chntext2percentage())
    from_number = Percentage(percentage="65.3%")
    print(from_number.percentage2chntext())
|
text/chn_text_norm/telephone.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""TELEPHONE类
|
| 3 |
+
电话号码 <=> 中文字符串 方法
|
| 4 |
+
中文字符串 <=> 电话号码 方法
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 8 |
+
__data__ = "2019-05-03"
|
| 9 |
+
|
| 10 |
+
from text.chn_text_norm.basic_util import *
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class TelePhone:
    """Telephone number paired with its Chinese-text reading.

    ``raw_chntext`` keeps the reading with pause markers between the parts of
    the number; ``chntext`` is the same reading with the markers removed.
    Only the number -> Chinese direction is implemented.
    """

    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
        self.telephone, self.raw_chntext, self.chntext = telephone, raw_chntext, chntext

    def telephone2chntext(self, fixed=False):
        """Convert ``self.telephone`` to Chinese text and return it.

        With ``fixed=True`` the number is split on "-" and the parts are
        joined with "<SIL>"; otherwise leading/trailing "+" is stripped, the
        number is split on whitespace and the parts are joined with "<SP>".
        """
        if fixed:
            marker = "<SIL>"
            pieces = self.telephone.split("-")
        else:
            marker = "<SP>"
            pieces = self.telephone.strip("+").split()

        spoken = [num2chn(piece, alt_two=False, use_units=False) for piece in pieces]
        self.raw_chntext = marker.join(spoken)
        self.chntext = self.raw_chntext.replace(marker, "")
        return self.chntext
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if __name__ == "__main__":
    # Smoke test: print the Chinese reading of a sample number.
    # (The reverse direction, chntext2telephone, is currently disabled.)
    sample = TelePhone(telephone="0595-23980880")
    print(sample.telephone2chntext())
|
text/chn_text_norm/text.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
TEXT类
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
| 7 |
+
__data__ = "2019-05-03"
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
from text.chn_text_norm.cardinal import Cardinal
|
| 11 |
+
from text.chn_text_norm.date import Date
|
| 12 |
+
from text.chn_text_norm.digit import Digit
|
| 13 |
+
from text.chn_text_norm.fraction import Fraction
|
| 14 |
+
from text.chn_text_norm.money import Money
|
| 15 |
+
from text.chn_text_norm.percentage import Percentage
|
| 16 |
+
from text.chn_text_norm.telephone import TelePhone
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Regex alternation of currency names.
CURRENCY_NAMES = (
    "(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|"
    "里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)"
)
# Regex alternation of currency units, optionally combined with a magnitude.
CURRENCY_UNITS = "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)"
# Regex alternation of common quantifiers (measure words) that may follow a number.
COM_QUANTIFIERS = (
    "(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|"
    "砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|"
    "针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|"
    "毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|"
    "盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|"
    "纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|人|抽)"
)


class Text:
    """Raw text together with its normalized (numbers spelled out in Chinese) form."""

    def __init__(self, raw_text, norm_text=None):
        # "^" and "$" are sentinels: several patterns below require a
        # non-digit character on each side of the number they match.
        self.raw_text = "^" + raw_text + "$"
        self.norm_text = norm_text

    @staticmethod
    def _replace_matches(text, pattern, convert):
        """Replace the first group of each match of ``pattern`` with ``convert(group)``.

        Matches are collected once, up front; each one then replaces the first
        remaining occurrence of its text. Empty captures are skipped because
        replacing an empty string with a converted empty string changes nothing.
        """
        for found in re.findall(pattern, text):
            target = found if isinstance(found, str) else found[0]
            if target:
                text = text.replace(target, convert(target), 1)
        return text

    def _particular(self):
        """Restore a "2" that sits between Latin letters (e.g. O2O, B2C) in ``self.norm_text``."""
        text = self.norm_text
        pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
        for whole, left, right in pattern.findall(text):
            text = text.replace(whole, left + "2" + right, 1)
        self.norm_text = text
        return self.norm_text

    def normalize(self):
        """Spell out dates, money, phone numbers, fractions, percentages and numbers.

        The passes run from the most specific pattern to the most generic one,
        so the order below matters. The result (with sentinels) is stored in
        ``self.norm_text``; the returned string has the sentinels removed.
        """
        text = self.raw_text
        replace = self._replace_matches

        # Dates
        text = replace(
            text,
            r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)",
            lambda s: Date(date=s).date2chntext(),
        )

        # Money
        text = replace(
            text,
            r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)",
            lambda s: Money(money=s).money2chntext(),
        )

        # Mobile phone numbers
        # http://www.jihaoba.com/news/show/13680
        # China Mobile:  139 138 137 136 135 134 159 158 157 150 151 152 188 187 182 183 184 178 198
        # China Unicom:  130 131 132 156 155 186 185 176
        # China Telecom: 133 153 189 180 181 177
        text = replace(
            text,
            r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D",
            lambda s: TelePhone(telephone=s).telephone2chntext(),
        )

        # Fixed-line phone numbers
        text = replace(
            text,
            r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D",
            lambda s: TelePhone(telephone=s).telephone2chntext(fixed=True),
        )

        # Fractions
        text = replace(
            text,
            r"(\d+/\d+)",
            lambda s: Fraction(fraction=s).fraction2chntext(),
        )

        # Percentages: fold the fullwidth percent sign (U+FF05) to ASCII first.
        text = text.replace("\uff05", "%")
        text = replace(
            text,
            r"(\d+(\.\d+)?%)",
            lambda s: Percentage(percentage=s).percentage2chntext(),
        )

        # Number followed by a quantifier
        text = replace(
            text,
            r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS,
            lambda s: Cardinal(cardinal=s).cardinal2chntext(),
        )

        # Long digit strings (IDs, serial numbers)
        text = replace(
            text,
            r"(\d{4,32})",
            lambda s: Digit(digit=s).digit2chntext(),
        )

        # Remaining plain numbers
        text = replace(
            text,
            r"(\d+(\.\d+)?)",
            lambda s: Cardinal(cardinal=s).cardinal2chntext(),
        )

        self.norm_text = text
        self._particular()

        # Remove exactly one sentinel per side; lstrip/rstrip would also eat
        # "^" / "$" characters that belong to the caller's text.
        result = self.norm_text
        if result.startswith("^"):
            result = result[1:]
        if result.endswith("$"):
            result = result[:-1]
        return result
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
if __name__ == "__main__":
    # Smoke test: normalize one sample per supported category.
    samples = (
        "固话:0595-23865596或23880880。",
        "手机:+86 19859213959或15659451527。",
        "分数:32477/76391。",
        "百分数:80.03%。",
        "编号:31520181154418。",
        "纯数:2983.07克或12345.60米。",
        "日期:1999年2月20日或09年3月15号。",
        "金钱:12块5,34.5元,20.1万",
        "特殊:O2O或B2C。",
    )
    for sample in samples:
        print(Text(raw_text=sample).normalize())
|
text/clean.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
# Punctuation rewritten by clean_text: each key is replaced by its value
# (an empty value removes the symbol). Fullwidth variants are spelled as
# explicit escapes so they cannot be silently folded into their ASCII
# look-alikes, which would leave duplicate (dead) dictionary keys.
SYMBOLS_MAPPING = {
    "\n": "",
    "…": ".",
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "(": "",
    ")": "",
    "\uff08": "",  # fullwidth left parenthesis
    "\uff09": "",  # fullwidth right parenthesis
    "・": "",
    "·": "",
    "「": "'",
    "」": "'",
    "《": "'",
    "》": "'",
    "—": "",
    "~": "",
    "\uff5e": "",  # fullwidth tilde
    ":": ",",
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    "\uff1a": ",",  # fullwidth colon
}

# Matches any single key of SYMBOLS_MAPPING.
REPLACE_SYMBOL_REGEX = re.compile("|".join(re.escape(symbol) for symbol in SYMBOLS_MAPPING))


# Matches runs of characters from the listed emoji code-point ranges.
EMOJI_REGEX = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def clean_text(text):
    """Return ``text`` stripped, with mapped symbols replaced and emojis removed.

    Runs of two or more "." / "," characters are collapsed to the first
    character of the run.
    """
    stripped = text.strip()

    # Swap every mapped symbol for its replacement.
    mapped = REPLACE_SYMBOL_REGEX.sub(
        lambda hit: SYMBOLS_MAPPING[hit.group()], stripped
    )

    # Drop emojis.
    without_emoji = EMOJI_REGEX.sub(r"", mapped)

    # Collapse repeated periods / commas.
    return re.sub(r"[.,]{2,}", lambda run: run.group()[0], without_emoji)
|
text/spliter.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import string
|
| 3 |
+
|
| 4 |
+
from text.clean import clean_text
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def utf_8_len(text):
    """Return the number of bytes ``text`` occupies when encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return len(encoded)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def break_text(texts, length, splits: set):
    """Yield each text unchanged if it fits in ``length`` bytes, else cut it after every split character.

    A piece ends right after a character contained in ``splits``; any
    non-empty remainder after the last split character is yielded last.
    """
    for piece in texts:
        if utf_8_len(piece) <= length:
            yield piece
            continue

        start = 0
        for position, char in enumerate(piece):
            if char in splits:
                yield piece[start : position + 1]
                start = position + 1

        remainder = piece[start:]
        if remainder:
            yield remainder
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def break_text_by_length(texts, length):
    """Yield each text unchanged if it fits in ``length`` bytes, else cut it into chunks.

    A chunk is closed before the character that would push its UTF-8 size
    past ``length``, so chunks never exceed ``length`` bytes unless a single
    character is itself larger than ``length``.
    """
    for text in texts:
        if utf_8_len(text) <= length:
            yield text
            continue

        curr = ""
        curr_len = 0  # running byte size of ``curr``; avoids re-encoding it per character
        for char in text:
            char_len = utf_8_len(char)
            # Check before appending: a multi-byte character must start a new
            # chunk rather than overflow the current one.
            if curr and curr_len + char_len > length:
                yield curr
                curr = ""
                curr_len = 0
            curr += char
            curr_len += char_len

        if curr:
            yield curr
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def add_cleaned(curr, segments):
    """Append the stripped ``curr`` to ``segments`` unless it is empty or only whitespace/ASCII punctuation."""
    trimmed = curr.strip()
    has_content = any(
        not ch.isspace() and ch not in string.punctuation for ch in trimmed
    )
    if has_content:
        segments.append(trimmed)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def protect_float(text):
    """Rewrite decimals such as ``3.14`` as ``<3_f_14>`` so their dot is not treated as a sentence end."""

    def _mask(found):
        whole, frac = found.group(1), found.group(2)
        return "<" + whole + "_f_" + frac + ">"

    return re.sub(r"(\d+)\.(\d+)", _mask, text)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def unprotect_float(text):
    """Undo ``protect_float``: rewrite ``<3_f_14>`` back to ``3.14``."""

    def _unmask(found):
        whole, frac = found.group(1), found.group(2)
        return whole + "." + frac

    return re.sub(r"<(\d+)_f_(\d+)>", _unmask, text)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def split_text(text, length):
    """Clean ``text`` and split it into segments of at most ``length`` UTF-8 bytes.

    The text is broken progressively, each step applying only to pieces that
    are still longer than ``length``:

    1. at sentence-ending punctuation (decimals such as 3.14 are protected);
    2. at commas;
    3. at spaces;
    4. at arbitrary characters.

    Adjacent pieces are then merged back together as long as the result stays
    within ``length``. Segments that are empty or consist only of whitespace
    and ASCII punctuation are dropped.
    """
    text = clean_text(text)

    # Fullwidth marks are written as escapes: "\uff01" = fullwidth "!",
    # "\uff1f" = fullwidth "?", "\uff0c" = fullwidth ",".
    sentence_ends = {".", "!", "?", "。", "\uff01", "\uff1f"}
    commas = {",", "\uff0c"}

    texts = [text]
    texts = map(protect_float, texts)
    texts = break_text(texts, length, sentence_ends)
    texts = map(unprotect_float, texts)
    texts = break_text(texts, length, commas)
    texts = break_text(texts, length, {" "})
    texts = list(break_text_by_length(texts, length))

    # Merge neighbouring pieces into segments no longer than ``length``.
    segments = []
    curr = ""

    for text in texts:
        if utf_8_len(curr) + utf_8_len(text) <= length:
            curr += text
        else:
            add_cleaned(curr, segments)
            curr = text

    if curr:
        add_cleaned(curr, segments)

    return segments
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
if __name__ == "__main__":
    # Self-test for split_text.

    text = "This is a test sentence. This is another test sentence. And a third one."

    assert split_text(text, 50) == [
        "This is a test sentence.",
        "This is another test sentence. And a third one.",
    ]
    assert split_text("a,aaaaaa3.14", 10) == ["a,", "aaaaaa3.14"]
    assert split_text("   ", 10) == []
    assert split_text("a", 10) == ["a"]

    text = "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines."
    assert split_text(text, 50) == [
        "This is a test sentence with only commas,",
        "and no dots, and no exclamation marks,",
        "and no question marks, and no newlines.",
    ]

    text = "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence."
    # First half split at " ", second half split at ","
    assert split_text(text, 50) == [
        "This is a test sentence This is a test sentence",
        "This is a test sentence. This is a test sentence,",
        "This is a test sentence, This is a test sentence.",
    ]

    text = "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。"
    # The ideographic full stop is not rewritten by clean_text, so it is
    # expected to survive unchanged at the end of the last segment.
    assert split_text(text, 50) == [
        "这是一段很长的中文文本,",
        "而且没有句号,也没有感叹号,",
        "也没有问号,也没有换行符。",
    ]
|