| import string, re, opencc |
|
|
|
|
# Translation table for str.translate() that folds full-width (全型)
# characters to their half-width (半型) ASCII counterparts.
#
# The full-width keys are generated from code points, which keeps the
# full-width side of the mapping unambiguous in the source text.
全型2半型 = str.maketrans({
    # IDEOGRAPHIC SPACE -> ASCII space.
    0x3000: ' ',
    # The fullwidth ASCII variants U+FF01..U+FF5E sit exactly 0xFEE0 above
    # the ASCII characters U+0021..U+007E they correspond to; this covers
    # the digits, both letter cases and the punctuation.
    **{cp: chr(cp - 0xFEE0) for cp in range(0xFF01, 0xFF5F)},
    # CJK marks that are not fullwidth variants but are folded to an ASCII
    # look-alike.
    '゛': '"',
    '、': ',',
    'ー': '-',
    '。': '.',
    '〈': '<',
    '〉': '>',
    '‘': '`',
})
|
|
def 把怪字修進unicode(xStr):
    """Replace the private-use code point U+F5C3 with the character '𪜶'.

    U+F5C3 lies in the Unicode Private Use Area, so it has no standard
    meaning; every occurrence is rewritten to the CJK ideograph '𪜶'.

    Args:
        xStr: The text to repair.

    Returns:
        A copy of ``xStr`` with every U+F5C3 replaced by '𪜶'.
    """
    # The pattern is a fixed single character, so a plain substring
    # replacement does the job without going through the regex engine.
    return xStr.replace('\uf5c3', '𪜶')
|
|
def ryNormText(s):
    """Normalize a piece of text.

    The steps, in order:

    1. Delete every literal ``<<<None>>>`` marker.
    2. Replace each punctuation character (ASCII, full-width and CJK) and
       each newline with a space.
    3. Collapse runs of spaces into a single space.
    4. Delete ideographic (full-width) spaces, U+3000.
    5. Repair the private-use character handled by ``把怪字修進unicode``.
    6. Convert the text with OpenCC's ``s2tw`` configuration
       (Simplified Chinese to Traditional Chinese).

    Args:
        s: The text to normalize.

    Returns:
        The normalized text.
    """
    ascii_punct = string.punctuation
    # Full-width form of every ASCII punctuation mark: each one sits 0xFEE0
    # above its ASCII counterpart.
    fullwidth_punct = ''.join(chr(ord(c) + 0xFEE0) for c in ascii_punct)
    # U+FF5F..U+FF64: the width variants of the ⦅ ⦆ 。 「 」 、 marks that
    # are listed literally below.
    width_variant_punct = '\uff5f\uff60\uff61\uff62\uff63\uff64'
    cjk_punct = (
        '⦅⦆「」、〃《》『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿'
        '–—‘’‛“”„‟…‧﹏·〈〉。﹐'
    )
    # A translation table is used instead of a regex character class, so
    # characters such as '-', ']', '\\' and '^' need no escaping.
    to_space = dict.fromkeys(
        map(ord, ascii_punct + fullwidth_punct + width_variant_punct
            + cjk_punct + '\n'),
        ' ',
    )

    s = s.replace('<<<None>>>', '')
    s = s.translate(to_space)
    s = re.sub('[ ]+', ' ', s)
    # Only the ideographic space is dropped here; the single ASCII spaces
    # produced by the two steps above must survive as separators.
    s = s.replace('\u3000', '')
    s = 把怪字修進unicode(s)
    return opencc.OpenCC('s2tw').convert(s)
|
|
| import unicodedata |
| import re |
|
|
|
|
def separ_char_word(inputString=''):
    """Set CJK characters apart from the surrounding text with spaces.

    Every character whose Unicode name starts with ``CJK`` gets a space on
    both sides; all other characters are copied unchanged, so a run of
    non-CJK characters (for example a Latin word) stays together.  Runs of
    spaces are then collapsed to one space.  A CJK character at the very
    start of the text gets no space in front of it, and a result that ends
    in a CJK character keeps its trailing space.

    A character that has no Unicode name (for example a control character)
    is reported with ``print``, and the text accumulated so far, including
    that character, is wrapped in one leading and one trailing space.

    Args:
        inputString: The text to space out.

    Returns:
        The spaced-out text.
    """
    inputString = 把怪字修進unicode(inputString)

    pieces = []
    for x in inputString:
        try:
            is_cjk = unicodedata.name(x).startswith('CJK')
        except ValueError as ex:
            # unicodedata.name() raises ValueError when the character has
            # no name; keep the character and pad the text built so far.
            pieces.append(x)
            pieces.insert(0, ' ')
            pieces.append(' ')
            print(f'ryErr:(def 中英分開:){ex= }\t【{x= }】\t{inputString= }')
            continue

        if is_cjk and pieces:
            # Space *before* the CJK character as well, so a preceding
            # Latin word is not glued to it.  Skipped at the very start to
            # avoid introducing a leading space.
            pieces.append(' ')
        pieces.append(x)
        if is_cjk:
            pieces.append(' ')

    # Adjacent CJK characters produce doubled spaces above; squeeze them.
    return re.sub('[ ]+', ' ', ''.join(pieces))
|
|
| |
|
|
| |