Spaces:
Runtime error
Runtime error
import codecs
import importlib
import subprocess
import sys

# spaCy is installed on demand. Import it first and only fall back to pip when
# it is missing, so a working environment does not pay for a pip run on every
# start-up.
try:
    import spacy
except ImportError:
    # Use the running interpreter's own pip so the package lands in the
    # environment this script actually imports from, and fail loudly if the
    # install does not succeed instead of hitting a later ImportError.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "spacy"],
        check=True,
    )
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import spacy
from spacy.tokens import Doc

import jieba
import nltk
from nltk.tokenize import word_tokenize
from sacremoses import MosesTokenizer
from subword_nmt import apply_bpe

# Fetch the small English pipeline, then load it.
spacy.cli.download("en_core_web_sm")
# Load the English model
nlp = spacy.load('en_core_web_sm')

# Tokenizer data used by word_tokenize.
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' rather than
# 'punkt' -- confirm against the NLTK version installed in this environment.
nltk.download('punkt')

# Two independent jieba tokenizers: jieba1 uses the default dictionary only,
# jieba2 additionally loads the project's user dictionary.
jieba1 = jieba.Tokenizer()
jieba2 = jieba.Tokenizer()
jieba2.load_userdict('model2_data/dict.zh.txt')

# Chinese-side initialization: Moses tokenizer and BPE codes.
mt_zh = MosesTokenizer(lang='zh')
with codecs.open('model2_data/bpecode.zh', 'r', 'utf-8') as f:
    bpe_zh_f = apply_bpe.BPE(f)

# English-side initialization: Moses tokenizer and BPE codes.
mt_en = MosesTokenizer(lang='en')
with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
    bpe_en_f = apply_bpe.BPE(f)
def spacy_tokenize(line):
    """Tokenize ``line`` with the spaCy pipeline ``nlp``.

    Returns:
        A single string made of the text of every spaCy token, with
        consecutive tokens separated by one space.
    """
    return ' '.join(token.text for token in nlp(line))
def nltk_tokenize(line):
    """Return the result of NLTK's ``word_tokenize`` applied to ``line``."""
    return word_tokenize(line)
def jieba_tokenize(line):
    """Segment ``line`` with the ``jieba1`` tokenizer.

    Leading and trailing whitespace is stripped from ``line`` before it is
    cut. The segments produced by ``jieba1.cut`` are returned as a list.
    """
    segments = jieba1.cut(line.strip())
    return list(segments)
def tokenize(line, mode):
    """Tokenize ``line`` according to the translation ``mode``.

    When ``mode`` is "汉译英" (Chinese-to-English) the input is segmented with
    ``jieba_tokenize``. For every other mode the input is first passed through
    ``spacy_tokenize`` and the resulting string is then split by
    ``nltk_tokenize``.
    """
    if mode != "汉译英":
        spaced = spacy_tokenize(line)
        return nltk_tokenize(spaced)
    return jieba_tokenize(line)
def jieba_tokenize2(line):
    """Segment ``line`` with ``jieba2``, the tokenizer with the user dictionary.

    Leading and trailing whitespace is stripped before cutting; the segments
    are returned as a list.
    """
    segments = jieba2.cut(line.strip())
    return list(segments)
def mt_bpe_zh(line):
    """Moses-tokenize ``line`` for Chinese and apply the Chinese BPE codes.

    ``line`` is tokenized with ``mt_zh`` and the tokens are passed to
    ``bpe_zh_f.segment_tokens``. The segmented result is printed to stdout
    and then returned.
    """
    segmented = bpe_zh_f.segment_tokens(mt_zh.tokenize(line))
    print(segmented)
    return segmented
def mt_bpe_en(line):
    """Moses-tokenize ``line`` for English and apply the English BPE codes.

    ``line`` is tokenized with ``mt_en`` and the tokens are passed to
    ``bpe_en_f.segment_tokens``. The segmented result is printed to stdout
    and then returned.
    """
    segmented = bpe_en_f.segment_tokens(mt_en.tokenize(line))
    print(segmented)
    return segmented
def tokenize2(line, mode):
    """Tokenize ``line`` into BPE units according to the translation ``mode``.

    When ``mode`` is "汉译英" (Chinese-to-English) the input is segmented with
    ``jieba_tokenize2``, the segments are joined with single spaces, and the
    result is handed to ``mt_bpe_zh``. For every other mode the raw ``line``
    is handed to ``mt_bpe_en``.
    """
    if mode != "汉译英":
        return mt_bpe_en(line)
    pre_segmented = ' '.join(jieba_tokenize2(line))
    return mt_bpe_zh(pre_segmented)