Spaces:
Build error
Build error
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Fri Jun 12 15:26:44 2020 | |
| @author: luol2 | |
| """ | |
| import nltk | |
| from nltk.stem import WordNetLemmatizer | |
| from nltk.corpus import wordnet | |
| from nltk.stem.porter import PorterStemmer | |
| lemmatizer = WordNetLemmatizer() | |
| stemmer = PorterStemmer() | |
| import io | |
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Args:
        treebank_tag: POS tag string (e.g. the tags produced by
            ``nltk.pos_tag``).

    Returns:
        ``wordnet.ADJ`` for tags starting with ``J``, ``wordnet.VERB`` for
        ``V``, ``wordnet.NOUN`` for ``N``, ``wordnet.ADV`` for tags starting
        with ``R`` or equal to ``'IN'``; any other tag falls back to
        ``wordnet.NOUN``.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos

    # 'IN' is deliberately treated like an adverb tag.
    if treebank_tag == 'IN':
        return wordnet.ADV

    # Everything else is lemmatized as a noun.
    return wordnet.NOUN
def ssplit_token_pos_lemma(in_text):
    """Split text into sentences and tokens, and annotate every token.

    The text is stripped, and every ``-`` and ``/`` is padded with spaces
    before sentence splitting and tokenization. Each sentence is POS-tagged
    with ``nltk.pos_tag``; each token is lemmatized and stemmed in
    lower-cased form.

    Args:
        in_text: Raw input text.

    Returns:
        A string with one line per token in the form
        ``token<TAB>lemma<TAB>stem<TAB>pos``, with an empty line written
        after the tokens of each sentence.
    """
    # NOTE(review): the original indentation was lost in this copy; the
    # blank-line separator is assumed to be emitted once per sentence.
    cleaned = in_text.strip()
    cleaned = cleaned.replace('-', ' - ').replace('/', ' / ')

    tokenized_sentences = [
        nltk.word_tokenize(sentence)
        for sentence in nltk.sent_tokenize(cleaned)
    ]

    pieces = []
    for words in tokenized_sentences:
        for word, tag in nltk.pos_tag(words):
            lowered = word.lower()
            lemma = lemmatizer.lemmatize(lowered, get_wordnet_pos(tag))
            stem = stemmer.stem(lowered)
            pieces.append(word + '\t' + lemma + '\t' + stem + '\t' + tag + '\n')
        # Sentence boundary marker.
        pieces.append('\n')

    return ''.join(pieces)