import nltk
import re
import numpy as np

# English stop-word list, loaded once at import time.
# Requires the NLTK "stopwords" corpus to be present
# (nltk.download('stopwords')); raises LookupError otherwise.
english_stopwords = nltk.corpus.stopwords.words('english')
def extract_features(article):
    """Build a per-sentence feature matrix for *article*.

    The article is split into paragraphs on blank lines ("\\n\\n") and each
    paragraph into sentences.  For every non-empty sentence a 7-element
    feature vector is produced:

        0. 1 if the sentence opens its paragraph, else 0
        1. |pi * cos(paragraph_index)| / paragraph_count   (position signal)
        2. |pi * cos(sentence_index)| / approx_sentence_count
        3. sentence length in whitespace-separated words
        4. thematic-word score  (% of top-10 frequent non-stopwords)
        5. proper-noun score    (NNP-tagged words, scaled by 200)
        6. numeral score        (CD-tagged words, scaled by 300)

    :param article: plain-text article (str)
    :return: list of 7-element feature lists, one per sentence
    """
    # Sentence boundary: after . ? or ! — but not inside decimals like "3.14"
    # (the lookahead rejects a digit unless it is part of "<digit>.<digit>").
    sentence_pattern = r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))'

    all_words = nltk.tokenize.word_tokenize(article)

    # Frequency of lowercased, alphanumeric, non-stopword tokens.
    freq_dist = nltk.FreqDist(
        w.lower() for w in all_words
        if w.lower() not in english_stopwords and w.isalnum()
    )
    # Sets give O(1) membership in the per-word inner loop below
    # (the originals were lists — O(n) per lookup).
    most_common = {word for word, _count in freq_dist.most_common(10)}

    pos_tags = nltk.pos_tag(all_words)
    proper_nouns = {word for word, tag in pos_tags if tag == 'NNP'}
    numerals = {word for word, tag in pos_tags if tag == 'CD'}

    # Hoisted: this split was recomputed for every sentence.
    paragraphs = article.split("\n\n")
    n_paragraphs = len(paragraphs)

    features = []
    for j, para in enumerate(paragraphs):
        # Approximate sentence count used only as a scaling denominator;
        # kept as ". "-split to reproduce the original values exactly.
        # Hoisted out of the sentence loop (always >= 1, so no zero-division).
        approx_sentences = len(para.split(". "))

        for k, sente in enumerate(
            s.rstrip('.?!') for s in re.split(sentence_pattern, para)
        ):
            if not sente:
                continue

            words = sente.split(" ")
            vect = [
                1 if k == 0 else 0,                                    # paragraph opener?
                np.absolute(np.pi * np.cos(j) / n_paragraphs),
                np.absolute(np.pi * np.cos(k) / approx_sentences),
                len(words),                                            # length in words
            ]

            thematic = sum(1 for w in words if w in most_common)
            propnoun = sum(1 for w in words if w in proper_nouns)
            numeral = sum(1 for w in words if w in numerals)

            # NOTE(review): len(sente) here is the CHARACTER length of the
            # sentence, not its word count — this looks like it was meant to
            # be len(words).  Preserved as-is to keep feature values stable;
            # confirm against whatever model consumes these features.
            denom = len(sente) - thematic
            thematic_score = 100 * thematic / (denom + 1 if denom == 0 else denom)

            vect.extend([
                thematic_score,
                propnoun / len(sente) * 200,
                numeral / len(sente) * 300,
            ])
            features.append(vect)

    return features