import nltk
import re

import numpy as np

# Stopword list kept as a module-level constant so the NLTK corpus is only
# loaded once per process.
english_stopwords = nltk.corpus.stopwords.words('english')

# Sentence boundary: position after '.', '?' or '!' (plus trailing
# whitespace), unless the period sits inside a decimal such as "3.14".
# Compiled once instead of re-parsing the pattern on every call/paragraph.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))')


def extract_features(article):
    """Build a per-sentence feature matrix for *article*.

    The article is split into paragraphs on blank lines and each paragraph
    into sentences.  For every non-empty sentence a 7-element feature
    vector is produced:

        0. 1 if the sentence opens its paragraph, else 0
        1. paragraph-position score: |pi * cos(paragraph_idx)| / n_paragraphs
        2. sentence-position score:  |pi * cos(sentence_idx)| / n_splits
        3. sentence length in whitespace-separated tokens
        4. thematic-word score     (x100)
        5. proper-noun score       (x200)
        6. numeric/statistic score (x300)

    Parameters
    ----------
    article : str
        Plain text; paragraphs are separated by two consecutive newlines.

    Returns
    -------
    list[list]
        One feature vector per sentence, in document order.
    """
    # --- article-level statistics, computed once -------------------------
    all_words = nltk.tokenize.word_tokenize(article)

    stopwords = set(english_stopwords)  # set: O(1) membership in the loop
    freq_dist = nltk.FreqDist(
        w.lower() for w in all_words
        if w.lower() not in stopwords and w.isalnum()
    )
    # Ten most frequent content words are treated as "thematic" words.
    # NOTE(review): keys are lower-cased but sentence tokens below are
    # matched verbatim, so capitalised occurrences never match — confirm
    # whether that is intended.
    thematic_vocab = {word for word, _count in freq_dist.most_common(10)}

    tagged = nltk.pos_tag(all_words)
    proper_nouns = {word for word, tag in tagged if tag == 'NNP'}   # NNP = proper noun
    numeric_words = {word for word, tag in tagged if tag == 'CD'}   # CD = cardinal number

    paragraphs = article.split("\n\n")
    n_paragraphs = len(paragraphs)  # hoisted: was re-split once per sentence

    features = []
    for j, para in enumerate(paragraphs):
        # Denominator of the sentence-position score; uses a naive ". "
        # split, deliberately distinct from the regex sentence split above.
        n_splits = len(para.split(". "))  # hoisted: was re-split per sentence
        for k, sente in enumerate(
            s.rstrip('.?!') for s in _SENTENCE_BOUNDARY.split(para)
        ):
            # Empty fragments still consume a sentence index k, matching
            # the original enumerate-then-skip behaviour.
            if not sente:
                continue

            tokens = sente.split(" ")
            n_thematic = sum(1 for w in tokens if w in thematic_vocab)
            n_proper = sum(1 for w in tokens if w in proper_nouns)
            n_numeric = sum(1 for w in tokens if w in numeric_words)

            # NOTE(review): all three ratios divide by the sentence length
            # in *characters* (len(sente)), not in tokens — preserved
            # as-is, but confirm this is intentional.
            denom = len(sente) - n_thematic
            if denom == 0:
                denom = 1  # guard against division by zero

            features.append([
                1 if k == 0 else 0,
                np.absolute(np.pi * np.cos(j) / n_paragraphs),
                np.absolute(np.pi * np.cos(k) / n_splits),
                len(tokens),
                100 * n_thematic / denom,
                n_proper / len(sente) * 200,
                n_numeric / len(sente) * 300,
            ])
    return features