File size: 2,072 Bytes
c302bad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import nltk
import re
import numpy as np
# English stopword list (NLTK corpus), loaded once at import time; used to
# filter function words out of the thematic-word frequency distribution.
# NOTE: requires the NLTK "stopwords" corpus to have been downloaded.
english_stopwords = nltk.corpus.stopwords.words('english')
def extract_features(article):
    """Extract a per-sentence feature vector from a plain-text article.

    Parameters
    ----------
    article : str
        Article text; paragraphs are assumed to be separated by blank
        lines (``"\\n\\n"``).

    Returns
    -------
    list[list]
        One 7-element feature vector per non-empty sentence:
        ``[is_first_in_paragraph, paragraph_position_score,
        sentence_position_score, word_count, thematic_score,
        proper_noun_score, numeral_score]``.
    """
    # Sentence boundary: split after '.', '?' or '!' unless the dot is a
    # decimal point (e.g. "3.5"). Compiled once and reused per paragraph.
    sentence_re = re.compile(r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))')

    all_words = nltk.tokenize.word_tokenize(article)

    # Top-10 most frequent content words (lowercased, stopwords and
    # non-alphanumeric tokens removed) serve as the "thematic" vocabulary.
    freq = nltk.FreqDist(
        w.lower() for w in all_words
        if w.lower() not in english_stopwords and w.isalnum()
    )
    thematic = {word for word, _count in freq.most_common(10)}

    pos_tags = nltk.pos_tag(all_words)
    # Sets give O(1) membership tests in the per-word loop below
    # (the original scanned lists, O(n) per word).
    proper_nouns = {word for word, tag in pos_tags if tag == 'NNP'}
    numerals = {word for word, tag in pos_tags if tag == 'CD'}

    paragraphs = article.split("\n\n")
    n_paragraphs = len(paragraphs)  # loop-invariant: hoisted out of the loops

    features = []
    for j, para in enumerate(paragraphs):
        # Rough per-paragraph sentence count for the position score;
        # invariant per paragraph, so computed once here.
        n_sentences = len(para.split(". "))
        for k, sente in enumerate(s.rstrip('.?!') for s in sentence_re.split(para)):
            if not sente:
                continue
            words = sente.split(" ")
            vect = [
                1 if k == 0 else 0,                             # first sentence of its paragraph?
                np.absolute(np.pi * np.cos(j) / n_paragraphs),  # paragraph-position score
                np.absolute(np.pi * np.cos(k) / n_sentences),   # sentence-position score
                len(words),                                     # sentence length in words
            ]

            thematic_count = sum(1 for w in words if w in thematic)
            propnoun_count = sum(1 for w in words if w in proper_nouns)
            numeral_count = sum(1 for w in words if w in numerals)

            # NOTE(review): scores are normalised by the sentence's
            # *character* length, not its word count — preserved from the
            # original; confirm this is intentional.
            denom = len(sente) - thematic_count
            if denom == 0:
                denom = 1  # guard against division by zero
            vect.append(100 * thematic_count / denom)
            vect.append(propnoun_count / len(sente) * 200)
            vect.append(numeral_count / len(sente) * 300)

            features.append(vect)
    return features