File size: 2,072 Bytes
c302bad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import nltk
import re
import numpy as np

english_stopwords = nltk.corpus.stopwords.words('english')

def extract_features(article):
    """Extract a per-sentence feature matrix from a plain-text article.

    The article is split into paragraphs (on blank lines) and each
    paragraph into sentences.  Every non-empty sentence yields a
    7-element feature vector:

      0. 1 if the sentence opens its paragraph, else 0
      1. paragraph-position feature: |pi * cos(paragraph_index)| / #paragraphs
      2. sentence-position feature:  |pi * cos(sentence_index)| / approx. #sentences
      3. number of whitespace-separated tokens in the sentence
      4. thematic-word score (scaled by 100)
      5. proper-noun score (scaled by 200)
      6. numeral/statistic score (scaled by 300)

    Parameters
    ----------
    article : str
        Raw article text; paragraphs are assumed to be separated by
        blank lines ("\\n\\n").

    Returns
    -------
    list[list[float]]
        One feature vector per non-empty sentence, in document order.
    """
    # Sentence boundary: after . ? or ! — the lookahead avoids splitting
    # inside decimals such as "3.5".
    sentence_pattern = r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))'

    all_words = nltk.tokenize.word_tokenize(article)

    # Frequency of lowercased, alphanumeric, non-stopword tokens; the ten
    # most common are treated as the article's "thematic" words.
    freq = nltk.FreqDist(
        w.lower() for w in all_words
        if w.lower() not in english_stopwords and w.isalnum()
    )
    # Sets give O(1) membership tests in the per-word loop below
    # (the originals were lists, giving O(n) per lookup).
    thematic = {w for w, _count in freq.most_common(10)}

    pos_tags = nltk.pos_tag(all_words)
    proper_nouns = {w for w, tag in pos_tags if tag == 'NNP'}
    numerals = {w for w, tag in pos_tags if tag == 'CD'}  # CD = cardinal number

    # Hoisted: the original recomputed article.split("\n\n") for every sentence.
    paragraphs = article.split("\n\n")
    n_paragraphs = len(paragraphs)

    X = []
    for j, para in enumerate(paragraphs):
        sentences = [s.rstrip('.?!') for s in re.split(sentence_pattern, para)]
        # Kept from the original: an approximate sentence count used only
        # as the positional-feature normalizer.
        approx_sentence_count = len(para.split(". "))
        for k, sente in enumerate(sentences):
            if not sente:
                continue
            words = sente.split(" ")

            vect = [
                1 if k == 0 else 0,
                np.absolute(np.pi * np.cos(j) / n_paragraphs),
                np.absolute(np.pi * np.cos(k) / approx_sentence_count),
                len(words),
            ]

            # Bug fix: thematic words are stored lowercased, so compare
            # lowercased tokens — the original compared raw-case tokens
            # and never matched capitalized occurrences.
            thematic_count = sum(1 for w in words if w.lower() in thematic)
            propnoun_count = sum(1 for w in words if w in proper_nouns)
            stats_count = sum(1 for w in words if w in numerals)

            # Denominators use the sentence's *character* length, as in
            # the original scoring scheme; guard the thematic denominator
            # against zero (equivalent to the original conditional).
            denom = len(sente) - thematic_count
            vect.append(100 * thematic_count / (denom if denom != 0 else 1))
            vect.append(propnoun_count / len(sente) * 200)
            vect.append(stats_count / len(sente) * 300)

            X.append(vect)

    return X