BlueQuasar committed on
Commit
c302bad
·
verified ·
1 Parent(s): 6439929

Upload extract_features.py

Browse files
Files changed (1) hide show
  1. extract_features.py +48 -0
extract_features.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import nltk
import re
import numpy as np

# Loaded once at import time so repeated calls to extract_features()
# do not re-read the NLTK stopwords corpus.
# NOTE(review): requires nltk.download('stopwords') to have been run — confirm.
english_stopwords = nltk.corpus.stopwords.words('english')
def extract_features(article):
    """Build a 7-element feature vector for every sentence in *article*.

    Paragraphs are separated by blank lines ("\n\n"); sentences are split
    on ., ? or ! (without breaking inside decimals such as "3.14").

    Per-sentence features, in order:
        0. 1 if the sentence opens its paragraph, else 0
        1. |pi * cos(paragraph_index)| / number_of_paragraphs
        2. |pi * cos(sentence_index)| / number_of ". "-splits in the paragraph
        3. sentence length in space-separated tokens
        4. thematic-word score (top-10 non-stopword terms), scaled by 100
        5. proper-noun (NNP) density, scaled by 200
        6. numeral (CD) density, scaled by 300

    Parameters
    ----------
    article : str
        Plain-text article.

    Returns
    -------
    list[list]
        One feature vector per non-empty sentence, in document order.
    """
    # Split after ., ? or !, but not between the digits of a decimal number.
    pattern = r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))'

    all_words = nltk.tokenize.word_tokenize(article)
    word_dist = nltk.FreqDist(
        w.lower() for w in all_words
        if w.lower() not in english_stopwords and w.isalnum()
    )
    # Sets give O(1) membership in the per-word loop below (the original
    # used lists, making the inner loop O(n) per word).
    most_common = {k for k, _ in word_dist.most_common(10)}

    pos_tags = nltk.pos_tag(all_words)
    proper_nouns = {word for word, tag in pos_tags if tag == 'NNP'}
    numerals = {word for word, tag in pos_tags if tag == 'CD'}

    # Hoisted: the original recomputed article.split("\n\n") for every
    # sentence of every paragraph.
    paragraphs = article.split("\n\n")
    n_paragraphs = len(paragraphs)

    X = []
    for j, para in enumerate(paragraphs):
        n_dot_splits = len(para.split(". "))  # loop-invariant per paragraph
        sentences = [s.rstrip('.?!') for s in re.split(pattern, para)]
        for k, sente in enumerate(sentences):
            if not sente:
                continue
            vect = [
                1 if k == 0 else 0,  # paragraph-opening sentence flag
                np.absolute(np.pi * np.cos(j) / n_paragraphs),
                np.absolute(np.pi * np.cos(k) / n_dot_splits),
                len(sente.split(" ")),
            ]

            words = sente.split(" ")
            thematic = sum(1 for w in words if w in most_common)
            propnoun = sum(1 for w in words if w in proper_nouns)
            stat = sum(1 for w in words if w in numerals)

            # NOTE(review): this denominator mixes a character count
            # (len(sente)) with a word count (thematic) — looks like it was
            # meant to be a word count. Preserved as-is so the emitted
            # feature values do not change; confirm intent before "fixing".
            denom = len(sente) - thematic
            thematic = 100 * thematic / (denom if denom != 0 else denom + 1)

            # len(sente) > 0 is guaranteed by the guard above, so these
            # divisions are safe.
            vect.extend([
                thematic,
                propnoun / len(sente) * 200,
                stat / len(sente) * 300,
            ])
            X.append(vect)
    return X