BlueQuasar
/

Extractive_text_summarizer

Model card Files Files and versions

BlueQuasar committed on Aug 11, 2024

Commit

c302bad

·

verified ·

1 Parent(s): 6439929

Upload extract_features.py

Files changed (1) hide show

extract_features.py +48 -0

extract_features.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import nltk
+import re
+import numpy as np
+english_stopwords = nltk.corpus.stopwords.words('english')
def extract_features(article):
    """Build one numeric feature vector per sentence of *article*.

    The article is split into paragraphs on double newlines, and each
    paragraph is split into sentences with a punctuation-based regex.
    Trailing '.', '?' and '!' are stripped from every sentence and empty
    results are skipped.

    Each returned vector holds, in order:

    0. 1 if the sentence is the first split result of its paragraph, else 0.
    1. ``abs(pi * cos(j) / P)`` for paragraph index ``j`` and ``P``
       paragraphs in the article.
    2. ``abs(pi * cos(k) / S)`` for split index ``k`` within the paragraph
       and ``S = len(para.split(". "))``.
    3. Number of space-separated tokens in the sentence.
    4. Thematic score: ``100 * hits / (C - hits)`` where ``hits`` counts
       tokens found among the 10 most frequent non-stop-word alphanumeric
       tokens of the article and ``C`` is the sentence length in
       characters (the denominator is 1 when ``C - hits`` is 0).
    5. Proper-noun score: ``hits / C * 200`` for tokens tagged 'NNP'.
    6. Numeral score: ``hits / C * 300`` for tokens tagged 'CD'.

    Args:
        article: The full article text as a single string.

    Returns:
        A list of 7-element lists, one per non-empty sentence, in
        document order.
    """
    # Compiled once; splitting happens for every paragraph.
    sentence_boundary = re.compile(
        r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))'
    )

    # Sets give O(1) membership tests in the loops below; results are the
    # same as with the original lists.
    stopwords = set(english_stopwords)

    tokens = nltk.tokenize.word_tokenize(article)
    content_word_freq = nltk.FreqDist(
        token.lower()
        for token in tokens
        if token.lower() not in stopwords and token.isalnum()
    )
    thematic_vocab = {word for word, _ in content_word_freq.most_common(10)}

    tagged_tokens = nltk.pos_tag(tokens)
    proper_nouns = {word for word, tag in tagged_tokens if tag == 'NNP'}
    numerals = {word for word, tag in tagged_tokens if tag == 'CD'}

    paragraphs = article.split("\n\n")
    paragraph_count = len(paragraphs)

    feature_vectors = []
    for j, para in enumerate(paragraphs):
        # Both values depend only on the paragraph, so compute them once
        # instead of once per sentence.
        # NOTE(review): cos() is applied to the raw integer index (radians),
        # so this is not monotonic in position -- confirm this is intended.
        paragraph_position = np.absolute(np.pi * np.cos(j) / paragraph_count)
        piece_count = len(para.split(". "))

        # k indexes the raw split results, so skipped empty pieces still
        # advance the index (this matches the original behaviour).
        for k, raw_sentence in enumerate(sentence_boundary.split(para)):
            sente = raw_sentence.rstrip('.?!')
            if not sente:
                continue

            words = sente.split(" ")
            char_count = len(sente)

            # NOTE(review): thematic_vocab is lower-cased but the words here
            # are not, so capitalised occurrences never match. Left as-is
            # because changing it would alter the feature values.
            thematic_hits = sum(1 for word in words if word in thematic_vocab)
            proper_noun_hits = sum(1 for word in words if word in proper_nouns)
            numeral_hits = sum(1 for word in words if word in numerals)

            # NOTE(review): scores are normalised by character length, not
            # word count -- preserved to keep the output unchanged.
            remainder = char_count - thematic_hits
            thematic_denominator = remainder if remainder != 0 else 1

            feature_vectors.append([
                1 if k == 0 else 0,
                paragraph_position,
                np.absolute(np.pi * np.cos(k) / piece_count),
                len(words),
                100 * thematic_hits / thematic_denominator,
                proper_noun_hits / char_count * 200,
                numeral_hits / char_count * 300,
            ])

    return feature_vectors