import nltk
import re
import numpy as np

# English stop-word list, loaded once at import time.
# Requires the NLTK "stopwords" corpus to be present
# (nltk.download('stopwords')); raises LookupError otherwise.
english_stopwords = nltk.corpus.stopwords.words('english')
def extract_features(article):
    """Build a per-sentence feature matrix for *article*.

    The article is split into paragraphs on blank lines ("\\n\\n") and each
    paragraph into sentences.  For every non-empty sentence a 7-element
    feature vector is produced:

        0. 1 if the sentence opens its paragraph, else 0
        1. |pi * cos(paragraph_index)| / paragraph_count   (position signal)
        2. |pi * cos(sentence_index)| / approx_sentence_count
        3. sentence length in whitespace-separated words
        4. thematic-word score  (% of top-10 frequent non-stopwords)
        5. proper-noun score    (NNP-tagged words, scaled by 200)
        6. numeral score        (CD-tagged words, scaled by 300)

    :param article: plain-text article (str)
    :return: list of 7-element feature lists, one per sentence
    """
    # Sentence boundary: after . ? or ! — but not inside decimals like "3.14"
    # (the lookahead rejects a digit unless it is part of "<digit>.<digit>").
    sentence_pattern = r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))'

    all_words = nltk.tokenize.word_tokenize(article)

    # Frequency of lowercased, alphanumeric, non-stopword tokens.
    freq_dist = nltk.FreqDist(
        w.lower() for w in all_words
        if w.lower() not in english_stopwords and w.isalnum()
    )
    # Sets give O(1) membership in the per-word inner loop below
    # (the originals were lists — O(n) per lookup).
    most_common = {word for word, _count in freq_dist.most_common(10)}

    pos_tags = nltk.pos_tag(all_words)
    proper_nouns = {word for word, tag in pos_tags if tag == 'NNP'}
    numerals = {word for word, tag in pos_tags if tag == 'CD'}

    # Hoisted: this split was recomputed for every sentence.
    paragraphs = article.split("\n\n")
    n_paragraphs = len(paragraphs)

    features = []
    for j, para in enumerate(paragraphs):
        # Approximate sentence count used only as a scaling denominator;
        # kept as ". "-split to reproduce the original values exactly.
        # Hoisted out of the sentence loop (always >= 1, so no zero-division).
        approx_sentences = len(para.split(". "))

        for k, sente in enumerate(
            s.rstrip('.?!') for s in re.split(sentence_pattern, para)
        ):
            if not sente:
                continue

            words = sente.split(" ")
            vect = [
                1 if k == 0 else 0,                                    # paragraph opener?
                np.absolute(np.pi * np.cos(j) / n_paragraphs),
                np.absolute(np.pi * np.cos(k) / approx_sentences),
                len(words),                                            # length in words
            ]

            thematic = sum(1 for w in words if w in most_common)
            propnoun = sum(1 for w in words if w in proper_nouns)
            numeral = sum(1 for w in words if w in numerals)

            # NOTE(review): len(sente) here is the CHARACTER length of the
            # sentence, not its word count — this looks like it was meant to
            # be len(words).  Preserved as-is to keep feature values stable;
            # confirm against whatever model consumes these features.
            denom = len(sente) - thematic
            thematic_score = 100 * thematic / (denom + 1 if denom == 0 else denom)

            vect.extend([
                thematic_score,
                propnoun / len(sente) * 200,
                numeral / len(sente) * 300,
            ])
            features.append(vect)

    return features