Spaces:

HaarisIqubal
/

summarize_documents

Sleeping

Synced repo using 'sync_with_huggingface' Github Action

c075e70 verified about 1 year ago

1.03 kB

	import re
	from collections import Counter
	from functools import lru_cache

	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import sent_tokenize, word_tokenize

	# Fetch the NLTK data packages at import time, in a fixed order.
	# 'stopwords' backs stopwords.words('english') used below; 'punkt' and
	# 'punkt_tab' are presumably the tokenizer models needed by
	# sent_tokenize / word_tokenize (which one depends on the NLTK version).
	for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
	    nltk.download(_nltk_resource)

	@lru_cache(maxsize=1)
	def _english_stop_words():
	    """Return the English stop-word set, loading it from NLTK only once."""
	    return frozenset(stopwords.words('english'))


	def preporcess_text(text):
	    """Tokenize *text* into lowercased content words.

	    The text is split with ``word_tokenize``; tokens that are not purely
	    alphanumeric (punctuation, hyphenated words, contractions) are dropped,
	    the rest are lowercased, and English stop words are removed.

	    The (misspelled) function name is kept as-is because other code calls it.

	    Args:
	        text: The string to tokenize.

	    Returns:
	        A list of lowercased tokens in their original order; duplicates
	        are kept.
	    """
	    # The stop-word list is cached: this function is called once per
	    # sentence by sentence_score, and reloading the corpus each time is
	    # wasted work.
	    stop_words = _english_stop_words()
	    lowered = (token.lower() for token in word_tokenize(text) if token.isalnum())
	    return [token for token in lowered if token not in stop_words]

	def sentence_score(text):
	    """Score every sentence of *text* by the frequency of its words.

	    Word frequencies are counted over the whole text after preprocessing
	    with ``preporcess_text``. A sentence's score is the sum of those
	    frequencies over its own preprocessed words, so repeated words count
	    each time they occur.

	    Args:
	        text: The document to score.

	    Returns:
	        A dict mapping each sentence string to its integer score, in the
	        order the sentences were produced by ``sent_tokenize``. Identical
	        sentences share a single entry.
	    """
	    candidates = sent_tokenize(text)
	    frequency = Counter(preporcess_text(text))
	    return {
	        candidate: sum(frequency[token] for token in preporcess_text(candidate))
	        for candidate in candidates
	    }

	def summarize_text(text, num_sentences=10):
	    """Build an extractive summary from the highest-scoring sentences.

	    Sentences are scored with ``sentence_score`` and ranked from highest
	    to lowest score; ties keep their original relative order because the
	    sort is stable. The top sentences are joined with single spaces, in
	    ranking order (not document order).

	    Args:
	        text: The document to summarize.
	        num_sentences: Maximum number of sentences to include. ``None``
	            means no limit. Zero or a negative value yields an empty
	            summary.

	    Returns:
	        The summary string; empty when no sentences are selected.
	    """
	    # A negative count would otherwise slice as "all but the last |n|"
	    # sentences, which is never what a caller asking for n sentences wants.
	    if num_sentences is not None and num_sentences <= 0:
	        return ""

	    scores = sentence_score(text)
	    ranked_sentences = sorted(scores, key=scores.get, reverse=True)
	    return " ".join(ranked_sentences[:num_sentences])