Spaces:

HaarisIqubal
/

summarize_documents

Sleeping

File size: 1,025 Bytes

c075e70

import re
from collections import Counter
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Request the NLTK data packages this module relies on (the stop-word corpus
# and the Punkt tokenizer data). This executes every time the module is
# imported.
for _nltk_package in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_package)

def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built only once.

    The set is memoised on the function object so that repeated calls to
    ``preporcess_text`` do not rebuild it from the corpus every time.
    """
    cached = getattr(_english_stop_words, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


def preporcess_text(text):
    """Tokenize *text* into lower-cased, non-stop-word alphanumeric tokens.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        A list of lower-cased tokens in order of appearance. A token is kept
        only if ``str.isalnum()`` is true for it and its lower-cased form is
        not in the English stop-word list.
    """
    stop_words = _english_stop_words()
    words = []
    for token in word_tokenize(text):
        # Tokens containing any non-alphanumeric character (punctuation,
        # hyphenated words, ...) are dropped entirely.
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            words.append(lowered)
    return words

def sentence_score(text):
    """Score each sentence of *text* by the frequency of the words it contains.

    Word frequencies are counted once over the whole text (after
    ``preporcess_text``); a sentence's score is the sum of those counts over
    its own preprocessed words.

    Args:
        text: The document to split into sentences and score.

    Returns:
        A dict mapping each sentence string to its integer score, in the
        order the sentences were produced by ``sent_tokenize``. Identical
        sentence strings share a single entry.
    """
    sentence_list = sent_tokenize(text)
    frequency = Counter(preporcess_text(text))
    return {
        sentence: sum(frequency[token] for token in preporcess_text(sentence))
        for sentence in sentence_list
    }

def summarize_text(text, num_sentences=10, *, keep_original_order=False):
    """Build an extractive summary from the highest-scoring sentences.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to include. Zero or a
            negative value yields an empty summary; ``None`` selects every
            sentence.
        keep_original_order: When true, the selected sentences are emitted
            in the order ``sentence_score`` returned them instead of by
            descending score. Defaults to False, which keeps the
            score-ranked ordering.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when no
        sentence is selected.
    """
    scores = sentence_score(text)
    # sorted() is stable, so sentences with equal scores keep their
    # relative order from ``scores``.
    ranked_sentences = sorted(scores, key=scores.get, reverse=True)

    # A negative count would be interpreted as a slice offset from the end
    # and return almost every sentence, so clamp it to zero.
    limit = None if num_sentences is None else max(num_sentences, 0)
    selected = ranked_sentences[:limit]

    if keep_original_order:
        chosen = set(selected)
        selected = [sentence for sentence in scores if sentence in chosen]

    return " ".join(selected)