Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| import spacy | |
| from collections import Counter | |
| import heapq | |
| import io | |
# Load the pinned spaCy English pipeline from a local directory so the app
# does not depend on a network download at startup.
nlp = spacy.load("./en_core_web_sm-3.7.1")
def read_pdf(file_stream):
    """Extract the plain text of every page in a PDF.

    Args:
        file_stream: A binary file-like object (or path) containing PDF data,
            as accepted by ``PyPDF2.PdfReader``.

    Returns:
        All page texts joined with single spaces, stripped of leading and
        trailing whitespace. Pages with no extractable text contribute
        nothing.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # extract_text() can return None for pages with no extractable text
    # (e.g. scanned images); coerce those to '' instead of crashing on '+'.
    page_texts = (page.extract_text() or '' for page in reader.pages)
    # Single join instead of repeated += (avoids quadratic concatenation).
    return ' '.join(page_texts).strip()
def extract_key_phrases(text):
    """Return candidate key phrases for *text*.

    Candidates are the texts of all noun chunks followed by the texts of all
    named entities, in document order; duplicates are kept.
    """
    doc = nlp(text)
    noun_chunk_texts = [chunk.text for chunk in doc.noun_chunks]
    entity_texts = [ent.text for ent in doc.ents]
    return noun_chunk_texts + entity_texts
def score_sentences(text, key_phrases):
    """Score each sentence by how many key phrases occur in it.

    Args:
        text: The full document text; re-parsed with the module-level ``nlp``.
        key_phrases: Iterable of phrase strings; each is matched as a plain
            substring of the sentence text, and duplicates count repeatedly.

    Returns:
        Dict mapping spaCy sentence spans to integer scores. Sentences that
        contain no key phrase are omitted, matching the original behavior.
    """
    doc = nlp(text)
    sentence_scores = {}
    for sent in doc.sents:
        # Hoist the loop-invariant text materialization out of the inner loop.
        sent_text = sent.text
        score = sum(1 for phrase in key_phrases if phrase in sent_text)
        if score:
            sentence_scores[sent] = score
    return sentence_scores
def summarize_text(sentence_scores, num_points=5):
    """Format the top-scoring sentences as a bullet-point summary.

    Args:
        sentence_scores: Mapping from sentence objects (anything exposing a
            ``.text`` attribute) to numeric scores.
        num_points: Maximum number of bullets to include.

    Returns:
        A string of "- <sentence>" lines joined by newlines; empty when
        *sentence_scores* is empty.
    """
    # sorted(..., reverse=True)[:n] is the documented equivalent of
    # heapq.nlargest(n, ...).
    top_sentences = sorted(
        sentence_scores, key=sentence_scores.get, reverse=True
    )[:num_points]
    bullets = []
    for sentence in top_sentences:
        bullets.append(f"- {sentence.text}")
    return '\n'.join(bullets)