Spaces:
Sleeping
Sleeping
import heapq
import io
import subprocess
import sys
from collections import Counter

import PyPDF2
import spacy
# Load the small English spaCy model. If it is not installed (spacy.load
# raises OSError), download it once and retry, so a Space that lacks the
# model does not fail with a missing-model error.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Run the download with the interpreter that is executing this file, not
    # whichever "python" happens to be first on PATH, so the model is
    # installed into the environment that will load it.
    # check=True makes a failed download raise CalledProcessError here
    # instead of being ignored.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")
def read_pdf(file_stream):
    """Read the text content of a PDF.

    Args:
        file_stream: The PDF source, passed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages, separated by single spaces, with
        leading and trailing whitespace stripped. An empty string when the
        document has no pages.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # Build the result with one join instead of growing a string with +=
    # for every page.
    return ' '.join(page.extract_text() for page in reader.pages).strip()
def extract_key_phrases(text):
    """Collect the key phrases and named entities found in the text.

    The text is parsed with the module-level ``nlp`` pipeline. The result
    lists the text of every noun chunk first, followed by the text of every
    entity; duplicates are kept.
    """
    parsed = nlp(text)
    phrases = []
    phrases.extend(chunk.text for chunk in parsed.noun_chunks)
    phrases.extend(entity.text for entity in parsed.ents)
    return phrases
def score_sentences(text, key_phrases):
    """Score each sentence by the number of key phrases it contains.

    Args:
        text: Text to analyse; it is parsed with the module-level ``nlp``
            pipeline and split into sentences.
        key_phrases: Iterable of phrase strings. Every entry counts, so a
            phrase that appears twice in the iterable adds two points to
            each sentence containing it.

    Returns:
        A dict mapping each sentence (an item of ``doc.sents``) to its
        score. Sentences that contain none of the phrases are omitted.
    """
    sentence_scores = {}
    for sent in nlp(text).sents:
        # Read the sentence text once per sentence rather than once per
        # phrase.
        sent_text = sent.text
        # Plain substring test: a phrase also matches inside a longer word.
        hits = sum(1 for phrase in key_phrases if phrase in sent_text)
        if hits:
            sentence_scores[sent] = sentence_scores.get(sent, 0) + hits
    return sentence_scores
def summarize_text(sentence_scores, num_points=5):
    """Pick the highest-scoring sentences and format them as a bullet list.

    Args:
        sentence_scores: Mapping of sentence objects (each exposing a
            ``text`` attribute) to their numeric score.
        num_points: Maximum number of sentences to include.

    Returns:
        A string with one ``- <sentence text>`` line per selected sentence,
        ordered from highest to lowest score.
    """
    top_sentences = heapq.nlargest(
        num_points,
        sentence_scores,
        key=sentence_scores.get,
    )
    bullet_lines = []
    for sentence in top_sentences:
        bullet_lines.append(f"- {sentence.text}")
    return '\n'.join(bullet_lines)