Spaces:

studyonly
/

PDF-Summarizer-studyonly

Sleeping

PDF-Summarizer-studyonly / main.py

PDF Summarizer version 1

789bf02 verified 6 months ago

1.57 kB

	import PyPDF2
	import spacy
	import subprocess
	from collections import Counter
	import heapq
	import io

	# Load the small English spaCy pipeline (en_core_web_sm). If loading fails
	# with OSError, download the model once and retry so a fresh Space does
	# not fail at import time because the model is missing.
	try:
	    nlp = spacy.load("en_core_web_sm")
	except OSError:
	    # Imported here because it is only needed on this fallback path.
	    import sys
	    # Use the running interpreter rather than whatever "python" resolves
	    # to on PATH, so the model is installed into the same environment that
	    # imports spaCy. check=True surfaces a failed download immediately
	    # instead of a second, less informative OSError from the retry below.
	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
	        check=True,
	    )
	    nlp = spacy.load("en_core_web_sm")

	def read_pdf(file_stream):
	    """Read the text content of a PDF.

	    Args:
	        file_stream: The PDF source, passed unchanged to
	            ``PyPDF2.PdfReader`` (presumably an open binary stream, going
	            by the parameter name -- confirm against callers).

	    Returns:
	        The text extracted from every page, joined by single spaces, with
	        leading and trailing whitespace stripped. An empty string when no
	        page yields text.
	    """
	    reader = PyPDF2.PdfReader(file_stream)
	    # Collect the per-page text and join once instead of growing a string
	    # with ``+=`` inside the loop. ``or ''`` treats a page whose extraction
	    # result is None/empty as empty text, so a single unextractable page
	    # cannot raise TypeError and abort the whole document.
	    page_texts = [page.extract_text() or '' for page in reader.pages]
	    return ' '.join(page_texts).strip()

	def extract_key_phrases(text):
	    """Extract key phrases and named entities from the text.

	    Args:
	        text: The text to run through the module-level ``nlp`` pipeline.

	    Returns:
	        A list of strings: the text of every noun chunk first, followed by
	        the text of every named entity. Duplicates are kept.
	    """
	    parsed = nlp(text)
	    phrases = []
	    # Noun chunks come first, then entities, matching the output order
	    # callers already rely on.
	    phrases.extend(chunk.text for chunk in parsed.noun_chunks)
	    phrases.extend(entity.text for entity in parsed.ents)
	    return phrases

	def score_sentences(text, key_phrases):
	    """Score each sentence by how many key phrases it contains.

	    Args:
	        text: The text to split into sentences with the module-level
	            ``nlp`` pipeline.
	        key_phrases: The phrases to look for. Each occurrence in the
	            collection counts, so a repeated phrase adds to the score
	            once per repetition.

	    Returns:
	        A dict mapping each sentence span to the number of phrases found
	        in its text as substrings. Sentences with no match are omitted.
	    """
	    scores = {}
	    for sentence in nlp(text).sents:
	        # One point per phrase that appears in the sentence text.
	        hits = sum(1 for phrase in key_phrases if phrase in sentence.text)
	        if hits:
	            scores[sentence] = hits
	    return scores

	def summarize_text(sentence_scores, num_points=5):
	    """Pick the highest-scoring sentences and format them as a bullet list.

	    Args:
	        sentence_scores: A dict mapping sentence objects (each exposing a
	            ``text`` attribute) to their scores.
	        num_points: The maximum number of sentences to include.

	    Returns:
	        A string with one ``- <sentence text>`` line per selected sentence,
	        ordered from highest to lowest score and joined by newlines. An
	        empty string when nothing is selected.
	    """
	    top_sentences = heapq.nlargest(
	        num_points,
	        sentence_scores,
	        key=sentence_scores.get,
	    )
	    bullet_lines = (f"- {sentence.text}" for sentence in top_sentences)
	    return '\n'.join(bullet_lines)