Spaces:

DavidWill
/

web-summarizer-agent

Sleeping

App Files Files Community

web-summarizer-agent / app.py

DavidWill

Update app.py

7eaac20 verified 2 months ago

raw

history blame contribute delete

3.21 kB

	# app.py
	import gradio as gr
	import trafilatura
	import jieba
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np
	import re

	def clean_text(text):
	    """Collapse each run of whitespace in ``text`` to one space and trim the ends.

	    Only whitespace is altered; every other character is left as it is.
	    """
	    whitespace_run = re.compile(r'\s+')
	    return whitespace_run.sub(' ', text).strip()

	def textrank_summary(text, target_min=300, target_max=500):
	    """Build an extractive summary of ``text`` from its highest-scoring sentences.

	    The text is split on ``。``, ``！``, ``？`` and newlines, and fragments of ten
	    characters or fewer are dropped. Each remaining sentence is tokenized with
	    jieba, vectorized with TF-IDF and scored by its mean cosine similarity to
	    all sentences (a simplified TextRank). Sentences are then taken in
	    descending score order until the summary reaches ``target_min`` characters.

	    Args:
	        text: Source text to summarize.
	        target_min: Summary length (in characters) at which selection stops.
	        target_max: Upper bound on the length of the returned string.

	    Returns:
	        ``text`` itself when it is no longer than ``target_max``; a prefix of
	        at most ``target_max`` characters when no usable sentence exists or
	        TF-IDF vectorization raises ``ValueError``; otherwise the selected
	        sentences, each terminated with ``。``.
	    """
	    # Split into sentences on full stops, exclamation/question marks and newlines.
	    fragments = (piece.strip() for piece in re.split(r'[。！？\n]', text))
	    sentences = [frag for frag in fragments if len(frag) > 10]  # drop very short ones

	    if not sentences:
	        # Honor the caller's limit instead of a fixed 500-character cut.
	        return text[:target_max]

	    # Text that already fits needs no summarizing.
	    if len(text) <= target_max:
	        return text

	    # Chinese word segmentation (jieba), joined by spaces for the vectorizer.
	    tokenized = [' '.join(jieba.cut(sent)) for sent in sentences]

	    # TF-IDF vectorization; fall back to plain truncation if it cannot be fitted.
	    vectorizer = TfidfVectorizer()
	    try:
	        tfidf = vectorizer.fit_transform(tokenized)
	    except ValueError:
	        return ' '.join(sentences)[:target_max]

	    # Simplified TextRank: a sentence's score is its mean similarity to all sentences.
	    sim_mat = cosine_similarity(tfidf)
	    scores = np.mean(sim_mat, axis=1)

	    # Highest score first; the sort is stable, so ties keep document order.
	    ranked = sorted(zip(sentences, scores), key=lambda pair: pair[1], reverse=True)

	    chosen = []
	    used = 0  # characters committed so far, closing '。' included
	    for sent, _ in ranked:
	        if used >= target_min:
	            break
	        # Reserve one character for the '。' appended to every piece, so the
	        # final string can never grow past target_max.
	        room = target_max - used - 1
	        if room <= 0:
	            break
	        if len(sent) > room:
	            # Truncate the last sentence to fill the remaining budget exactly.
	            chosen.append(sent[:room])
	            break
	        chosen.append(sent)
	        used += len(sent) + 1

	    return ''.join(piece + '。' for piece in chosen)

	def summarize_url(url):
	    """Download a web page and return a summary of its main text.

	    The URL must start with ``http://`` or ``https://`` (surrounding whitespace
	    is ignored). The page is fetched and its text extracted with trafilatura,
	    whitespace is normalized with ``clean_text`` and the result is summarized
	    with ``textrank_summary`` using ``target_min=300`` and ``target_max=500``.

	    Args:
	        url: Address of the page to summarize.

	    Returns:
	        The summary on success, otherwise a user-facing message describing
	        why the page could not be processed. Errors raised while fetching,
	        extracting or summarizing are reported as text, not propagated.
	    """
	    # Tolerate pasted whitespace, and require a real scheme prefix: a bare
	    # "http" prefix check would let strings such as "httpfoo" through.
	    url = (url or "").strip()
	    if not url.startswith(("http://", "https://")):
	        return "請輸入有效網址（需包含 http:// 或 https://）"

	    try:
	        downloaded = trafilatura.fetch_url(url)
	        if not downloaded:
	            return "無法載入該網頁，請確認網址是否正確。"

	        text = trafilatura.extract(
	            downloaded,
	            include_comments=False,
	            include_tables=False,
	            output_format="txt",
	        )

	        # Pages yielding fewer than 100 characters are treated as non-articles.
	        if not text or len(text.strip()) < 100:
	            return "無法提取足夠的文字內容（可能不是文章頁面）。"

	        text = clean_text(text)
	        return textrank_summary(text, target_min=300, target_max=500)

	    except Exception as e:
	        # UI boundary: show the failure in the output box instead of raising.
	        return f"處理錯誤：{e}"

	# Gradio UI: a URL box, a trigger button and a text box that shows the summary.
	with gr.Blocks(title="中文網頁重點整理（300–500字）") as demo:
	    gr.Markdown(
	        "## 🌐 中文網頁重點整理 Agent\n"
	        "輸入網址，自動提取 300–500 字重點內容（CPU 友好，無需模型）"
	    )
	    url_input = gr.Textbox(
	        label="請輸入網址",
	        placeholder="https://example.com/chinese-article",
	    )
	    output = gr.Textbox(label="重點摘要", lines=10)
	    btn = gr.Button("開始整理")
	    # Clicking the button passes the URL box to summarize_url and displays
	    # whatever string it returns in the summary box.
	    btn.click(fn=summarize_url, inputs=url_input, outputs=output)
	    gr.Markdown("💡 適用於新聞、報告、部落格等文字型網頁。")

	# Start the app as soon as the script is executed.
	demo.launch()