DavidWill's picture
Update app.py
7eaac20 verified
# app.py
import gradio as gr
import trafilatura
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
def clean_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()
def textrank_summary(text, target_min=300, target_max=500):
    """Build an extractive summary of *text* from its highest-scoring sentences.

    Scoring is a simplified TextRank: every sentence is tokenized with jieba,
    vectorized with TF-IDF, and scored by its mean cosine similarity to all
    sentences (itself included). Sentences are emitted in descending score
    order, each terminated by "。".

    Args:
        text: Text to summarize.
        target_min: Stop adding sentences once the summary is at least this
            many characters long.
        target_max: Upper bound on the length of the returned string; the
            last sentence is truncated if needed to respect it.

    Returns:
        ``text`` unchanged when it already fits within ``target_max``.
        The first ``target_max`` characters of ``text`` when no sentence
        longer than 10 characters is found. The first ``target_max``
        characters of the space-joined sentences when TF-IDF vectorization
        raises ``ValueError``. Otherwise the assembled summary.
    """
    # Text that already fits needs no summarizing.
    if len(text) <= target_max:
        return text

    # Split on sentence-ending punctuation (full-width and ASCII) and newlines,
    # then drop fragments of 10 characters or fewer.
    stripped = (piece.strip() for piece in re.split(r'[。!?!?\n]', text))
    sentences = [s for s in stripped if len(s) > 10]
    if not sentences:
        return text[:target_max]

    # TfidfVectorizer expects whitespace-delimited tokens, so segment with
    # jieba and re-join each sentence's words with spaces.
    tokenized = [' '.join(jieba.cut(sent)) for sent in sentences]

    vectorizer = TfidfVectorizer()
    try:
        tfidf = vectorizer.fit_transform(tokenized)
    except ValueError:
        # Vectorization failed (e.g. no usable vocabulary); fall back to the
        # leading sentences instead of ranking.
        return ' '.join(sentences)[:target_max]

    # Simplified TextRank: a sentence's score is its average similarity to
    # every sentence rather than the result of an iterated graph walk.
    sim_mat = cosine_similarity(tfidf)
    scores = sim_mat.mean(axis=1)

    # sorted() is stable, so equally scored sentences keep document order.
    ranked = sorted(zip(sentences, scores), key=lambda x: x[1], reverse=True)

    # Add sentences until target_min is reached. The "。" appended to each
    # sentence counts against the budget so the result never exceeds
    # target_max.
    pieces = []
    used = 0
    for sent, _ in ranked:
        if used >= target_min:
            break
        room = target_max - used - 1  # reserve one character for "。"
        if room <= 0:
            break
        if len(sent) > room:
            # Truncate the final sentence to fill the remaining budget.
            pieces.append(sent[:room] + '。')
            break
        pieces.append(sent + '。')
        used += len(sent) + 1
    return ''.join(pieces)
def summarize_url(url):
    """Fetch a web page and return a short extractive summary of its text.

    Args:
        url: Page address. Must start with ``http://`` or ``https://``;
            surrounding whitespace is ignored.

    Returns:
        The summary from ``textrank_summary`` (300–500 character target), or
        a user-facing message explaining why no summary could be produced.
        Any ``Exception`` raised while fetching or processing is reported as
        text rather than propagated.
    """
    # Tolerate pasted whitespace and non-string input before validating.
    url = url.strip() if isinstance(url, str) else ""
    # Require a real scheme prefix; a bare "http" prefix would also accept
    # strings such as "httpfoo".
    if not url.startswith(("http://", "https://")):
        return "請輸入有效網址(需包含 http:// 或 https://)"
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return "無法載入該網頁,請確認網址是否正確。"
        text = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=False,
            output_format="txt"
        )
        # Fewer than 100 characters is treated as "not an article page".
        if not text or len(text.strip()) < 100:
            return "無法提取足夠的文字內容(可能不是文章頁面)。"
        text = clean_text(text)
        return textrank_summary(text, target_min=300, target_max=500)
    except Exception as e:
        # Deliberately broad: the return value is shown to the user, so
        # surface the error as a message instead of raising.
        return f"處理錯誤:{e}"
# Gradio UI: a URL text box in, a summary text box out, wired to summarize_url.
with gr.Blocks(title="中文網頁重點整理(300–500字)") as demo:
    # Header shown at the top of the page.
    gr.Markdown(
        value="## 🌐 中文網頁重點整理 Agent\n輸入網址,自動提取 **300–500 字重點內容**(CPU 友好,無需模型)"
    )
    url_input = gr.Textbox(
        placeholder="https://example.com/chinese-article",
        label="請輸入網址",
    )
    output = gr.Textbox(lines=10, label="重點摘要")
    btn = gr.Button(value="開始整理")
    # Clicking the button runs summarize_url on the URL box and writes the
    # returned string into the summary box.
    btn.click(summarize_url, inputs=url_input, outputs=output)
    # Footer hint about which kinds of pages work well.
    gr.Markdown(value="💡 適用於新聞、報告、部落格等文字型網頁。")

demo.launch()