DavidWill committed on
Commit
7eaac20
·
verified ·
1 Parent(s): 2592248

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -25
app.py CHANGED
@@ -1,39 +1,97 @@
1
  # app.py
2
- from transformers import pipeline
3
  import gradio as gr
4
  import trafilatura
 
 
 
 
 
5
 
6
- # 使用公開可用的多語摘要模型
7
- #from transformers import pipeline
8
- summarizer = pipeline(
9
- "summarization",
10
- model="csebuetnlp/mT5_multilingual_XLSum",
11
- tokenizer="csebuetnlp/mT5_multilingual_XLSum",
12
- device=-1
13
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def summarize_url(url):
16
  if not url or not url.startswith("http"):
17
- return "請輸入有效網址"
 
18
  try:
19
  downloaded = trafilatura.fetch_url(url)
20
  if not downloaded:
21
- return "無法載入網頁"
22
- text = trafilatura.extract(downloaded, include_comments=False, output_format="txt")
23
- if not text or len(text) < 50:
24
- return "內容太少或非文字頁面"
25
- # 模型雖支援長文本,但 CPU 建議縮短
26
- input_text = text[:800]
27
- summary = summarizer(input_text, max_length=120, min_length=40, do_sample=False)
28
- return summary[0]['summary_text']
 
 
 
 
 
 
 
 
29
  except Exception as e:
30
- return f"錯誤:{str(e)}"
31
 
32
- with gr.Blocks() as demo:
33
- gr.Markdown("## 🌐 網頁重點整理(支援中文)")
34
- url = gr.Textbox(label="網址")
35
- out = gr.Textbox(label="摘要", lines=6)
36
- btn = gr.Button("整理")
37
- btn.click(summarize_url, url, out)
 
 
38
 
39
  demo.launch()
 
1
  # app.py
 
2
  import gradio as gr
3
  import trafilatura
4
+ import jieba
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ import numpy as np
8
+ import re
9
 
10
def clean_text(text):
    """Normalize whitespace: collapse every run of whitespace (spaces,
    tabs, newlines) into a single space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
14
+
15
def textrank_summary(text, target_min=300, target_max=500):
    """Extractive summary via a simplified TextRank.

    Splits *text* into sentences, scores each sentence by its mean TF-IDF
    cosine similarity to all other sentences, then concatenates the
    top-ranked sentences until the summary length falls between
    ``target_min`` and ``target_max`` characters.

    Args:
        text: Cleaned article text (Chinese or mixed-language).
        target_min: Stop adding sentences once this many chars collected.
        target_max: Hard cap on summary length; the last sentence is
            truncated to fit.

    Returns:
        Summary string. Falls back to a plain prefix of *text* when no
        usable sentences are found or TF-IDF vectorization fails.
    """
    # Split on Chinese/fullwidth sentence enders and newlines.
    sentences = re.split(r'[。!?\n]', text)
    # Drop fragments too short to carry meaning.
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

    if not sentences:
        # Fix: honor target_max instead of a hard-coded 500.
        return text[:target_max]

    # Text already within budget: return it unchanged.
    if len(text) <= target_max:
        return text

    # Tokenize with jieba so TfidfVectorizer sees space-separated terms.
    tokenized = [' '.join(jieba.cut(sent)) for sent in sentences]

    vectorizer = TfidfVectorizer()
    try:
        tfidf = vectorizer.fit_transform(tokenized)
    except ValueError:
        # Vocabulary may be empty (e.g. only stop words); fall back.
        # Fix: honor target_max instead of a hard-coded 500.
        return ' '.join(sentences)[:target_max]

    # Pairwise sentence similarity matrix.
    sim_mat = cosine_similarity(tfidf)

    # Simplified TextRank: each sentence's score is its mean similarity
    # to every sentence (no iterative PageRank power method).
    scores = np.mean(sim_mat, axis=1)

    # Highest-scoring sentences first.
    ranked = sorted(zip(sentences, scores), key=lambda x: x[1], reverse=True)

    # Greedily collect sentences until the length target is met.
    result = []
    total_len = 0
    for sent, _ in ranked:
        if total_len >= target_min:
            break
        if total_len + len(sent) > target_max:
            # Truncate the final sentence to stay within target_max.
            remaining = target_max - total_len
            result.append(sent[:remaining])
            break
        result.append(sent)
        total_len += len(sent)

    # Re-attach the sentence-final periods lost during splitting.
    return ''.join([s + '。' for s in result if s])
61
 
62
def summarize_url(url):
    """Fetch *url*, extract its main article text, and return a summary.

    Returns a human-readable (Chinese) error-message string on any
    failure instead of raising, since the result feeds a Gradio textbox.
    """
    # Fix: enforce an http(s) scheme, as the error message promises,
    # instead of accepting any string merely starting with "http".
    if not url or not url.startswith(("http://", "https://")):
        return "請輸入有效網址(需包含 http:// 或 https://)"

    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return "無法載入該網頁,請確認網址是否正確。"

        # Extract the readable article body, skipping comments and tables.
        text = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=False,
            output_format="txt"
        )

        if not text or len(text.strip()) < 100:
            return "無法提取足夠的文字內容(可能不是文章頁面)。"

        text = clean_text(text)
        summary = textrank_summary(text, target_min=300, target_max=500)
        return summary

    except Exception as e:
        # UI boundary handler: surface the error text rather than crash.
        return f"處理錯誤:{str(e)}"
87
 
88
# Gradio interface: a URL input, a run button, and a summary output box.
with gr.Blocks(title="中文網頁重點整理(300–500字)") as demo:
    gr.Markdown("## 🌐 中文網頁重點整理 Agent\n輸入網址,自動提取 **300–500 字重點內容**(CPU 友好,無需模型)")
    link_box = gr.Textbox(label="請輸入網址", placeholder="https://example.com/chinese-article")
    summary_box = gr.Textbox(label="重點摘要", lines=10)
    run_button = gr.Button("開始整理")
    run_button.click(fn=summarize_url, inputs=link_box, outputs=summary_box)
    gr.Markdown("💡 適用於新聞、報告、部落格等文字型網頁。")

demo.launch()