# app.py
import gradio as gr
import trafilatura
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


def clean_text(text):
    # Collapse runs of whitespace; keep Chinese/English/digits/punctuation as-is
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def textrank_summary(text, target_min=300, target_max=500):
    # Split into sentences on full stops, exclamation/question marks, and newlines
    sentences = re.split(r'[。!?\n]', text)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]  # drop very short sentences
    if not sentences:
        return text[:500]
    # If the full text is already within the target length, return it unchanged
    if len(text) <= target_max:
        return text
    # Chinese word segmentation with jieba
    tokenized = [' '.join(jieba.cut(sent)) for sent in sentences]
    # TF-IDF vectorization
    vectorizer = TfidfVectorizer()
    try:
        tfidf = vectorizer.fit_transform(tokenized)
    except ValueError:
        return ' '.join(sentences)[:500]
    # Pairwise sentence similarity
    sim_mat = cosine_similarity(tfidf)
    # Simplified TextRank: use each sentence's mean similarity as its score
    scores = np.mean(sim_mat, axis=1)
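    # NOTE: mean similarity is a degree-centrality shortcut, not full TextRank.
    # A faithful version would zero the diagonal and run PageRank-style power
    # iteration over sim_mat, roughly (untested sketch, damping factor 0.85):
    #     np.fill_diagonal(sim_mat, 0)
    #     norm = sim_mat / sim_mat.sum(axis=1, keepdims=True)
    #     scores = np.ones(len(sentences)) / len(sentences)
    #     for _ in range(30):
    #         scores = 0.15 + 0.85 * norm.T @ scores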
    # Rank sentences by score, remembering each one's original position
    ranked = sorted(
        zip(range(len(sentences)), sentences, scores),
        key=lambda x: x[2], reverse=True
    )
    # Take the highest-scoring sentences until the target length is reached
    result = []
    total_len = 0
    for idx, sent, _ in ranked:
        if total_len >= target_min:
            break
        if total_len + len(sent) > target_max:
            # Truncate the final sentence to stay within target_max
            remaining = target_max - total_len
            result.append((idx, sent[:remaining]))
            break
        result.append((idx, sent))
        total_len += len(sent)
    # Restore document order so the summary reads coherently
    result.sort(key=lambda x: x[0])
    return ''.join(s + '。' for _, s in result if s)
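
# Quick sanity check for the summarizer (hypothetical snippet; run it in a
# REPL rather than at module level, since importing this file launches the UI):
#     demo_text = "這是一段足夠長的測試句子,用來驗證摘要流程是否正常運作。" * 30
#     print(textrank_summary(demo_text))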

def summarize_url(url):
    if not url or not url.startswith("http"):
        return "請輸入有效網址(需包含 http:// 或 https://)"
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return "無法載入該網頁,請確認網址是否正確。"
        # Extract the main article text, skipping comments and tables
        text = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=False,
            output_format="txt"
        )
        if not text or len(text.strip()) < 100:
            return "無法提取足夠的文字內容(可能不是文章頁面)。"
        text = clean_text(text)
        summary = textrank_summary(text, target_min=300, target_max=500)
        return summary
    except Exception as e:
        return f"處理錯誤:{str(e)}"

# Gradio UI
with gr.Blocks(title="中文網頁重點整理(300–500字)") as demo:
    gr.Markdown("## 🌐 中文網頁重點整理 Agent\n輸入網址,自動提取 **300–500 字重點內容**(CPU 友好,無需模型)")
    url_input = gr.Textbox(label="請輸入網址", placeholder="https://example.com/chinese-article")
    output = gr.Textbox(label="重點摘要", lines=10)
    btn = gr.Button("開始整理")
    btn.click(fn=summarize_url, inputs=url_input, outputs=output)
    gr.Markdown("💡 適用於新聞、報告、部落格等文字型網頁。")

demo.launch()
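
# Dependencies for a CPU-only Space (all imported above; a minimal, unpinned
# requirements.txt — exact versions are left to the deployer):
#     gradio
#     trafilatura
#     jieba
#     scikit-learn
#     numpy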