DavidWill committed on
Commit
7eaac20
·
verified ·
1 Parent(s): 2592248

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -25
app.py CHANGED
@@ -1,39 +1,97 @@
1
  # app.py
2
- from transformers import pipeline
3
  import gradio as gr
4
  import trafilatura
 
 
 
 
 
5
 
6
- # 使用公開可用的多語摘要模型
7
- #from transformers import pipeline
8
- summarizer = pipeline(
9
- "summarization",
10
- model="csebuetnlp/mT5_multilingual_XLSum",
11
- tokenizer="csebuetnlp/mT5_multilingual_XLSum",
12
- device=-1
13
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def summarize_url(url):
16
  if not url or not url.startswith("http"):
17
- return "請輸入有效網址"
 
18
  try:
19
  downloaded = trafilatura.fetch_url(url)
20
  if not downloaded:
21
- return "無法載入網頁"
22
- text = trafilatura.extract(downloaded, include_comments=False, output_format="txt")
23
- if not text or len(text) < 50:
24
- return "內容太少或非文字頁面"
25
- # 模型雖支援長文本,但 CPU 建議縮短
26
- input_text = text[:800]
27
- summary = summarizer(input_text, max_length=120, min_length=40, do_sample=False)
28
- return summary[0]['summary_text']
 
 
 
 
 
 
 
 
29
  except Exception as e:
30
- return f"錯誤:{str(e)}"
31
 
32
- with gr.Blocks() as demo:
33
- gr.Markdown("## 🌐 網頁重點整理(支援中文)")
34
- url = gr.Textbox(label="網址")
35
- out = gr.Textbox(label="摘要", lines=6)
36
- btn = gr.Button("整理")
37
- btn.click(summarize_url, url, out)
 
 
38
 
39
  demo.launch()
 
1
  # app.py
 
2
  import gradio as gr
3
  import trafilatura
4
+ import jieba
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ import numpy as np
8
+ import re
9
 
10
def clean_text(text):
    """Normalize whitespace: collapse every run of whitespace (spaces,
    tabs, newlines) into a single space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
14
+
15
def textrank_summary(text, target_min=300, target_max=500):
    """Extractive summary via a simplified TextRank.

    Splits *text* into sentences, scores each sentence by its mean TF-IDF
    cosine similarity to all other sentences, then concatenates the
    top-ranked sentences until the summary length falls between
    ``target_min`` and ``target_max`` characters.

    Args:
        text: Cleaned article text (Chinese or mixed-language).
        target_min: Stop adding sentences once this many chars collected.
        target_max: Hard cap on summary length; the last sentence is
            truncated to fit.

    Returns:
        Summary string. Falls back to a plain prefix of *text* when no
        usable sentences are found or TF-IDF vectorization fails.
    """
    # Split on Chinese/fullwidth sentence enders and newlines.
    sentences = re.split(r'[。!?\n]', text)
    # Drop fragments too short to carry meaning.
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

    if not sentences:
        # Fix: honor target_max instead of a hard-coded 500.
        return text[:target_max]

    # Text already within budget: return it unchanged.
    if len(text) <= target_max:
        return text

    # Tokenize with jieba so TfidfVectorizer sees space-separated terms.
    tokenized = [' '.join(jieba.cut(sent)) for sent in sentences]

    vectorizer = TfidfVectorizer()
    try:
        tfidf = vectorizer.fit_transform(tokenized)
    except ValueError:
        # Vocabulary may be empty (e.g. only stop words); fall back.
        # Fix: honor target_max instead of a hard-coded 500.
        return ' '.join(sentences)[:target_max]

    # Pairwise sentence similarity matrix.
    sim_mat = cosine_similarity(tfidf)

    # Simplified TextRank: each sentence's score is its mean similarity
    # to every sentence (no iterative PageRank power method).
    scores = np.mean(sim_mat, axis=1)

    # Highest-scoring sentences first.
    ranked = sorted(zip(sentences, scores), key=lambda x: x[1], reverse=True)

    # Greedily collect sentences until the length target is met.
    result = []
    total_len = 0
    for sent, _ in ranked:
        if total_len >= target_min:
            break
        if total_len + len(sent) > target_max:
            # Truncate the final sentence to stay within target_max.
            remaining = target_max - total_len
            result.append(sent[:remaining])
            break
        result.append(sent)
        total_len += len(sent)

    # Re-attach the sentence-final periods lost during splitting.
    return ''.join([s + '。' for s in result if s])
61
 
62
def summarize_url(url):
    """Fetch *url*, extract its main article text, and return a summary.

    Returns a human-readable (Chinese) error-message string on any
    failure instead of raising, since the result feeds a Gradio textbox.
    """
    # Fix: enforce an http(s) scheme, as the error message promises,
    # instead of accepting any string merely starting with "http".
    if not url or not url.startswith(("http://", "https://")):
        return "請輸入有效網址(需包含 http:// 或 https://)"

    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return "無法載入該網頁,請確認網址是否正確。"

        # Extract the readable article body, skipping comments and tables.
        text = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=False,
            output_format="txt"
        )

        if not text or len(text.strip()) < 100:
            return "無法提取足夠的文字內容(可能不是文章頁面)。"

        text = clean_text(text)
        summary = textrank_summary(text, target_min=300, target_max=500)
        return summary

    except Exception as e:
        # UI boundary handler: surface the error text rather than crash.
        return f"處理錯誤:{str(e)}"
87
 
88
# Gradio interface: a URL input, a run button, and a summary output box.
with gr.Blocks(title="中文網頁重點整理(300–500字)") as demo:
    gr.Markdown("## 🌐 中文網頁重點整理 Agent\n輸入網址,自動提取 **300–500 字重點內容**(CPU 友好,無需模型)")
    link_box = gr.Textbox(label="請輸入網址", placeholder="https://example.com/chinese-article")
    summary_box = gr.Textbox(label="重點摘要", lines=10)
    run_button = gr.Button("開始整理")
    run_button.click(fn=summarize_url, inputs=link_box, outputs=summary_box)
    gr.Markdown("💡 適用於新聞、報告、部落格等文字型網頁。")

demo.launch()