Spaces:

richpei
/

ptt-wordcloud-analyzer

Build error

App Files Files Community

ptt-wordcloud-analyzer / app.py

richpei

Upload 4 files

7394029 verified 4 months ago

raw

history blame contribute delete

10.3 kB

	"""
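	PTT board crawler and word-cloud analysis - Hugging Face Spaces edition

	Built on a Gradio interface and deployable to Hugging Face Spaces.
	The UI defaults to the HatePolitics board; other board names can be entered.

	Author: Claude
	Date: 2024-12-18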
	"""

	import io
	import logging
	import os
	import re
	import tempfile
	import time
	from collections import Counter

	import gradio as gr
	import jieba
	import matplotlib.pyplot as plt
	import requests
	from bs4 import BeautifulSoup
	from PIL import Image
	from wordcloud import WordCloud


	class PTTCrawler:
	    """Collect article titles from a PTT board and render them as a word cloud.

	    ``crawl_board`` walks the board's index pages from newest to older and
	    gathers titles; ``generate_wordcloud`` tokenises those titles with jieba,
	    counts the tokens and renders the counts as a PNG image.
	    """

	    # Titles containing any of these markers are skipped while crawling.
	    _SKIPPED_MARKERS = ('[公告]', '[協尋]', '(本文已被刪除)')

	    # Tokens consisting solely of ASCII letters are excluded from the cloud.
	    _ASCII_WORD = re.compile(r'^[a-zA-Z]+$')

	    # Font files with CJK glyphs, at the paths where common Linux, macOS and
	    # Windows installations usually place them. The first one that exists is
	    # used; WordCloud's built-in font cannot draw Chinese characters.
	    _CJK_FONT_CANDIDATES = (
	        '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
	        '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
	        '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc',
	        '/System/Library/Fonts/PingFang.ttc',
	        'C:/Windows/Fonts/msjh.ttc',
	    )

	    def __init__(self):
	        """Set the site root, the request headers and the stop-word set."""
	        self.base_url = "https://www.ptt.cc"
	        # The over18 cookie is sent with every request.
	        self.headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
	            'Cookie': 'over18=1'
	        }
	        # Tokens in this set never appear in the word cloud.
	        self.stopwords = {
	            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
	            '都', '一', '上', '也', '很', '到', '說', '要', '去', '你',
	            '會', '著', '沒有', '看', '好', 'Re', 'Fw', 'http', 'https',
	            'www', 'com', 'ptt', 'cc', '作者', '看板', '標題', '時間'
	        }

	    def crawl_board(self, board, max_articles, progress=gr.Progress()):
	        """Collect up to ``max_articles`` article titles from ``board``.

	        Crawling is best effort: a failed request ends the crawl and the
	        titles gathered so far are returned.

	        Args:
	            board: Board name used to build the index URL.
	            max_articles: Maximum number of titles to return.
	            progress: Callable invoked as ``progress(fraction, desc=...)``.

	        Returns:
	            A list of title strings, at most ``max_articles`` long.
	        """
	        # A non-integer count would break the slice at the end.
	        max_articles = int(max_articles)
	        articles = []
	        current_url = f"{self.base_url}/bbs/{board}/index.html"

	        progress(0, desc="開始爬取...")

	        # Upper bound on page requests; the division assumes about 20
	        # entries per index page.
	        # NOTE(review): pages with fewer usable entries can make the crawl
	        # stop short of max_articles - confirm whether that is acceptable.
	        total_pages = max_articles // 20 + 1
	        page_count = 0

	        while len(articles) < max_articles and current_url and page_count < total_pages:
	            try:
	                response = requests.get(current_url, headers=self.headers, timeout=10)
	                response.raise_for_status()
	            except requests.RequestException as exc:
	                # Keep whatever was collected instead of failing the whole run.
	                logging.getLogger(__name__).warning(
	                    "Stopped crawling %s: %s", current_url, exc)
	                break

	            response.encoding = 'utf-8'
	            soup = BeautifulSoup(response.text, 'html.parser')

	            for entry in soup.find_all('div', class_='r-ent'):
	                title_tag = entry.find('div', class_='title')
	                link = title_tag.find('a') if title_tag else None
	                if link is None:
	                    # Entry without a link in its title cell: nothing to collect.
	                    continue
	                title = link.text.strip()
	                if not any(marker in title for marker in self._SKIPPED_MARKERS):
	                    articles.append(title)

	            # A page can push the count past the target, so cap the fraction.
	            progress(min(len(articles) / max_articles, 1.0),
	                     desc=f"已爬取 {len(articles)} 篇...")

	            current_url = self._previous_page_url(soup)
	            page_count += 1
	            if current_url and len(articles) < max_articles and page_count < total_pages:
	                # Pause only when another request follows, to avoid being blocked.
	                time.sleep(0.5)

	        progress(1.0, desc="爬取完成！")
	        return articles[:max_articles]

	    def _previous_page_url(self, soup):
	        """Return the absolute URL of the previous index page, or None."""
	        paging = soup.find('div', class_='btn-group-paging')
	        if paging is None:
	            return None
	        buttons = paging.find_all('a', class_='btn')
	        if len(buttons) < 2:
	            return None
	        # The second paging button is the "previous page" link. It may carry
	        # no href (presumably when disabled on the oldest page - verify), so
	        # read the attribute without raising.
	        href = buttons[1].get('href')
	        return self.base_url + href if href else None

	    def _keep_word(self, word):
	        """Tell whether a token should be counted for the word cloud."""
	        return (
	            len(word) > 1
	            and word not in self.stopwords
	            and not word.isdigit()
	            and not self._ASCII_WORD.match(word)
	        )

	    def _find_cjk_font(self):
	        """Return the first candidate font file that exists, or None."""
	        return next(
	            (path for path in self._CJK_FONT_CANDIDATES if os.path.isfile(path)),
	            None,
	        )

	    def generate_wordcloud(self, texts, font_path=None):
	        """Render a word cloud from ``texts``.

	        Args:
	            texts: Iterable of strings (article titles).
	            font_path: Optional font file to draw with. When omitted, the
	                first existing entry of ``_CJK_FONT_CANDIDATES`` is used,
	                falling back to WordCloud's default font.

	        Returns:
	            A ``(image, word_freq)`` tuple: a PIL image of the rendered
	            figure and a ``Counter`` of the tokens that were kept.

	        Raises:
	            ValueError: If no token survives filtering.
	        """
	        all_text = ' '.join(texts)

	        # Strip bracketed tags such as "[xxx]" before tokenising.
	        all_text = re.sub(r'\[.*?\]', '', all_text)

	        word_freq = Counter(w for w in jieba.cut(all_text) if self._keep_word(w))
	        if not word_freq:
	            raise ValueError("過濾後沒有可用的詞彙，無法生成文字雲")

	        options = {
	            'width': 1200,
	            'height': 600,
	            'background_color': 'white',
	            'max_words': 150,
	            'relative_scaling': 0.5,
	            'colormap': 'viridis',
	        }
	        if font_path is None:
	            font_path = self._find_cjk_font()

	        try:
	            wordcloud = WordCloud(font_path=font_path, **options) \
	                .generate_from_frequencies(word_freq)
	        except OSError:
	            if font_path is None:
	                raise
	            # The chosen font could not be loaded; retry with the default
	            # font so an image is still produced.
	            logging.getLogger(__name__).warning(
	                "Could not load font %s; using the default font", font_path)
	            wordcloud = WordCloud(**options).generate_from_frequencies(word_freq)

	        # Work on the figure object directly instead of pyplot's global
	        # "current figure", and always release it.
	        fig, ax = plt.subplots(figsize=(12, 6), dpi=100)
	        try:
	            ax.imshow(wordcloud, interpolation='bilinear')
	            ax.axis('off')
	            ax.set_title('PTT 文字雲分析', fontsize=16, pad=10)
	            fig.tight_layout(pad=0)

	            buf = io.BytesIO()
	            fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
	        finally:
	            plt.close(fig)

	        buf.seek(0)
	        img = Image.open(buf)
	        # Decode now rather than lazily on first access.
	        img.load()

	        return img, word_freq


	def analyze_ptt(board, num_articles, progress=gr.Progress()):
	    """Crawl a PTT board and build the word cloud, statistics and export file.

	    Args:
	        board: Board name typed by the user.
	        num_articles: Number of article titles to collect (at least 10).
	        progress: Gradio progress tracker, called as ``progress(fraction, desc=...)``.

	    Returns:
	        A ``(image, message, file_path)`` tuple. On success ``image`` is the
	        word-cloud image, ``message`` the statistics text and ``file_path``
	        the path of a text file listing the titles. On failure ``image`` and
	        ``file_path`` are ``None`` and ``message`` describes the problem.
	    """
	    board = (board or "").strip()
	    if not board:
	        return None, "❌ 請輸入看板名稱", None

	    if num_articles < 10:
	        return None, "❌ 文章數量至少需要 10 篇", None

	    # The board name is placed in a URL path and a file name, so only accept
	    # characters that cannot introduce path separators.
	    if not re.fullmatch(r'[A-Za-z0-9_.\-]+', board):
	        return None, f"❌ 無法爬取 {board} 版，請確認看板名稱是否正確", None

	    try:
	        crawler = PTTCrawler()

	        progress(0, desc="開始爬取文章...")
	        articles = crawler.crawl_board(board, num_articles, progress)

	        if not articles:
	            return None, f"❌ 無法爬取 {board} 版，請確認看板名稱是否正確", None

	        progress(0.8, desc="生成文字雲...")
	        wordcloud_img, word_freq = crawler.generate_wordcloud(articles)

	        # Statistics header followed by the 20 most frequent words.
	        stats = f"""
	📊 分析結果

	🔍 看板: {board}
	📝 文章數量: {len(articles)} 篇

	📈 前 20 個最常見的詞:
	"""
	        stats += "".join(
	            f"\n{i:2d}. {word}: {freq} 次"
	            for i, (word, freq) in enumerate(word_freq.most_common(20), 1)
	        )

	        # Numbered list of titles offered as a download.
	        download_content = f"PTT {board} 版文章標題\n{'='*60}\n\n" + "".join(
	            f"{i}. {article}\n" for i, article in enumerate(articles, 1)
	        )

	        # Write into a fresh temporary directory so concurrent sessions do
	        # not overwrite each other's export and the working directory stays
	        # clean. The directory is left in place because the caller still
	        # needs the file after this function returns.
	        export_dir = tempfile.mkdtemp(prefix="ptt_wordcloud_")
	        filename = os.path.join(export_dir, f"ptt_{board}_articles.txt")
	        with open(filename, 'w', encoding='utf-8') as f:
	            f.write(download_content)

	        progress(1.0, desc="✅ 完成！")

	        return wordcloud_img, stats, filename

	    except Exception as e:
	        # UI boundary: report the failure to the user instead of raising.
	        return None, f"❌ 發生錯誤: {e}", None


	# Build the Gradio interface.
	# Markdown shown at the top of the page.
	_INTRO_MD = """
	# 📰 PTT 文字雲分析工具

	分析 PTT 看板文章，生成文字雲視覺化

	支援的看板：HatePolitics (政治), Gossiping (八卦), Stock (股票), NBA, Baseball 等

	---
	"""

	# Markdown shown inside the collapsible usage guide.
	_USAGE_MD = """
	### 如何使用

	1. 輸入看板名稱：例如 `HatePolitics`（政治版）、`Gossiping`（八卦版）
	2. 選擇文章數量：建議 50-200 篇，數量越多越慢
	3. 點擊「開始分析」：等待爬取和分析完成
	4. 查看結果：
	- 左側顯示統計資訊和熱門詞彙
	- 右側顯示文字雲圖片
	- 可下載文章列表

	### 常見看板名稱

	- `HatePolitics` - 政治版
	- `Gossiping` - 八卦版
	- `Stock` - 股票版
	- `NBA` - NBA 版
	- `Baseball` - 棒球版
	- `C_Chat` - 希洽版
	- `Tech_Job` - 科技工作版

	### 注意事項

	- ⏱️ 爬取需要時間，請耐心等待
	- 🚦 建議不要選擇太多文章（避免被封鎖）
	- 🔒 部分看板需要登入或有限制
	- 📊 文字雲只顯示最常見的詞彙

	### 技術說明

	- 使用 `jieba` 進行中文分詞
	- 自動過濾停用詞（的、了、在等）
	- 移除文章標籤和特殊符號
	- 生成高解析度文字雲圖片
	"""

	# Board / article-count pairs offered as one-click examples.
	_EXAMPLES = [
	    ["HatePolitics", 100],
	    ["Gossiping", 150],
	    ["Stock", 100],
	    ["NBA", 80],
	]

	with gr.Blocks(theme=gr.themes.Soft(), title="PTT 文字雲分析") as app:

	    gr.Markdown(_INTRO_MD)

	    with gr.Row():
	        # Left column: inputs, trigger button and textual statistics.
	        with gr.Column(scale=1):
	            board_input = gr.Textbox(
	                value="HatePolitics",
	                label="🎯 看板名稱",
	                info="輸入 PTT 看板名稱（不含 'bbs' 和 '版'）",
	                placeholder="例如: HatePolitics, Gossiping, Stock",
	            )

	            num_slider = gr.Slider(
	                minimum=10,
	                maximum=500,
	                step=10,
	                value=100,
	                label="📊 文章數量",
	                info="爬取的文章數量（10-500 篇）",
	            )

	            analyze_btn = gr.Button("🚀 開始分析", size="lg", variant="primary")

	            stats_output = gr.Textbox(
	                interactive=False,
	                lines=20,
	                label="📊 統計結果",
	            )

	        # Right column: word-cloud image and the downloadable title list.
	        with gr.Column(scale=2):
	            wordcloud_output = gr.Image(type="pil", label="☁️ 文字雲")

	            download_output = gr.File(label="💾 下載文章列表")

	    # Collapsible usage guide.
	    with gr.Accordion("📖 使用說明", open=False):
	        gr.Markdown(_USAGE_MD)

	    # One-click examples that fill both inputs.
	    gr.Examples(
	        label="💡 快速範例",
	        inputs=[board_input, num_slider],
	        examples=_EXAMPLES,
	    )

	    # Run the analysis when the button is clicked.
	    analyze_btn.click(
	        fn=analyze_ptt,
	        inputs=[board_input, num_slider],
	        outputs=[wordcloud_output, stats_output, download_output],
	    )


	# Start the server when this file is executed as a script.
	if __name__ == "__main__":
	    # Listen on all interfaces at port 7860, without a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    app.launch(**launch_options)