Spaces:

khjhs60199
/

pyCrawing

Sleeping

App Files Files Community

pyCrawing / utils.py

khjhs60199

kickStart

ec6ea02 verified 5 months ago

raw

history blame contribute delete

5.48 kB

	import logging
	import re
	from datetime import datetime
	from typing import List, Dict
	import html

	def setup_logging():
	    """Configure the root logger for the news application.

	    Records at INFO level and above are sent both to a stream handler and
	    to the UTF-8 encoded file ``news_app.log`` in the current working
	    directory.
	    """
	    record_layout = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

	    # Same creation order as before: stream handler first, then the file handler.
	    console_handler = logging.StreamHandler()
	    file_handler = logging.FileHandler('news_app.log', encoding='utf-8')

	    logging.basicConfig(
	        level=logging.INFO,
	        format=record_layout,
	        handlers=[console_handler, file_handler],
	    )

	# Badge markup keyed by the sentiment labels this module knows how to render.
	_SENTIMENT_BADGES = {
	    'positive': '<span class="sentiment-badge positive-badge">正面 😊</span>',
	    'negative': '<span class="sentiment-badge negative-badge">負面 😔</span>',
	    'neutral': '<span class="sentiment-badge neutral-badge">中性 😐</span>',
	}

	# Display names for the known news categories; anything else is shown as '財經'.
	_CATEGORY_NAMES = {'us_stock': '美股', 'tw_stock': '台股'}

	# Article bodies longer than this many characters are cut and suffixed with "...".
	_MAX_CONTENT_CHARS = 300

	# Only absolute http(s) links are rendered as clickable hrefs.
	_HTTP_URL_PREFIX_RE = re.compile(r'https?://', re.IGNORECASE)


	def _format_published_date(value) -> str:
	    """Return the display text for a news item's ``published_date`` value."""
	    if value is None:
	        return ""
	    if isinstance(value, datetime):
	        return value.strftime('%Y-%m-%d %H:%M')
	    if isinstance(value, str):
	        try:
	            # fromisoformat() on older Pythons rejects a trailing 'Z'.
	            parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
	        except ValueError:
	            # Not ISO-8601: show the text as supplied.
	            return value
	        return parsed.strftime('%Y-%m-%d %H:%M')
	    return str(value)


	def _safe_href(url) -> str:
	    """Return an attribute-safe href for ``url``, or '#' if it is not http(s)."""
	    candidate = str(url or '').strip()
	    if not _HTTP_URL_PREFIX_RE.match(candidate):
	        # Blocks javascript:, data: and similar schemes from becoming links.
	        return '#'
	    return html.escape(candidate, quote=True)


	def format_news_for_display(news_data: List[Dict]) -> str:
	    """Render news items as an HTML fragment of styled cards.

	    Args:
	        news_data: List of dicts. The keys read are ``title``, ``content``,
	            ``url``, ``source``, ``category``, ``sentiment`` and
	            ``published_date``; missing or ``None`` values are rendered
	            as empty text.

	    Returns:
	        An HTML string with one card per item, or a plain placeholder
	        message when ``news_data`` is empty.

	    All item values are HTML-escaped before being interpolated. Unknown
	    sentiment values are rendered as ``neutral`` for both the badge and the
	    CSS class, and links that are not absolute http(s) URLs are replaced
	    with ``#``.
	    """
	    if not news_data:
	        return "📰 暫無新聞資料"

	    fragments = ["""
	<div style="max-width: 100%; font-family: Arial, sans-serif;">
	"""]

	    for news in news_data:
	        # Resolve the sentiment once so the CSS class and the badge always agree.
	        sentiment = news.get('sentiment', 'neutral')
	        if not isinstance(sentiment, str) or sentiment not in _SENTIMENT_BADGES:
	            sentiment = 'neutral'
	        sentiment_class = f"news-{sentiment}"
	        sentiment_badge = _SENTIMENT_BADGES[sentiment]

	        formatted_date = html.escape(_format_published_date(news.get('published_date')))

	        title = html.escape(str(news.get('title') or ''))
	        source = html.escape(str(news.get('source') or ''))
	        href = _safe_href(news.get('url'))

	        category = news.get('category')
	        category_name = _CATEGORY_NAMES.get(category, '財經') if isinstance(category, str) else '財經'

	        # Truncate the raw text BEFORE escaping so an entity such as "&amp;"
	        # can never be cut in half.
	        raw_content = str(news.get('content') or '')
	        if len(raw_content) > _MAX_CONTENT_CHARS:
	            raw_content = raw_content[:_MAX_CONTENT_CHARS] + "..."
	        content = html.escape(raw_content)

	        # News card HTML
	        fragments.append(f"""
	<div class="news-card {sentiment_class}" style="margin-bottom: 20px; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
	<div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 10px;">
	<h3 style="margin: 0; font-size: 18px; color: #333; flex: 1; margin-right: 10px;">
	<a href="{href}" target="_blank" style="color: #333; text-decoration: none;">{title}</a>
	</h3>
	{sentiment_badge}
	</div>

	<div style="margin-bottom: 10px; color: #666; font-size: 14px;">
	<span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 12px; margin-right: 8px;">{category_name}</span>
	<span>{source}</span>
	<span style="margin-left: 8px;">📅 {formatted_date}</span>
	</div>

	<p style="margin: 10px 0; color: #555; line-height: 1.6;">{content}</p>

	<div style="margin-top: 10px; text-align: right;">
	<a href="{href}" target="_blank" style="color: #007bff; text-decoration: none; font-size: 14px;">閱讀全文 →</a>
	</div>
	</div>
	""")

	    fragments.append("</div>")

	    return "".join(fragments)

	def clean_text(text: str) -> str:
	    """Strip markup and noise characters from ``text``.

	    Steps, in order:
	      1. remove HTML tags;
	      2. decode HTML entities (so ``&amp;`` does not survive as the word
	         ``amp``);
	      3. drop every character that is not a CJK ideograph, a word
	         character, whitespace, or one of the allowed punctuation marks;
	      4. collapse whitespace runs to a single space and trim the ends.

	    Args:
	        text: Raw text, possibly containing HTML.

	    Returns:
	        The cleaned text, or ``""`` when ``text`` is empty or ``None``.
	    """
	    if not text:
	        return ""

	    # Remove HTML tags
	    text = re.sub(r'<[^>]+>', '', text)

	    # Decode entities after tag removal so escaped markup such as
	    # "&lt;b&gt;" is treated as text rather than as a tag.
	    text = html.unescape(text)

	    # Remove special characters
	    text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()（），。！？]', '', text)

	    # Collapse whitespace last: removing characters above can leave
	    # doubled spaces behind (e.g. "a - b" -> "a  b").
	    text = re.sub(r'\s+', ' ', text)

	    return text.strip()

	# One token per CJK ideograph, or per maximal run of non-space, non-CJK
	# characters. For text without CJK characters this yields the same tokens
	# as str.split().
	_SIMILARITY_TOKEN_RE = re.compile(
	    r'[\u3400-\u4dbf\u4e00-\u9fff]|[^\s\u3400-\u4dbf\u4e00-\u9fff]+'
	)


	def calculate_similarity(text1: str, text2: str) -> float:
	    """Return the Jaccard similarity of the token sets of two texts.

	    Tokens are compared case-insensitively. Text without CJK characters is
	    tokenised on whitespace; CJK ideographs are each treated as their own
	    token, because Chinese text has no spaces and would otherwise collapse
	    into a single token (making every comparison either 0.0 or 1.0).

	    Args:
	        text1: First text.
	        text2: Second text.

	    Returns:
	        ``len(A & B) / len(A | B)`` for the two token sets, in ``[0.0, 1.0]``.
	        ``0.0`` when either text is empty or contains no tokens.
	    """
	    if not text1 or not text2:
	        return 0.0

	    tokens1 = set(_SIMILARITY_TOKEN_RE.findall(text1.lower()))
	    tokens2 = set(_SIMILARITY_TOKEN_RE.findall(text2.lower()))

	    # Whitespace-only input is truthy but yields no tokens.
	    if not tokens1 or not tokens2:
	        return 0.0

	    # Both sets are non-empty here, so the union cannot be empty.
	    return len(tokens1 & tokens2) / len(tokens1 | tokens2)

	# Compiled once at import time. The alternatives inside the host group are
	# separated with a real regex "|" (an escaped "\|" would demand a literal
	# pipe character in the URL and reject every real address).
	_URL_PATTERN = re.compile(
	    r'^https?://'  # http:// or https://
	    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'  # domain...
	    r'localhost|'  # localhost...
	    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
	    r'(?::\d+)?'  # optional port
	    r'(?:/?|[/?]\S+)\Z',  # optional path/query; \Z rejects a trailing newline
	    re.IGNORECASE,
	)


	def validate_url(url: str) -> bool:
	    """Check whether ``url`` looks like a well-formed http(s) URL.

	    Accepted hosts are a dotted domain name (alphabetic TLD of 2-63
	    characters), ``localhost``, or a dotted-quad IPv4 address, optionally
	    followed by a port and a path/query containing no whitespace.

	    Args:
	        url: The candidate URL.

	    Returns:
	        ``True`` if ``url`` matches the pattern, ``False`` otherwise
	        (including when ``url`` is not a string).
	    """
	    if not isinstance(url, str):
	        return False

	    return _URL_PATTERN.match(url) is not None

	def format_duration(seconds: float) -> str:
	    """Format a duration in seconds as a short human-readable string.

	    The value is shown with one decimal place in seconds (秒), minutes
	    (分鐘) or hours (小時). The unit is chosen from the value as it will be
	    displayed, so a duration never renders as "60.0" of the smaller unit
	    (e.g. 59.96 s is shown as "1.0分鐘", not "60.0秒").

	    Args:
	        seconds: Duration in seconds.

	    Returns:
	        The formatted duration string.
	    """
	    for divisor, unit in ((1, "秒"), (60, "分鐘")):
	        amount = seconds / divisor
	        # Compare the rounded value: that is what the ".1f" format prints.
	        if round(amount, 1) < 60:
	            return f"{amount:.1f}{unit}"

	    hours = seconds / 3600
	    return f"{hours:.1f}小時"

	def truncate_text(text: str, max_length: int = 100) -> str:
	    """Shorten ``text`` to at most ``max_length`` characters plus an ellipsis.

	    Text that already fits is returned unchanged. Otherwise the text is cut
	    at ``max_length`` characters; if the cut falls in the middle of a word,
	    the partial word is dropped (when an earlier word exists), and "..." is
	    appended. The ellipsis is added on top of ``max_length``, so the result
	    may be up to three characters longer than ``max_length``.

	    Args:
	        text: The text to shorten.
	        max_length: Maximum number of characters kept from ``text``.

	    Returns:
	        The original or truncated text; ``""`` when ``text`` is empty or
	        ``None``.
	    """
	    if not text:
	        return ""

	    if len(text) <= max_length:
	        return text

	    clipped = text[:max_length]
	    following = text[max_length:max_length + 1]

	    # The cut is mid-word only when the characters on both sides of it are
	    # non-space; a cut that lands exactly on a word boundary keeps the
	    # complete last word.
	    cut_mid_word = not following.isspace() and not clipped[-1:].isspace()
	    if cut_mid_word:
	        pieces = clipped.rsplit(None, 1)
	        # Only drop the partial word when something remains in front of it,
	        # so text without an earlier word is not reduced to a bare "...".
	        if len(pieces) == 2:
	            clipped = pieces[0]

	    return clipped.rstrip() + "..."