Spaces:
Sleeping
Sleeping
Update database.py
Browse files- database.py +111 -15
database.py
CHANGED
|
@@ -9,7 +9,7 @@ from contextlib import contextmanager
|
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
class NewsDatabase:
|
| 12 |
-
"""新聞資料庫管理器"""
|
| 13 |
|
| 14 |
def __init__(self, db_path: str = "news.db"):
|
| 15 |
self.db_path = db_path
|
|
@@ -46,6 +46,8 @@ class NewsDatabase:
|
|
| 46 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
|
| 47 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
|
| 48 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# 創建統計表
|
| 51 |
cursor.execute("""
|
|
@@ -131,27 +133,51 @@ class NewsDatabase:
|
|
| 131 |
logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
|
| 132 |
return inserted_count, duplicate_count
|
| 133 |
|
| 134 |
-
def get_recent_news(self, category: str = "all",
|
| 135 |
-
|
|
|
|
| 136 |
try:
|
| 137 |
with self._get_connection() as conn:
|
| 138 |
cursor = conn.cursor()
|
| 139 |
|
| 140 |
# 構建查詢條件
|
| 141 |
-
|
| 142 |
-
params = [
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
if category != "all":
|
| 145 |
-
|
| 146 |
params.append(category)
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
query = f"""
|
| 149 |
SELECT * FROM news
|
| 150 |
{where_clause}
|
| 151 |
-
ORDER BY published_date DESC
|
| 152 |
-
LIMIT ?
|
| 153 |
"""
|
| 154 |
-
|
|
|
|
|
|
|
| 155 |
|
| 156 |
cursor.execute(query, params)
|
| 157 |
rows = cursor.fetchall()
|
|
@@ -162,9 +188,16 @@ class NewsDatabase:
|
|
| 162 |
news_dict = dict(row)
|
| 163 |
# 轉換日期格式
|
| 164 |
if news_dict['published_date']:
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
news_list.append(news_dict)
|
| 167 |
|
|
|
|
| 168 |
return news_list
|
| 169 |
|
| 170 |
except Exception as e:
|
|
@@ -202,8 +235,17 @@ class NewsDatabase:
|
|
| 202 |
cursor.execute("SELECT MAX(created_date) as last_update FROM news")
|
| 203 |
last_update = cursor.fetchone()['last_update']
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
return {
|
| 206 |
'total_news': total_news,
|
|
|
|
| 207 |
'us_stock_count': category_stats.get('us_stock', 0),
|
| 208 |
'tw_stock_count': category_stats.get('tw_stock', 0),
|
| 209 |
'positive_count': sentiment_stats.get('positive', 0),
|
|
@@ -258,20 +300,36 @@ class NewsDatabase:
|
|
| 258 |
except Exception as e:
|
| 259 |
logger.error(f"記錄爬蟲統計錯誤: {e}")
|
| 260 |
|
| 261 |
-
def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.
|
| 262 |
-
"""檢查標題重複性"""
|
| 263 |
try:
|
|
|
|
|
|
|
|
|
|
| 264 |
with self._get_connection() as conn:
|
| 265 |
cursor = conn.cursor()
|
| 266 |
|
| 267 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
cursor.execute("""
|
| 269 |
SELECT title FROM news
|
| 270 |
WHERE created_date >= ?
|
| 271 |
-
|
|
|
|
|
|
|
| 272 |
|
| 273 |
existing_titles = [row['title'] for row in cursor.fetchall()]
|
| 274 |
|
|
|
|
|
|
|
|
|
|
| 275 |
# 計算相似度(簡化版)
|
| 276 |
title_words = set(title.lower().split())
|
| 277 |
|
|
@@ -287,10 +345,48 @@ class NewsDatabase:
|
|
| 287 |
similarity = len(intersection) / len(union) if union else 0
|
| 288 |
|
| 289 |
if similarity > similarity_threshold:
|
|
|
|
|
|
|
|
|
|
| 290 |
return True
|
| 291 |
|
| 292 |
return False
|
| 293 |
|
| 294 |
except Exception as e:
|
| 295 |
logger.error(f"檢查標題重複性錯誤: {e}")
|
| 296 |
-
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
class NewsDatabase:
|
| 12 |
+
"""新聞資料庫管理器 - 增強版"""
|
| 13 |
|
| 14 |
def __init__(self, db_path: str = "news.db"):
|
| 15 |
self.db_path = db_path
|
|
|
|
| 46 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
|
| 47 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
|
| 48 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")
|
| 49 |
+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_title ON news(title)")
|
| 50 |
+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_content ON news(content)")
|
| 51 |
|
| 52 |
# 創建統計表
|
| 53 |
cursor.execute("""
|
|
|
|
| 133 |
logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
|
| 134 |
return inserted_count, duplicate_count
|
| 135 |
|
| 136 |
+
def get_recent_news(self, category: str = "all", days: int = 7,
                    keyword: str = "", sentiment_filter: str = "all",
                    limit: int = 0) -> List[Dict]:
    """Fetch recent news rows, newest first.

    Args:
        category: Category filter; "all" disables the filter.
        days: Keep rows whose published_date falls within the last
            ``days`` days; 0 or negative disables the time filter.
        keyword: Substring matched (SQL LIKE) against title and content.
        sentiment_filter: Sentiment value filter; "all" disables it.
        limit: Maximum number of rows to return; 0 means unlimited
            (the previous behaviour, kept for backward compatibility).

    Returns:
        List of row dicts with ``published_date`` normalized to a
        ``datetime``; an empty list on any database error.
    """
    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()

            # Build the WHERE clause only from the active filters.
            where_conditions = []
            params: list = []

            # 時間條件
            if days > 0:
                where_conditions.append("published_date >= ?")
                params.append(datetime.now() - timedelta(days=days))

            # 分類條件
            if category != "all":
                where_conditions.append("category = ?")
                params.append(category)

            # 關鍵字搜尋 — parameterized LIKE keeps this safe from
            # SQL injection.
            if keyword:
                where_conditions.append("(title LIKE ? OR content LIKE ?)")
                keyword_pattern = f"%{keyword}%"
                params.extend([keyword_pattern, keyword_pattern])

            # 情緒篩選
            if sentiment_filter != "all":
                where_conditions.append("sentiment = ?")
                params.append(sentiment_filter)

            # 組合查詢
            where_clause = ""
            if where_conditions:
                where_clause = "WHERE " + " AND ".join(where_conditions)

            query = f"""
                SELECT * FROM news
                {where_clause}
                ORDER BY published_date DESC
            """
            # Previously the query was unbounded; cap it when the caller
            # asks, still parameterized.
            if limit > 0:
                query += " LIMIT ?"
                params.append(limit)

            # Lazy %-style args: no formatting cost when INFO is disabled.
            logger.info("執行查詢: %s", query)
            logger.info("參數: %s", params)

            cursor.execute(query, params)
            rows = cursor.fetchall()

            # NOTE(review): reconstructed from a truncated diff — the
            # original list-building preamble was not visible; verify
            # against the repository copy.
            news_list = []
            for row in rows:
                news_dict = dict(row)
                # 轉換日期格式 — normalize to datetime; non-string values
                # (already datetime) pass through unchanged.
                if news_dict['published_date']:
                    if isinstance(news_dict['published_date'], str):
                        try:
                            news_dict['published_date'] = datetime.fromisoformat(
                                news_dict['published_date'])
                        except ValueError:
                            # Unparseable timestamp: fall back to "now"
                            # rather than dropping the row (previous
                            # behaviour, now with a narrow except instead
                            # of a bare one).
                            news_dict['published_date'] = datetime.now()
                news_list.append(news_dict)

            logger.info("找到 %d 篇新聞", len(news_list))
            return news_list

    except Exception as e:
        # Best-effort read path: log and return an empty result.
        logger.error(f"獲取新聞錯誤: {e}")
        return []
|
|
|
|
| 235 |
cursor.execute("SELECT MAX(created_date) as last_update FROM news")
|
| 236 |
last_update = cursor.fetchone()['last_update']
|
| 237 |
|
| 238 |
+
# 近7天統計
|
| 239 |
+
cursor.execute("""
|
| 240 |
+
SELECT COUNT(*) as recent_count
|
| 241 |
+
FROM news
|
| 242 |
+
WHERE published_date >= ?
|
| 243 |
+
""", (datetime.now() - timedelta(days=7),))
|
| 244 |
+
recent_count = cursor.fetchone()['recent_count']
|
| 245 |
+
|
| 246 |
return {
|
| 247 |
'total_news': total_news,
|
| 248 |
+
'recent_news': recent_count,
|
| 249 |
'us_stock_count': category_stats.get('us_stock', 0),
|
| 250 |
'tw_stock_count': category_stats.get('tw_stock', 0),
|
| 251 |
'positive_count': sentiment_stats.get('positive', 0),
|
|
|
|
| 300 |
except Exception as e:
|
| 301 |
logger.error(f"記錄爬蟲統計錯誤: {e}")
|
| 302 |
|
| 303 |
+
def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.9) -> bool:
    """Return True when ``title`` duplicates a stored headline.

    Two-stage check: an exact-match lookup over the whole table, then a
    word-level Jaccard similarity against at most 100 titles created in
    the last 6 hours. Empty titles and database errors are treated as
    "not a duplicate" so the caller's insert path keeps working.

    Args:
        title: Candidate headline.
        similarity_threshold: Jaccard similarity above which two titles
            count as duplicates.
    """
    try:
        if not title:
            return False

        with self._get_connection() as conn:
            cursor = conn.cursor()

            # 先檢查完全相同的標題 — cheap exact-match fast path.
            cursor.execute("SELECT COUNT(*) as count FROM news WHERE title = ?", (title,))
            exact_match = cursor.fetchone()['count']

            if exact_match > 0:
                logger.debug("發現完全相同的標題: %s", title)
                return True

            # 檢查相似標題(近期的)— bound the fuzzy comparison to
            # recent rows so cost stays constant as the table grows.
            cursor.execute("""
                SELECT title FROM news
                WHERE created_date >= ?
                ORDER BY created_date DESC
                LIMIT 100
            """, (datetime.now() - timedelta(hours=6),))  # 只檢查6小時內的

            existing_titles = [row['title'] for row in cursor.fetchall()]

            if not existing_titles:
                return False

            # 計算相似度(簡化版)— Jaccard over lower-cased word sets.
            # NOTE(review): loop body reconstructed from a truncated
            # diff; verify against the repository copy.
            title_words = set(title.lower().split())
            for existing_title in existing_titles:
                existing_words = set(existing_title.lower().split())
                intersection = title_words & existing_words
                union = title_words | existing_words
                similarity = len(intersection) / len(union) if union else 0

                if similarity > similarity_threshold:
                    # Lazy %-style args: skip formatting unless DEBUG is on.
                    logger.debug("發現相似標題 (相似度: %.2f)", similarity)
                    logger.debug("新標題: %s", title)
                    logger.debug("既有標題: %s", existing_title)
                    return True

            return False

    except Exception as e:
        # Fail open: a broken duplicate check must not block inserts.
        logger.error(f"檢查標題重複性錯誤: {e}")
        return False
|
| 358 |
+
|
| 359 |
+
def get_keywords_stats(self, days: int = 7) -> List[Dict]:
    """Return the 20 most frequent words in recent titles and contents.

    Args:
        days: Look-back window over ``published_date``.

    Returns:
        Up to 20 dicts of the form ``{'keyword': str, 'count': int}``,
        most frequent first; an empty list on any database error.
    """
    from collections import Counter

    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute("""
                SELECT title, content
                FROM news
                WHERE published_date >= ?
            """, (datetime.now() - timedelta(days=days),))

            rows = cursor.fetchall()

        # 簡單的關鍵字提取(可以後續改進)— whitespace tokenization
        # with a Chinese stop-word list.
        common_words = {'的', '了', '在', '是', '有', '和', '與', '為', '一', '不', '上', '下', '中', '也', '會', '將', '及', '或', '等'}
        keyword_count = Counter()

        for row in rows:
            # Coerce NULL columns to "" — previously a NULL content
            # raised TypeError and the broad except discarded ALL stats.
            text = ((row['title'] or '') + ' ' + (row['content'] or '')).lower()
            for word in text.split():
                if len(word) > 1 and word not in common_words:
                    keyword_count[word] += 1

        # 返回前20個關鍵字
        return [{'keyword': k, 'count': v} for k, v in keyword_count.most_common(20)]

    except Exception as e:
        logger.error(f"獲取關鍵字統計錯誤: {e}")
        return []
|