khjhs60199 committed on
Commit
9bf2819
·
verified ·
1 Parent(s): d740988

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +111 -15
database.py CHANGED
@@ -9,7 +9,7 @@ from contextlib import contextmanager
9
  logger = logging.getLogger(__name__)
10
 
11
  class NewsDatabase:
12
- """新聞資料庫管理器"""
13
 
14
  def __init__(self, db_path: str = "news.db"):
15
  self.db_path = db_path
@@ -46,6 +46,8 @@ class NewsDatabase:
46
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
47
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
48
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")
 
 
49
 
50
  # 創建統計表
51
  cursor.execute("""
@@ -131,27 +133,51 @@ class NewsDatabase:
131
  logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
132
  return inserted_count, duplicate_count
133
 
134
- def get_recent_news(self, category: str = "all", limit: int = 50, days: int = 7) -> List[Dict]:
135
- """獲取最近的新聞"""
 
136
  try:
137
  with self._get_connection() as conn:
138
  cursor = conn.cursor()
139
 
140
  # 構建查詢條件
141
- where_clause = "WHERE published_date >= ?"
142
- params = [datetime.now() - timedelta(days=days)]
143
 
 
 
 
 
 
 
144
  if category != "all":
145
- where_clause += " AND category = ?"
146
  params.append(category)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  query = f"""
149
  SELECT * FROM news
150
  {where_clause}
151
- ORDER BY published_date DESC
152
- LIMIT ?
153
  """
154
- params.append(limit)
 
 
155
 
156
  cursor.execute(query, params)
157
  rows = cursor.fetchall()
@@ -162,9 +188,16 @@ class NewsDatabase:
162
  news_dict = dict(row)
163
  # 轉換日期格式
164
  if news_dict['published_date']:
165
- news_dict['published_date'] = datetime.fromisoformat(news_dict['published_date'])
 
 
 
 
 
 
166
  news_list.append(news_dict)
167
 
 
168
  return news_list
169
 
170
  except Exception as e:
@@ -202,8 +235,17 @@ class NewsDatabase:
202
  cursor.execute("SELECT MAX(created_date) as last_update FROM news")
203
  last_update = cursor.fetchone()['last_update']
204
 
 
 
 
 
 
 
 
 
205
  return {
206
  'total_news': total_news,
 
207
  'us_stock_count': category_stats.get('us_stock', 0),
208
  'tw_stock_count': category_stats.get('tw_stock', 0),
209
  'positive_count': sentiment_stats.get('positive', 0),
@@ -258,20 +300,36 @@ class NewsDatabase:
258
  except Exception as e:
259
  logger.error(f"記錄爬蟲統計錯誤: {e}")
260
 
261
- def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.8) -> bool:
262
- """檢查標題重複性"""
263
  try:
 
 
 
264
  with self._get_connection() as conn:
265
  cursor = conn.cursor()
266
 
267
- # 簡單的標題相似度檢查
 
 
 
 
 
 
 
 
268
  cursor.execute("""
269
  SELECT title FROM news
270
  WHERE created_date >= ?
271
- """, (datetime.now() - timedelta(days=1),))
 
 
272
 
273
  existing_titles = [row['title'] for row in cursor.fetchall()]
274
 
 
 
 
275
  # 計算相似度(簡化版)
276
  title_words = set(title.lower().split())
277
 
@@ -287,10 +345,48 @@ class NewsDatabase:
287
  similarity = len(intersection) / len(union) if union else 0
288
 
289
  if similarity > similarity_threshold:
 
 
 
290
  return True
291
 
292
  return False
293
 
294
  except Exception as e:
295
  logger.error(f"檢查標題重複性錯誤: {e}")
296
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  logger = logging.getLogger(__name__)
10
 
11
  class NewsDatabase:
12
+ """新聞資料庫管理器 - 增強版"""
13
 
14
  def __init__(self, db_path: str = "news.db"):
15
  self.db_path = db_path
 
46
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
47
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
48
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")
49
+ cursor.execute("CREATE INDEX IF NOT EXISTS idx_title ON news(title)")
50
+ cursor.execute("CREATE INDEX IF NOT EXISTS idx_content ON news(content)")
51
 
52
  # 創建統計表
53
  cursor.execute("""
 
133
  logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
134
  return inserted_count, duplicate_count
135
 
136
+ def get_recent_news(self, category: str = "all", days: int = 7,
137
+ keyword: str = "", sentiment_filter: str = "all") -> List[Dict]:
138
+ """獲取最近的新聞 - 增強版"""
139
  try:
140
  with self._get_connection() as conn:
141
  cursor = conn.cursor()
142
 
143
  # 構建查詢條件
144
+ where_conditions = []
145
+ params = []
146
 
147
+ # 時間條件
148
+ if days > 0:
149
+ where_conditions.append("published_date >= ?")
150
+ params.append(datetime.now() - timedelta(days=days))
151
+
152
+ # 分類條件
153
  if category != "all":
154
+ where_conditions.append("category = ?")
155
  params.append(category)
156
 
157
+ # 關鍵字搜尋
158
+ if keyword:
159
+ where_conditions.append("(title LIKE ? OR content LIKE ?)")
160
+ keyword_pattern = f"%{keyword}%"
161
+ params.extend([keyword_pattern, keyword_pattern])
162
+
163
+ # 情緒篩選
164
+ if sentiment_filter != "all":
165
+ where_conditions.append("sentiment = ?")
166
+ params.append(sentiment_filter)
167
+
168
+ # 組合查詢
169
+ where_clause = ""
170
+ if where_conditions:
171
+ where_clause = "WHERE " + " AND ".join(where_conditions)
172
+
173
  query = f"""
174
  SELECT * FROM news
175
  {where_clause}
176
+ ORDER BY published_date DESC
 
177
  """
178
+
179
+ logger.info(f"執行查詢: {query}")
180
+ logger.info(f"參數: {params}")
181
 
182
  cursor.execute(query, params)
183
  rows = cursor.fetchall()
 
188
  news_dict = dict(row)
189
  # 轉換日期格式
190
  if news_dict['published_date']:
191
+ try:
192
+ if isinstance(news_dict['published_date'], str):
193
+ news_dict['published_date'] = datetime.fromisoformat(news_dict['published_date'])
194
+ else:
195
+ news_dict['published_date'] = news_dict['published_date']
196
+ except:
197
+ news_dict['published_date'] = datetime.now()
198
  news_list.append(news_dict)
199
 
200
+ logger.info(f"找到 {len(news_list)} 篇新聞")
201
  return news_list
202
 
203
  except Exception as e:
 
235
  cursor.execute("SELECT MAX(created_date) as last_update FROM news")
236
  last_update = cursor.fetchone()['last_update']
237
 
238
+ # 近7天統計
239
+ cursor.execute("""
240
+ SELECT COUNT(*) as recent_count
241
+ FROM news
242
+ WHERE published_date >= ?
243
+ """, (datetime.now() - timedelta(days=7),))
244
+ recent_count = cursor.fetchone()['recent_count']
245
+
246
  return {
247
  'total_news': total_news,
248
+ 'recent_news': recent_count,
249
  'us_stock_count': category_stats.get('us_stock', 0),
250
  'tw_stock_count': category_stats.get('tw_stock', 0),
251
  'positive_count': sentiment_stats.get('positive', 0),
 
300
  except Exception as e:
301
  logger.error(f"記錄爬蟲統計錯誤: {e}")
302
 
303
+ def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.9) -> bool:
304
+ """檢查標題重複性 - 修正版"""
305
  try:
306
+ if not title:
307
+ return False
308
+
309
  with self._get_connection() as conn:
310
  cursor = conn.cursor()
311
 
312
+ # 先檢查完全相同的標題
313
+ cursor.execute("SELECT COUNT(*) as count FROM news WHERE title = ?", (title,))
314
+ exact_match = cursor.fetchone()['count']
315
+
316
+ if exact_match > 0:
317
+ logger.debug(f"發現完全相同的標題: {title}")
318
+ return True
319
+
320
+ # 檢查相似標題(近期的)
321
  cursor.execute("""
322
  SELECT title FROM news
323
  WHERE created_date >= ?
324
+ ORDER BY created_date DESC
325
+ LIMIT 100
326
+ """, (datetime.now() - timedelta(hours=6),)) # 只檢查6小時內的
327
 
328
  existing_titles = [row['title'] for row in cursor.fetchall()]
329
 
330
+ if not existing_titles:
331
+ return False
332
+
333
  # 計算相似度(簡化版)
334
  title_words = set(title.lower().split())
335
 
 
345
  similarity = len(intersection) / len(union) if union else 0
346
 
347
  if similarity > similarity_threshold:
348
+ logger.debug(f"發現相似標題 (相似度: {similarity:.2f})")
349
+ logger.debug(f"新標題: {title}")
350
+ logger.debug(f"既有標題: {existing_title}")
351
  return True
352
 
353
  return False
354
 
355
  except Exception as e:
356
  logger.error(f"檢查標題重複性錯誤: {e}")
357
+ return False
358
+
359
def get_keywords_stats(self, days: int = 7) -> List[Dict]:
    """Return the 20 most frequent tokens found in recent news.

    Titles and contents of rows whose ``published_date`` is within the
    last ``days`` days are lower-cased, concatenated and split on
    whitespace; tokens longer than one character that are not stop
    words are counted.

    Args:
        days: Size of the look-back window, in days.

    Returns:
        Up to 20 dicts of the form ``{'keyword': str, 'count': int}``,
        ordered by descending count (ties keep first-seen order).
        An empty list is returned when nothing matches or when any
        error occurs (the error is logged).
    """
    # Local import so this method does not depend on a file-level import.
    from collections import Counter

    # NOTE: every stop word here is a single character, so the
    # ``len(word) > 1`` check below already excludes them; the set is
    # kept so that longer stop words can be added later.
    common_words = {'的', '了', '在', '是', '有', '和', '與', '為', '一', '不',
                    '上', '下', '中', '也', '會', '將', '及', '或', '等'}

    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute("""
                SELECT title, content
                FROM news
                WHERE published_date >= ?
            """, (datetime.now() - timedelta(days=days),))

            rows = cursor.fetchall()

            # Simple keyword extraction (whitespace tokenisation only;
            # unsegmented CJK text is counted as whole runs).
            keyword_count = Counter()

            for row in rows:
                # title/content may be NULL in the database; treat them as
                # empty strings instead of letting one bad row raise a
                # TypeError that would discard the whole result.
                title = row['title'] or ''
                content = row['content'] or ''
                text = (title + ' ' + content).lower()

                keyword_count.update(
                    word for word in text.split()
                    if len(word) > 1 and word not in common_words
                )

            # Top 20 keywords, highest count first.
            return [
                {'keyword': keyword, 'count': count}
                for keyword, count in keyword_count.most_common(20)
            ]

    except Exception as e:
        logger.error(f"獲取關鍵字統計錯誤: {e}")
        return []