Spaces:
Sleeping
Sleeping
Update database.py
Browse files- database.py +111 -15
database.py
CHANGED
|
@@ -9,7 +9,7 @@ from contextlib import contextmanager
|
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
class NewsDatabase:
|
| 12 |
-
"""新聞資料庫管理器"""
|
| 13 |
|
| 14 |
def __init__(self, db_path: str = "news.db"):
|
| 15 |
self.db_path = db_path
|
|
@@ -46,6 +46,8 @@ class NewsDatabase:
|
|
| 46 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
|
| 47 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
|
| 48 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# 創建統計表
|
| 51 |
cursor.execute("""
|
|
@@ -131,27 +133,51 @@ class NewsDatabase:
|
|
| 131 |
logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
|
| 132 |
return inserted_count, duplicate_count
|
| 133 |
|
| 134 |
-
def get_recent_news(self, category: str = "all",
|
| 135 |
-
|
|
|
|
| 136 |
try:
|
| 137 |
with self._get_connection() as conn:
|
| 138 |
cursor = conn.cursor()
|
| 139 |
|
| 140 |
# 構建查詢條件
|
| 141 |
-
|
| 142 |
-
params = [
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
if category != "all":
|
| 145 |
-
|
| 146 |
params.append(category)
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
query = f"""
|
| 149 |
SELECT * FROM news
|
| 150 |
{where_clause}
|
| 151 |
-
ORDER BY published_date DESC
|
| 152 |
-
LIMIT ?
|
| 153 |
"""
|
| 154 |
-
|
|
|
|
|
|
|
| 155 |
|
| 156 |
cursor.execute(query, params)
|
| 157 |
rows = cursor.fetchall()
|
|
@@ -162,9 +188,16 @@ class NewsDatabase:
|
|
| 162 |
news_dict = dict(row)
|
| 163 |
# 轉換日期格式
|
| 164 |
if news_dict['published_date']:
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
news_list.append(news_dict)
|
| 167 |
|
|
|
|
| 168 |
return news_list
|
| 169 |
|
| 170 |
except Exception as e:
|
|
@@ -202,8 +235,17 @@ class NewsDatabase:
|
|
| 202 |
cursor.execute("SELECT MAX(created_date) as last_update FROM news")
|
| 203 |
last_update = cursor.fetchone()['last_update']
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
return {
|
| 206 |
'total_news': total_news,
|
|
|
|
| 207 |
'us_stock_count': category_stats.get('us_stock', 0),
|
| 208 |
'tw_stock_count': category_stats.get('tw_stock', 0),
|
| 209 |
'positive_count': sentiment_stats.get('positive', 0),
|
|
@@ -258,20 +300,36 @@ class NewsDatabase:
|
|
| 258 |
except Exception as e:
|
| 259 |
logger.error(f"記錄爬蟲統計錯誤: {e}")
|
| 260 |
|
| 261 |
-
def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.
|
| 262 |
-
"""檢查標題重複性"""
|
| 263 |
try:
|
|
|
|
|
|
|
|
|
|
| 264 |
with self._get_connection() as conn:
|
| 265 |
cursor = conn.cursor()
|
| 266 |
|
| 267 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
cursor.execute("""
|
| 269 |
SELECT title FROM news
|
| 270 |
WHERE created_date >= ?
|
| 271 |
-
|
|
|
|
|
|
|
| 272 |
|
| 273 |
existing_titles = [row['title'] for row in cursor.fetchall()]
|
| 274 |
|
|
|
|
|
|
|
|
|
|
| 275 |
# 計算相似度(簡化版)
|
| 276 |
title_words = set(title.lower().split())
|
| 277 |
|
|
@@ -287,10 +345,48 @@ class NewsDatabase:
|
|
| 287 |
similarity = len(intersection) / len(union) if union else 0
|
| 288 |
|
| 289 |
if similarity > similarity_threshold:
|
|
|
|
|
|
|
|
|
|
| 290 |
return True
|
| 291 |
|
| 292 |
return False
|
| 293 |
|
| 294 |
except Exception as e:
|
| 295 |
logger.error(f"檢查標題重複性錯誤: {e}")
|
| 296 |
-
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
class NewsDatabase:
|
| 12 |
+
"""新聞資料庫管理器 - 增強版"""
|
| 13 |
|
| 14 |
def __init__(self, db_path: str = "news.db"):
|
| 15 |
self.db_path = db_path
|
|
|
|
| 46 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_category ON news(category)")
|
| 47 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_published_date ON news(published_date)")
|
| 48 |
cursor.execute("CREATE INDEX IF NOT EXISTS idx_sentiment ON news(sentiment)")
|
| 49 |
+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_title ON news(title)")
|
| 50 |
+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_content ON news(content)")
|
| 51 |
|
| 52 |
# 創建統計表
|
| 53 |
cursor.execute("""
|
|
|
|
| 133 |
logger.info(f"插入新聞完成 - 新增: {inserted_count}, 重複: {duplicate_count}")
|
| 134 |
return inserted_count, duplicate_count
|
| 135 |
|
| 136 |
+
def get_recent_news(self, category: str = "all", days: int = 7,
                    keyword: str = "", sentiment_filter: str = "all",
                    limit: int = 0) -> List[Dict]:
    """Fetch recent news rows, newest first.

    Args:
        category: Category filter; "all" disables the filter.
        days: Keep rows whose published_date falls within the last
            ``days`` days; 0 or negative disables the time filter.
        keyword: Substring matched (SQL LIKE) against title and content.
        sentiment_filter: Sentiment value filter; "all" disables it.
        limit: Maximum number of rows to return; 0 means unlimited
            (the previous behaviour, kept for backward compatibility).

    Returns:
        List of row dicts with ``published_date`` normalized to a
        ``datetime``; an empty list on any database error.
    """
    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()

            # Build the WHERE clause only from the active filters.
            where_conditions = []
            params: list = []

            # 時間條件
            if days > 0:
                where_conditions.append("published_date >= ?")
                params.append(datetime.now() - timedelta(days=days))

            # 分類條件
            if category != "all":
                where_conditions.append("category = ?")
                params.append(category)

            # 關鍵字搜尋 — parameterized LIKE keeps this safe from
            # SQL injection.
            if keyword:
                where_conditions.append("(title LIKE ? OR content LIKE ?)")
                keyword_pattern = f"%{keyword}%"
                params.extend([keyword_pattern, keyword_pattern])

            # 情緒篩選
            if sentiment_filter != "all":
                where_conditions.append("sentiment = ?")
                params.append(sentiment_filter)

            # 組合查詢
            where_clause = ""
            if where_conditions:
                where_clause = "WHERE " + " AND ".join(where_conditions)

            query = f"""
                SELECT * FROM news
                {where_clause}
                ORDER BY published_date DESC
            """
            # Previously the query was unbounded; cap it when the caller
            # asks, still parameterized.
            if limit > 0:
                query += " LIMIT ?"
                params.append(limit)

            # Lazy %-style args: no formatting cost when INFO is disabled.
            logger.info("執行查詢: %s", query)
            logger.info("參數: %s", params)

            cursor.execute(query, params)
            rows = cursor.fetchall()

            # NOTE(review): reconstructed from a truncated diff — the
            # original list-building preamble was not visible; verify
            # against the repository copy.
            news_list = []
            for row in rows:
                news_dict = dict(row)
                # 轉換日期格式 — normalize to datetime; non-string values
                # (already datetime) pass through unchanged.
                if news_dict['published_date']:
                    if isinstance(news_dict['published_date'], str):
                        try:
                            news_dict['published_date'] = datetime.fromisoformat(
                                news_dict['published_date'])
                        except ValueError:
                            # Unparseable timestamp: fall back to "now"
                            # rather than dropping the row (previous
                            # behaviour, now with a narrow except instead
                            # of a bare one).
                            news_dict['published_date'] = datetime.now()
                news_list.append(news_dict)

            logger.info("找到 %d 篇新聞", len(news_list))
            return news_list

    except Exception as e:
        # Best-effort read path: log and return an empty result.
        logger.error(f"獲取新聞錯誤: {e}")
        return []
|
|
|
|
| 235 |
cursor.execute("SELECT MAX(created_date) as last_update FROM news")
|
| 236 |
last_update = cursor.fetchone()['last_update']
|
| 237 |
|
| 238 |
+
# 近7天統計
|
| 239 |
+
cursor.execute("""
|
| 240 |
+
SELECT COUNT(*) as recent_count
|
| 241 |
+
FROM news
|
| 242 |
+
WHERE published_date >= ?
|
| 243 |
+
""", (datetime.now() - timedelta(days=7),))
|
| 244 |
+
recent_count = cursor.fetchone()['recent_count']
|
| 245 |
+
|
| 246 |
return {
|
| 247 |
'total_news': total_news,
|
| 248 |
+
'recent_news': recent_count,
|
| 249 |
'us_stock_count': category_stats.get('us_stock', 0),
|
| 250 |
'tw_stock_count': category_stats.get('tw_stock', 0),
|
| 251 |
'positive_count': sentiment_stats.get('positive', 0),
|
|
|
|
| 300 |
except Exception as e:
|
| 301 |
logger.error(f"記錄爬蟲統計錯誤: {e}")
|
| 302 |
|
| 303 |
+
def check_duplicate_by_title(self, title: str, similarity_threshold: float = 0.9) -> bool:
    """Return True when ``title`` duplicates a stored headline.

    Two-stage check: an exact-match lookup over the whole table, then a
    word-level Jaccard similarity against at most 100 titles created in
    the last 6 hours. Empty titles and database errors are treated as
    "not a duplicate" so the caller's insert path keeps working.

    Args:
        title: Candidate headline.
        similarity_threshold: Jaccard similarity above which two titles
            count as duplicates.
    """
    try:
        if not title:
            return False

        with self._get_connection() as conn:
            cursor = conn.cursor()

            # 先檢查完全相同的標題 — cheap exact-match fast path.
            cursor.execute("SELECT COUNT(*) as count FROM news WHERE title = ?", (title,))
            exact_match = cursor.fetchone()['count']

            if exact_match > 0:
                logger.debug("發現完全相同的標題: %s", title)
                return True

            # 檢查相似標題(近期的)— bound the fuzzy comparison to
            # recent rows so cost stays constant as the table grows.
            cursor.execute("""
                SELECT title FROM news
                WHERE created_date >= ?
                ORDER BY created_date DESC
                LIMIT 100
            """, (datetime.now() - timedelta(hours=6),))  # 只檢查6小時內的

            existing_titles = [row['title'] for row in cursor.fetchall()]

            if not existing_titles:
                return False

            # 計算相似度(簡化版)— Jaccard over lower-cased word sets.
            # NOTE(review): loop body reconstructed from a truncated
            # diff; verify against the repository copy.
            title_words = set(title.lower().split())
            for existing_title in existing_titles:
                existing_words = set(existing_title.lower().split())
                intersection = title_words & existing_words
                union = title_words | existing_words
                similarity = len(intersection) / len(union) if union else 0

                if similarity > similarity_threshold:
                    # Lazy %-style args: skip formatting unless DEBUG is on.
                    logger.debug("發現相似標題 (相似度: %.2f)", similarity)
                    logger.debug("新標題: %s", title)
                    logger.debug("既有標題: %s", existing_title)
                    return True

            return False

    except Exception as e:
        # Fail open: a broken duplicate check must not block inserts.
        logger.error(f"檢查標題重複性錯誤: {e}")
        return False
|
| 358 |
+
|
| 359 |
+
def get_keywords_stats(self, days: int = 7) -> List[Dict]:
    """Return the 20 most frequent words in recent titles and contents.

    Args:
        days: Look-back window over ``published_date``.

    Returns:
        Up to 20 dicts of the form ``{'keyword': str, 'count': int}``,
        most frequent first; an empty list on any database error.
    """
    from collections import Counter

    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute("""
                SELECT title, content
                FROM news
                WHERE published_date >= ?
            """, (datetime.now() - timedelta(days=days),))

            rows = cursor.fetchall()

        # 簡單的關鍵字提取(可以後續改進)— whitespace tokenization
        # with a Chinese stop-word list.
        common_words = {'的', '了', '在', '是', '有', '和', '與', '為', '一', '不', '上', '下', '中', '也', '會', '將', '及', '或', '等'}
        keyword_count = Counter()

        for row in rows:
            # Coerce NULL columns to "" — previously a NULL content
            # raised TypeError and the broad except discarded ALL stats.
            text = ((row['title'] or '') + ' ' + (row['content'] or '')).lower()
            for word in text.split():
                if len(word) > 1 and word not in common_words:
                    keyword_count[word] += 1

        # 返回前20個關鍵字
        return [{'keyword': k, 'count': v} for k, v in keyword_count.most_common(20)]

    except Exception as e:
        logger.error(f"獲取關鍵字統計錯誤: {e}")
        return []
|