Update news_collector.py
Browse files- news_collector.py +62 -42
news_collector.py
CHANGED
|
@@ -1,82 +1,102 @@
|
|
| 1 |
import requests
|
| 2 |
import feedparser
|
| 3 |
import re
|
|
|
|
| 4 |
|
| 5 |
-
# config.pyの代わりに直接
|
|
|
|
| 6 |
RSS_URLS = {
|
| 7 |
-
"NHK": "
|
| 8 |
-
"
|
|
|
|
|
|
|
| 9 |
}
|
| 10 |
|
| 11 |
def clean_html(raw_html):
|
|
|
|
|
|
|
|
|
|
| 12 |
cleanr = re.compile('<.*?>')
|
| 13 |
-
cleantext = re.sub(cleanr, '', raw_html)
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def get_hatena_bookmark_counts(urls):
|
| 17 |
-
"""
|
| 18 |
-
Fetch bookmark counts for a list of URLs using Hatena Bookmark API.
|
| 19 |
-
API Endpoint: https://bookmark.hatenaapis.com/count/entries
|
| 20 |
-
"""
|
| 21 |
if not urls:
|
| 22 |
return {}
|
| 23 |
|
| 24 |
api_url = "https://bookmark.hatenaapis.com/count/entries"
|
| 25 |
counts = {}
|
| 26 |
|
| 27 |
-
#
|
| 28 |
batch_size = 50
|
| 29 |
for i in range(0, len(urls), batch_size):
|
| 30 |
batch_urls = urls[i:i + batch_size]
|
| 31 |
params = [('url', url) for url in batch_urls]
|
| 32 |
try:
|
| 33 |
-
|
|
|
|
| 34 |
if response.status_code == 200:
|
| 35 |
counts.update(response.json())
|
| 36 |
-
else:
|
| 37 |
-
print(f"Warning: Hatena API returned status {response.status_code}")
|
| 38 |
except Exception as e:
|
| 39 |
-
print(f"
|
| 40 |
|
| 41 |
return counts
|
| 42 |
|
| 43 |
def fetch_rss_news():
|
|
|
|
| 44 |
articles = []
|
| 45 |
seen_titles = set()
|
| 46 |
|
| 47 |
-
# 1. Fetch from RSS
|
| 48 |
for source_name, url in RSS_URLS.items():
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
if
|
|
|
|
| 54 |
continue
|
| 55 |
-
seen_titles.add(title)
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
if articles:
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
articles.sort(key=lambda x: x[
|
| 81 |
|
| 82 |
return articles
|
|
|
|
| 1 |
import requests
|
| 2 |
import feedparser
|
| 3 |
import re
|
| 4 |
+
import time
|
| 5 |
|
| 6 |
+
# ニュース取得元(config.pyの代わりにここに直接記述)
|
| 7 |
+
# URLが古くなったりアクセス拒否されたりしにくい大手サイトを選定
|
| 8 |
RSS_URLS = {
|
| 9 |
+
"NHKビジネス": "https://www.nhk.or.jp/rss/news/cat5.xml",
|
| 10 |
+
"日経(主要)": "https://www.nikkei.com/news/category/main/rss",
|
| 11 |
+
"ITmedia(経済)": "https://rss.itmedia.co.jp/rss/2.0/news_itmarket.xml",
|
| 12 |
+
"はてなブックマーク(経済)": "https://b.hatena.ne.jp/hotentry/economics.rss"
|
| 13 |
}
|
| 14 |
|
| 15 |
def clean_html(raw_html):
    """Strip HTML tags from *raw_html* and tidy the remaining text.

    Args:
        raw_html: An HTML fragment (any value is coerced via str()).
            Falsy input (None, "") yields "".

    Returns:
        Plain text with tags removed, all whitespace runs collapsed to
        single spaces, truncated to at most 200 characters.
    """
    if not raw_html:
        return ""
    # Drop anything that looks like a tag (non-greedy, so the text
    # between tags survives).
    text = re.sub(r'<.*?>', '', str(raw_html))
    # Collapse every whitespace run (\r, \n, \t, doubled spaces) to one
    # space. The previous replace('\n', ' ') left stray '\r' characters
    # and double spaces behind, defeating its own cleanup intent.
    text = ' '.join(text.split())
    return text[:200]  # cap overly long summaries
|
| 24 |
|
| 25 |
def get_hatena_bookmark_counts(urls):
    """Fetch Hatena Bookmark counts for a list of URLs.

    Uses the Hatena Bookmark count API
    (https://bookmark.hatenaapis.com/count/entries), which takes
    repeated 'url' query parameters, batched 50 at a time.

    Args:
        urls: List of article URLs. An empty list short-circuits to {}.

    Returns:
        dict mapping URL -> bookmark count. URLs from failed batches are
        simply absent, so callers should read with counts.get(url, 0).
    """
    if not urls:
        return {}

    api_url = "https://bookmark.hatenaapis.com/count/entries"
    counts = {}

    # Process 50 URLs per request (API batch limit).
    batch_size = 50
    for i in range(0, len(urls), batch_size):
        batch_urls = urls[i:i + batch_size]
        params = [('url', url) for url in batch_urls]
        try:
            # Short timeout so one slow batch cannot stall the whole run.
            response = requests.get(api_url, params=params, timeout=5)
            if response.status_code == 200:
                counts.update(response.json())
            else:
                # Surface non-200 responses instead of dropping the batch
                # silently — this warning existed before and was lost.
                print(f"Warning: Hatena API returned status {response.status_code}")
        except Exception as e:
            print(f"Warning: Hatena API Error: {e}")

    return counts
|
| 47 |
|
| 48 |
def fetch_rss_news():
    """Fetch and rank news articles from every feed in RSS_URLS.

    Returns:
        list[dict]: Articles with keys "title", "link", "summary",
        "source", and "attention_score" (Hatena bookmark count),
        sorted by attention_score descending. Failures on individual
        feeds or on the bookmark API degrade gracefully: the failing
        source is skipped (with a printed message) rather than raising.
    """
    articles = []
    seen_titles = set()  # dedupe titles across ALL sources

    for source_name, url in RSS_URLS.items():
        try:
            # Fetch via requests with a UA header: feedparser can be
            # unreliable when handed a URL directly, and some sites
            # reject requests that carry no User-Agent.
            headers = {'User-Agent': 'Mozilla/5.0'}
            resp = requests.get(url, headers=headers, timeout=10)
            # Fail fast on HTTP 4xx/5xx; otherwise an error page would
            # be parsed below and misreported as an empty feed.
            resp.raise_for_status()
            feed = feedparser.parse(resp.content)

            if not feed.entries:
                print(f"No entries found for {source_name}")
                continue

            for entry in feed.entries:
                title = entry.get('title', 'No Title')

                # Skip duplicates and entries with no usable title.
                if title in seen_titles or title == 'No Title':
                    continue
                seen_titles.add(title)

                link = entry.get('link', '')
                # Prefer 'summary', then 'description', then a placeholder.
                summary = clean_html(entry.get('summary', entry.get('description', '内容なし')))

                articles.append({
                    "title": title,
                    "link": link,
                    "summary": summary,
                    "source": source_name,
                    "attention_score": 0  # filled in below
                })
        except Exception as e:
            print(f"Error fetching {source_name}: {e}")
            continue  # one broken site must not stop the others

    # Attach attention scores (Hatena bookmark counts).
    if articles:
        try:
            urls = [a["link"] for a in articles]
            counts = get_hatena_bookmark_counts(urls)
            for article in articles:
                article["attention_score"] = int(counts.get(article["link"], 0))
        except Exception as e:
            print(f"Error updating attention scores: {e}")

    # Highest attention first; sort() is stable, so equal scores keep
    # their feed (insertion) order.
    articles.sort(key=lambda x: x['attention_score'], reverse=True)

    return articles
|