Update news_collector.py
Browse files- news_collector.py +62 -42
news_collector.py
CHANGED
|
@@ -1,82 +1,102 @@
|
|
| 1 |
import requests
|
| 2 |
import feedparser
|
| 3 |
import re
|
|
|
|
| 4 |
|
| 5 |
-
# config.pyの代わりに直接
|
|
|
|
| 6 |
RSS_URLS = {
|
| 7 |
-
"NHK": "
|
| 8 |
-
"
|
|
|
|
|
|
|
| 9 |
}
|
| 10 |
|
| 11 |
def clean_html(raw_html):
|
|
|
|
|
|
|
|
|
|
| 12 |
cleanr = re.compile('<.*?>')
|
| 13 |
-
cleantext = re.sub(cleanr, '', raw_html)
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def get_hatena_bookmark_counts(urls):
|
| 17 |
-
"""
|
| 18 |
-
Fetch bookmark counts for a list of URLs using Hatena Bookmark API.
|
| 19 |
-
API Endpoint: https://bookmark.hatenaapis.com/count/entries
|
| 20 |
-
"""
|
| 21 |
if not urls:
|
| 22 |
return {}
|
| 23 |
|
| 24 |
api_url = "https://bookmark.hatenaapis.com/count/entries"
|
| 25 |
counts = {}
|
| 26 |
|
| 27 |
-
#
|
| 28 |
batch_size = 50
|
| 29 |
for i in range(0, len(urls), batch_size):
|
| 30 |
batch_urls = urls[i:i + batch_size]
|
| 31 |
params = [('url', url) for url in batch_urls]
|
| 32 |
try:
|
| 33 |
-
|
|
|
|
| 34 |
if response.status_code == 200:
|
| 35 |
counts.update(response.json())
|
| 36 |
-
else:
|
| 37 |
-
print(f"Warning: Hatena API returned status {response.status_code}")
|
| 38 |
except Exception as e:
|
| 39 |
-
print(f"
|
| 40 |
|
| 41 |
return counts
|
| 42 |
|
| 43 |
def fetch_rss_news():
|
|
|
|
| 44 |
articles = []
|
| 45 |
seen_titles = set()
|
| 46 |
|
| 47 |
-
# 1. Fetch from RSS
|
| 48 |
for source_name, url in RSS_URLS.items():
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
if
|
|
|
|
| 54 |
continue
|
| 55 |
-
seen_titles.add(title)
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
if articles:
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
articles.sort(key=lambda x: x[
|
| 81 |
|
| 82 |
return articles
|
|
|
|
| 1 |
import requests
|
| 2 |
import feedparser
|
| 3 |
import re
|
| 4 |
+
import time
|
| 5 |
|
| 6 |
+
# ニュース取得元(config.pyの代わりにここに直接記述)
|
| 7 |
+
# URLが古くなったりアクセス拒否されたりしにくい大手サイトを選定
|
| 8 |
RSS_URLS = {
|
| 9 |
+
"NHKビジネス": "https://www.nhk.or.jp/rss/news/cat5.xml",
|
| 10 |
+
"日経(主要)": "https://www.nikkei.com/news/category/main/rss",
|
| 11 |
+
"ITmedia(経済)": "https://rss.itmedia.co.jp/rss/2.0/news_itmarket.xml",
|
| 12 |
+
"はてなブックマーク(経済)": "https://b.hatena.ne.jp/hotentry/economics.rss"
|
| 13 |
}
|
| 14 |
|
| 15 |
def clean_html(raw_html):
    """Strip HTML tags from *raw_html* and tidy the remaining text.

    Args:
        raw_html: An HTML fragment (any value is coerced via str()).
            Falsy input (None, "") yields "".

    Returns:
        Plain text with tags removed, all whitespace runs collapsed to
        single spaces, truncated to at most 200 characters.
    """
    if not raw_html:
        return ""
    # Drop anything that looks like a tag (non-greedy, so the text
    # between tags survives).
    text = re.sub(r'<.*?>', '', str(raw_html))
    # Collapse every whitespace run (\r, \n, \t, doubled spaces) to one
    # space. The previous replace('\n', ' ') left stray '\r' characters
    # and double spaces behind, defeating its own cleanup intent.
    text = ' '.join(text.split())
    return text[:200]  # cap overly long summaries
|
| 24 |
|
| 25 |
def get_hatena_bookmark_counts(urls):
    """Fetch Hatena Bookmark counts for a list of URLs.

    Uses the Hatena Bookmark count API
    (https://bookmark.hatenaapis.com/count/entries), which takes
    repeated 'url' query parameters, batched 50 at a time.

    Args:
        urls: List of article URLs. An empty list short-circuits to {}.

    Returns:
        dict mapping URL -> bookmark count. URLs from failed batches are
        simply absent, so callers should read with counts.get(url, 0).
    """
    if not urls:
        return {}

    api_url = "https://bookmark.hatenaapis.com/count/entries"
    counts = {}

    # Process 50 URLs per request (API batch limit).
    batch_size = 50
    for i in range(0, len(urls), batch_size):
        batch_urls = urls[i:i + batch_size]
        params = [('url', url) for url in batch_urls]
        try:
            # Short timeout so one slow batch cannot stall the whole run.
            response = requests.get(api_url, params=params, timeout=5)
            if response.status_code == 200:
                counts.update(response.json())
            else:
                # Surface non-200 responses instead of dropping the batch
                # silently — this warning existed before and was lost.
                print(f"Warning: Hatena API returned status {response.status_code}")
        except Exception as e:
            print(f"Warning: Hatena API Error: {e}")

    return counts
|
| 47 |
|
| 48 |
def fetch_rss_news():
    """Fetch and rank news articles from every feed in RSS_URLS.

    Returns:
        list[dict]: Articles with keys "title", "link", "summary",
        "source", and "attention_score" (Hatena bookmark count),
        sorted by attention_score descending. Failures on individual
        feeds or on the bookmark API degrade gracefully: the failing
        source is skipped (with a printed message) rather than raising.
    """
    articles = []
    seen_titles = set()  # dedupe titles across ALL sources

    for source_name, url in RSS_URLS.items():
        try:
            # Fetch via requests with a UA header: feedparser can be
            # unreliable when handed a URL directly, and some sites
            # reject requests that carry no User-Agent.
            headers = {'User-Agent': 'Mozilla/5.0'}
            resp = requests.get(url, headers=headers, timeout=10)
            # Fail fast on HTTP 4xx/5xx; otherwise an error page would
            # be parsed below and misreported as an empty feed.
            resp.raise_for_status()
            feed = feedparser.parse(resp.content)

            if not feed.entries:
                print(f"No entries found for {source_name}")
                continue

            for entry in feed.entries:
                title = entry.get('title', 'No Title')

                # Skip duplicates and entries with no usable title.
                if title in seen_titles or title == 'No Title':
                    continue
                seen_titles.add(title)

                link = entry.get('link', '')
                # Prefer 'summary', then 'description', then a placeholder.
                summary = clean_html(entry.get('summary', entry.get('description', '内容なし')))

                articles.append({
                    "title": title,
                    "link": link,
                    "summary": summary,
                    "source": source_name,
                    "attention_score": 0  # filled in below
                })
        except Exception as e:
            print(f"Error fetching {source_name}: {e}")
            continue  # one broken site must not stop the others

    # Attach attention scores (Hatena bookmark counts).
    if articles:
        try:
            urls = [a["link"] for a in articles]
            counts = get_hatena_bookmark_counts(urls)
            for article in articles:
                article["attention_score"] = int(counts.get(article["link"], 0))
        except Exception as e:
            print(f"Error updating attention scores: {e}")

    # Highest attention first; sort() is stable, so equal scores keep
    # their feed (insertion) order.
    articles.sort(key=lambda x: x['attention_score'], reverse=True)

    return articles
|