lune-lune commited on
Commit
c6a8a49
·
verified ·
1 Parent(s): 3095f95

Update news_collector.py

Browse files
Files changed (1) hide show
  1. news_collector.py +62 -42
news_collector.py CHANGED
@@ -1,82 +1,102 @@
1
  import requests
2
  import feedparser
3
  import re
 
4
 
5
- # config.pyの代わりに直接定義
 
6
  RSS_URLS = {
7
- "NHK": "[https://www.nhk.or.jp/rss/news/cat5.xml](https://www.nhk.or.jp/rss/news/cat5.xml)",
8
- "HATENA": "[https://b.hatena.ne.jp/hotentry/economics.rss](https://b.hatena.ne.jp/hotentry/economics.rss)"
 
 
9
  }
10
 
11
  def clean_html(raw_html):
 
 
 
12
  cleanr = re.compile('<.*?>')
13
- cleantext = re.sub(cleanr, '', raw_html)
14
- return cleantext
 
 
15
 
16
  def get_hatena_bookmark_counts(urls):
17
- """
18
- Fetch bookmark counts for a list of URLs using Hatena Bookmark API.
19
- API Endpoint: https://bookmark.hatenaapis.com/count/entries
20
- """
21
  if not urls:
22
  return {}
23
 
24
  api_url = "https://bookmark.hatenaapis.com/count/entries"
25
  counts = {}
26
 
27
- # Process in batches of 50 (API limit)
28
  batch_size = 50
29
  for i in range(0, len(urls), batch_size):
30
  batch_urls = urls[i:i + batch_size]
31
  params = [('url', url) for url in batch_urls]
32
  try:
33
- response = requests.get(api_url, params=params, timeout=10)
 
34
  if response.status_code == 200:
35
  counts.update(response.json())
36
- else:
37
- print(f"Warning: Hatena API returned status {response.status_code}")
38
  except Exception as e:
39
- print(f"Error fetching Hatena counts: {e}")
40
 
41
  return counts
42
 
43
  def fetch_rss_news():
 
44
  articles = []
45
  seen_titles = set()
46
 
47
- # 1. Fetch from RSS
48
  for source_name, url in RSS_URLS.items():
49
- feed = feedparser.parse(url)
50
- for entry in feed.entries:
51
- title = entry.title
 
 
 
52
 
53
- if title in seen_titles:
 
54
  continue
55
- seen_titles.add(title)
56
 
57
- link = entry.link
58
- summary = clean_html(entry.get('description', ''))
59
-
60
- # Initial object
61
- articles.append({
62
- "title": title,
63
- "link": link,
64
- "summary": summary,
65
- "source": source_name,
66
- "attention_score": 0 # Placeholder
67
- })
68
-
69
- # 2. Fetch Attention Scores (Hatena Bookmarks)
 
 
 
 
 
 
 
 
 
 
 
70
  if articles:
71
- urls = [a["link"] for a in articles]
72
- counts = get_hatena_bookmark_counts(urls)
73
-
74
- for article in articles:
75
- # API returns dict {url: count}
76
- count = counts.get(article["link"], 0)
77
- article["attention_score"] = int(count)
 
78
 
79
- # 3. Sort by Attention Score (Descending)
80
- articles.sort(key=lambda x: x["attention_score"], reverse=True)
81
 
82
  return articles
 
1
  import requests
2
  import feedparser
3
  import re
4
+ import time
5
 
6
# News sources (defined here directly, in place of config.py).
# Major outlets were chosen because their feed URLs rarely go stale
# and they are unlikely to reject programmatic access.
RSS_URLS = {
    "NHKビジネス": "https://www.nhk.or.jp/rss/news/cat5.xml",
    "日経(主要)": "https://www.nikkei.com/news/category/main/rss",
    "ITmedia(経済)": "https://rss.itmedia.co.jp/rss/2.0/news_itmarket.xml",
    "はてなブックマーク(経済)": "https://b.hatena.ne.jp/hotentry/economics.rss"
}
14
 
15
def clean_html(raw_html):
    """Strip HTML tags from *raw_html* and normalize whitespace.

    Args:
        raw_html: HTML fragment (any type; coerced with str()).
            Falsy values (None, "", 0) yield "".

    Returns:
        Plain text with all whitespace runs collapsed to single
        spaces, truncated to 200 characters.
    """
    if not raw_html:
        return ""
    # Naive tag stripper — adequate for short RSS summaries.
    text = re.sub(r'<.*?>', '', str(raw_html))
    # Collapse ALL whitespace runs (\r, \n, \t, doubled spaces),
    # not just bare '\n': CRLF feeds would otherwise leave stray \r,
    # and tag removal often leaves double spaces.
    text = ' '.join(text.split())
    return text[:200]  # cap over-long summaries
24
 
25
def get_hatena_bookmark_counts(urls):
    """Fetch Hatena Bookmark counts for a list of URLs.

    Queries the Hatena Bookmark count API in batches of 50 (the API's
    per-request limit). Failures are logged and skipped so one bad
    batch does not abort the rest.

    Args:
        urls: list of article URLs (empty/None returns {}).

    Returns:
        dict mapping url -> bookmark count; URLs the API did not
        report are simply absent.
    """
    if not urls:
        return {}

    api_url = "https://bookmark.hatenaapis.com/count/entries"
    counts = {}

    # Batch in chunks of 50 — the API's documented per-request limit.
    batch_size = 50
    for i in range(0, len(urls), batch_size):
        batch_urls = urls[i:i + batch_size]
        params = [('url', url) for url in batch_urls]
        try:
            # Short timeout so a slow API cannot stall the whole run.
            response = requests.get(api_url, params=params, timeout=5)
            if response.status_code == 200:
                counts.update(response.json())
            else:
                # Surface non-200 responses instead of dropping the
                # batch silently (this warning existed in an earlier
                # revision and was lost).
                print(f"Warning: Hatena API returned status {response.status_code}")
        except Exception as e:
            print(f"Warning: Hatena API Error: {e}")

    return counts
47
 
48
def fetch_rss_news():
    """Fetch news articles from every feed in RSS_URLS.

    Each feed is downloaded with requests (explicit User-Agent, since
    some servers reject the default one) and parsed with feedparser.
    Titles are de-duplicated across all sources. After collection,
    Hatena bookmark counts are attached as "attention_score" and the
    list is sorted by that score, highest first.

    Returns:
        list of dicts with keys: title, link, summary, source,
        attention_score (int). Empty list if every feed fails.
    """
    articles = []
    seen_titles = set()

    for source_name, url in RSS_URLS.items():
        try:
            # Explicit User-Agent to avoid rejection by some servers.
            headers = {'User-Agent': 'Mozilla/5.0'}
            # feedparser can be unreliable when handed a URL directly,
            # so fetch the body with requests and parse the bytes.
            resp = requests.get(url, headers=headers, timeout=10)
            # Fail loudly on HTTP errors (403/404/5xx): without this,
            # an error page parses as an empty feed and is misreported
            # below as "No entries found".
            resp.raise_for_status()
            feed = feedparser.parse(resp.content)

            if not feed.entries:
                print(f"No entries found for {source_name}")
                continue

            for entry in feed.entries:
                title = entry.get('title', 'No Title')

                # Skip duplicates and title-less entries.
                if title in seen_titles or title == 'No Title':
                    continue
                seen_titles.add(title)

                link = entry.get('link', '')
                # Prefer 'summary'; fall back to 'description', then a
                # placeholder string.
                summary = clean_html(entry.get('summary', entry.get('description', '内容なし')))

                articles.append({
                    "title": title,
                    "link": link,
                    "summary": summary,
                    "source": source_name,
                    "attention_score": 0  # filled in below
                })
        except Exception as e:
            print(f"Error fetching {source_name}: {e}")
            continue  # one broken source must not stop the others

    # Attach attention scores (Hatena bookmark counts).
    if articles:
        try:
            urls = [a["link"] for a in articles]
            counts = get_hatena_bookmark_counts(urls)
            for article in articles:
                article["attention_score"] = int(counts.get(article["link"], 0))
        except Exception as e:
            print(f"Error updating attention scores: {e}")

    # Highest attention first; the stable sort keeps feed order on ties.
    articles.sort(key=lambda x: x['attention_score'], reverse=True)

    return articles