cwpkd commited on
Commit
ceb72a8
·
verified ·
1 Parent(s): 83f44c4

Create news_scraper.py

Browse files
Files changed (1) hide show
  1. news_scraper.py +148 -0
news_scraper.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from typing import Dict, List
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

from config import HEADERS, MAX_NEWS_ITEMS, YAHOO_FINANCE_NEWS_URL
6
+
7
class YahooFinanceNewsScraper:
    """Scrape news headlines from Yahoo Finance.

    Items are returned as plain dictionaries with the keys
    ``title``, ``description``, ``link`` and ``source``.
    """

    def __init__(self):
        # Base feed URL and HTTP headers come from the project config module.
        self.base_url = YAHOO_FINANCE_NEWS_URL
        self.headers = HEADERS

    def scrape_news(self, query: str = "", max_items: int = MAX_NEWS_ITEMS) -> List[Dict]:
        """Fetch news from Yahoo Finance.

        Args:
            query: Search term; when empty, the front-page feed is scraped.
            max_items: Maximum number of news items to return.

        Returns:
            List of dictionaries containing news data. When scraping fails
            entirely, canned sample items are returned instead so callers
            always receive a usable list.
        """
        try:
            # Build the target URL. quote_plus() percent-encodes every
            # special character — the previous str.replace(' ', '+') broke
            # queries such as "S&P 500" or non-ASCII terms.
            if query:
                search_url = f"https://finance.yahoo.com/search?q={quote_plus(query)}"
            else:
                search_url = self.base_url

            # Fetch the page; raise on HTTP errors so the outer handler
            # can fall back to sample data.
            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            news_items = []

            # Yahoo Finance markup varies between page types; try several
            # selectors from most to least specific, falling back to bare
            # <h3> headline tags.
            article_containers = (
                soup.find_all('div', class_='Ov(h)') or
                soup.find_all('div', class_='js-stream-content') or
                soup.find_all('li', class_='js-stream-content') or
                soup.find_all('h3')
            )

            for item in article_containers[:max_items]:
                try:
                    # Headline anchor (or nested <h3>) carries title + href.
                    title_elem = item.find('a') or item.find('h3')
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    link = title_elem.get('href', '')

                    # Resolve relative URLs against the site root.
                    if link and not link.startswith('http'):
                        link = f"https://finance.yahoo.com{link}"

                    # Optional summary paragraph.
                    desc_elem = item.find('p')
                    description = desc_elem.get_text(strip=True) if desc_elem else ""

                    if title and len(title) > 10:  # filter out nav/stub entries
                        news_items.append({
                            'title': title,
                            'description': description,
                            'link': link,
                            'source': 'Yahoo Finance'
                        })

                except Exception:
                    # Best-effort parsing: one malformed container must not
                    # abort the whole scrape.
                    continue

            # Secondary strategy when none of the selectors matched.
            if not news_items:
                news_items = self._fallback_scrape(soup, max_items)

            return news_items[:max_items]

        except Exception as e:
            print(f"Error scraping news: {str(e)}")
            return self._get_sample_news()

    def _fallback_scrape(self, soup, max_items: int) -> List[Dict]:
        """Fallback strategy: harvest all links that look like headlines."""
        news_items = []
        seen_links = set()  # dedupe — Yahoo pages repeat the same article link

        # Consider every anchor whose text is long enough to be a headline.
        all_links = soup.find_all('a')

        for link in all_links:
            text = link.get_text(strip=True)
            href = link.get('href', '')

            # Heuristic: substantial anchor text pointing at a news/article URL.
            if len(text) > 20 and ('news' in href or 'article' in href):
                if not href.startswith('http'):
                    href = f"https://finance.yahoo.com{href}"

                if href in seen_links:
                    continue
                seen_links.add(href)

                news_items.append({
                    'title': text,
                    'description': '',
                    'link': href,
                    'source': 'Yahoo Finance'
                })

                if len(news_items) >= max_items:
                    break

        return news_items

    def _get_sample_news(self) -> List[Dict]:
        """Sample news items returned when scraping fails."""
        return [
            {
                'title': 'Stock Market Rallies on Strong Economic Data',
                'description': 'Major indices posted significant gains as investors reacted positively to economic indicators.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Tech Stocks Lead Market Higher Amid AI Boom',
                'description': 'Technology sector outperforms as artificial intelligence investments surge.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Federal Reserve Holds Interest Rates Steady',
                'description': 'Central bank maintains current policy stance citing inflation concerns.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            }
        ]
136
+
137
def test_scraper():
    """Run a live smoke test of the scraper and print what it finds."""
    scraper = YahooFinanceNewsScraper()
    articles = scraper.scrape_news(query="technology", max_items=5)

    print(f"Found {len(articles)} news items:")
    for idx, article in enumerate(articles, 1):
        print(f"\n{idx}. {article['title']}")
        print(f" Link: {article['link']}")
146
+
147
# Allow running this module directly as a quick smoke test.
if __name__ == "__main__":
    test_scraper()