| import requests |
| from bs4 import BeautifulSoup |
| from typing import List, Dict |
| import time |
| from config import YAHOO_FINANCE_NEWS_URL, HEADERS, MAX_NEWS_ITEMS |
|
|
class YahooFinanceNewsScraper:
    """Scrape financial news headlines from Yahoo Finance.

    Scraping is strictly best-effort: per-article parse failures are
    skipped, an empty structured result triggers a generic link scan,
    and a total failure returns bundled sample news — so callers always
    receive a list of news dicts and never need to handle an exception.
    """

    def __init__(self):
        # Base listing URL and HTTP headers come from the project config.
        self.base_url = YAHOO_FINANCE_NEWS_URL
        self.headers = HEADERS

    def scrape_news(self, query: str = "", max_items: int = MAX_NEWS_ITEMS) -> List[Dict]:
        """
        Fetch news items from Yahoo Finance.

        Args:
            query: Search term (when empty, the front-page feed is fetched).
            max_items: Maximum number of news items to return.

        Returns:
            List of dicts with keys 'title', 'description', 'link' and
            'source'. On any scraping failure, bundled sample news is
            returned instead of raising.
        """
        try:
            if query:
                search_url = f"https://finance.yahoo.com/search?q={query.replace(' ', '+')}"
            else:
                search_url = self.base_url

            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Yahoo's markup changes frequently; try several known container
            # selectors in order, falling back to bare <h3> headline tags.
            article_containers = (
                soup.find_all('div', class_='Ov(h)') or
                soup.find_all('div', class_='js-stream-content') or
                soup.find_all('li', class_='js-stream-content') or
                soup.find_all('h3')
            )

            news_items = []
            for item in article_containers[:max_items]:
                parsed = self._parse_article(item)
                if parsed:
                    news_items.append(parsed)

            # Structured selectors found nothing usable: scan all page links.
            if not news_items:
                news_items = self._fallback_scrape(soup, max_items)

            return news_items[:max_items]

        except Exception as e:
            # Best-effort contract: report the failure and hand back sample
            # data so downstream consumers still get a usable list.
            print(f"Error scraping news: {str(e)}")
            return self._get_sample_news()

    def _parse_article(self, item):
        """Extract one news dict from an article container tag.

        Returns None when the tag has no usable headline. Any parsing
        error is deliberately swallowed, matching the scraper's tolerant
        stance toward Yahoo's shifting markup.
        """
        try:
            title_elem = item.find('a') or item.find('h3')
            if not title_elem:
                return None

            title = title_elem.get_text(strip=True)
            link = title_elem.get('href', '')

            # Relative links need the Yahoo Finance origin prepended.
            if link and not link.startswith('http'):
                link = f"https://finance.yahoo.com{link}"

            desc_elem = item.find('p')
            description = desc_elem.get_text(strip=True) if desc_elem else ""

            # Very short "titles" are usually navigation labels, not news.
            if title and len(title) > 10:
                return {
                    'title': title,
                    'description': description,
                    'link': link,
                    'source': 'Yahoo Finance'
                }
        except Exception:
            pass
        return None

    def _fallback_scrape(self, soup, max_items: int) -> List[Dict]:
        """Fallback strategy: collect plausible headlines from all links.

        Keeps any link whose text is reasonably long and whose href looks
        like a news/article URL, stopping once max_items are gathered.
        """
        news_items = []

        all_links = soup.find_all('a')

        for link in all_links:
            text = link.get_text(strip=True)
            href = link.get('href', '')

            if len(text) > 20 and ('news' in href or 'article' in href):
                if not href.startswith('http'):
                    href = f"https://finance.yahoo.com{href}"

                news_items.append({
                    'title': text,
                    'description': '',
                    'link': href,
                    'source': 'Yahoo Finance'
                })

                if len(news_items) >= max_items:
                    break

        return news_items

    def _get_sample_news(self) -> List[Dict]:
        """Canned sample news returned when scraping fails entirely."""
        return [
            {
                'title': 'Stock Market Rallies on Strong Economic Data',
                'description': 'Major indices posted significant gains as investors reacted positively to economic indicators.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Tech Stocks Lead Market Higher Amid AI Boom',
                'description': 'Technology sector outperforms as artificial intelligence investments surge.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Federal Reserve Holds Interest Rates Steady',
                'description': 'Central bank maintains current policy stance citing inflation concerns.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            }
        ]
|
|
def test_scraper():
    """Manual smoke test: fetch a few technology headlines and print them."""
    scraper = YahooFinanceNewsScraper()
    news = scraper.scrape_news(query="technology", max_items=5)

    print(f"Found {len(news)} news items:")
    for idx, article in enumerate(news, start=1):
        print(f"\n{idx}. {article['title']}")
        print(f" Link: {article['link']}")
|
|
# Run the manual smoke test when executed directly as a script.
if __name__ == "__main__":
    test_scraper()