Spaces:

cwpkd
/

todlong2

No application file

App Files Files Community

cwpkd committed on Nov 3, 2025

Commit

ceb72a8

verified ·

1 Parent(s): 83f44c4

Create news_scraper.py

Browse files

Files changed (1) hide show

news_scraper.py +148 -0

news_scraper.py ADDED Viewed

	@@ -0,0 +1,148 @@

import time
from typing import List, Dict
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup

from config import YAHOO_FINANCE_NEWS_URL, HEADERS, MAX_NEWS_ITEMS
class YahooFinanceNewsScraper:
    """Scrape news headlines from Yahoo Finance pages.

    Fetches either the configured landing page or the site's search page
    for a query, then extracts ``title`` / ``description`` / ``link`` /
    ``source`` records from the returned HTML.
    """

    # Root used to build the search URL and to resolve relative links.
    SITE_ROOT = "https://finance.yahoo.com"

    def __init__(self):
        """Store the landing-page URL and request headers taken from ``config``."""
        self.base_url = YAHOO_FINANCE_NEWS_URL
        self.headers = HEADERS

    def scrape_news(self, query: str = "", max_items: int = MAX_NEWS_ITEMS) -> List[Dict]:
        """Fetch news items from Yahoo Finance.

        Args:
            query: Search term. When empty, the configured landing page
                (``self.base_url``) is scraped instead.
            max_items: Maximum number of items to return; values below
                zero are treated as zero.

        Returns:
            A list of dicts with the keys ``title``, ``description``,
            ``link`` and ``source``. If anything fails (request error,
            HTTP error status, parsing error) the error is printed and the
            placeholder items from ``_get_sample_news`` are returned
            instead, capped at ``max_items``.
        """
        limit = max(max_items, 0)
        try:
            if query:
                # quote_plus escapes reserved characters (&, #, +, %, ...)
                # and still encodes spaces as '+'.
                search_url = f"{self.SITE_ROOT}/search?q={quote_plus(query)}"
            else:
                search_url = self.base_url

            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # The page markup varies, so try several selectors in order and
            # use the first one that matches anything.
            article_containers = (
                soup.find_all('div', class_='Ov(h)') or
                soup.find_all('div', class_='js-stream-content') or
                soup.find_all('li', class_='js-stream-content') or
                soup.find_all('h3')
            )

            news_items = self._parse_containers(article_containers, limit)

            # Nothing usable in the containers: fall back to scanning links.
            if not news_items:
                news_items = self._fallback_scrape(soup, limit)
            return news_items[:limit]
        except Exception as exc:
            # Deliberate best-effort boundary: report the failure and hand
            # back clearly labelled sample data rather than raising.
            print(f"Error scraping news: {exc}")
            return self._get_sample_news()[:limit]

    def _absolute_url(self, href: str) -> str:
        """Resolve ``href`` against ``SITE_ROOT``; an empty href stays empty."""
        if not href:
            return ''
        # urljoin copes with '/path', 'path', '//host/path' and absolute URLs.
        return urljoin(self.SITE_ROOT, href)

    def _parse_containers(self, containers, max_items: int) -> List[Dict]:
        """Convert candidate container tags into at most ``max_items`` records."""
        news_items = []
        for item in containers:
            # Cap on accepted items, so containers that get filtered out
            # below do not use up the quota.
            if len(news_items) >= max_items:
                break
            try:
                title_elem = item.find('a') or item.find('h3')
                if not title_elem:
                    continue
                title = title_elem.get_text(strip=True)
                # Titles of 10 characters or fewer are treated as invalid.
                if len(title) <= 10:
                    continue
                desc_elem = item.find('p')
                description = desc_elem.get_text(strip=True) if desc_elem else ""
                news_items.append({
                    'title': title,
                    'description': description,
                    'link': self._absolute_url(title_elem.get('href', '')),
                    'source': 'Yahoo Finance'
                })
            except (AttributeError, TypeError):
                # Unexpected node shape: skip this container, keep the rest.
                continue
        return news_items

    def _fallback_scrape(self, soup, max_items: int) -> List[Dict]:
        """Fallback strategy: collect links that look like article headlines.

        A link qualifies when its text is longer than 20 characters and its
        href contains ``news`` or ``article``. At most ``max_items`` records
        are returned, with an empty ``description``.
        """
        news_items = []
        for anchor in soup.find_all('a'):
            if len(news_items) >= max_items:
                break
            text = anchor.get_text(strip=True)
            href = anchor.get('href', '')
            if len(text) > 20 and ('news' in href or 'article' in href):
                news_items.append({
                    'title': text,
                    'description': '',
                    'link': self._absolute_url(href),
                    'source': 'Yahoo Finance'
                })
        return news_items

    def _get_sample_news(self) -> List[Dict]:
        """Return hard-coded sample items used when scraping fails.

        Every item carries the source ``'Yahoo Finance (Sample)'`` so it can
        be told apart from scraped data.
        """
        return [
            {
                'title': 'Stock Market Rallies on Strong Economic Data',
                'description': 'Major indices posted significant gains as investors reacted positively to economic indicators.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Tech Stocks Lead Market Higher Amid AI Boom',
                'description': 'Technology sector outperforms as artificial intelligence investments surge.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            },
            {
                'title': 'Federal Reserve Holds Interest Rates Steady',
                'description': 'Central bank maintains current policy stance citing inflation concerns.',
                'link': 'https://finance.yahoo.com',
                'source': 'Yahoo Finance (Sample)'
            }
        ]
def test_scraper():
    """Smoke-test the scraper: search for "technology" and print up to 5 hits."""
    results = YahooFinanceNewsScraper().scrape_news(query="technology", max_items=5)
    total = len(results)
    print(f"Found {total} news items:")
    # Number the entries from 1 for display.
    for position, entry in enumerate(results, start=1):
        headline = entry['title']
        url = entry['link']
        print(f"\n{position}. {headline}")
        print(f"   Link: {url}")
# Run the smoke test only when this file is executed as a script, so that
# importing the module does not trigger a scrape.
if __name__ == "__main__":
    test_scraper()