Spaces:
Sleeping
Sleeping
| # milliyet_link_scraper.py | |
| import requests | |
| from bs4 import BeautifulSoup | |
def get_sondakika_links(timeout=10):
    """Collect news article links from Milliyet's "son dakika" listing page.

    Downloads the listing page, walks every anchor that carries an ``href``,
    turns site-relative links into absolute URLs, and keeps the ones that
    look like Milliyet news articles.

    Args:
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A list of absolute article URLs, de-duplicated, in the order they
        first appear on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = "https://www.milliyet.com.tr/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = "https://www.milliyet.com.tr"

    # Without a timeout an unresponsive server would block the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    seen = set()
    news_links = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        if href.startswith("//"):
            # Protocol-relative URL: it already names a host, so only the
            # scheme is missing. Prepending base_url would corrupt it.
            href = "https:" + href
        elif href.startswith("/"):
            href = base_url + href
        elif not href.startswith("http"):
            # Skip anchors that are neither site-relative nor http(s) URLs.
            continue

        # Keep only Milliyet links whose URL contains "-737"; this is the
        # heuristic used here to recognise news article IDs.
        if "-737" not in href or "milliyet.com.tr" not in href:
            continue

        if href not in seen:
            seen.add(href)
            news_links.append(href)

    return news_links
def get_news_content(url, timeout=10):
    """Fetch a news page and extract its headline and body text.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A dict with two keys: ``"title"`` (the headline text, or a
        placeholder string when no ``<h1>`` is found) and ``"content"``
        (the non-empty paragraph texts joined by newlines).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Without a timeout an unresponsive server would block the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Headline lookup: try the more specific selectors first, then fall back
    # to any <h1>, to cope with different page layouts.
    title_selectors = (
        ("h1", {"id": "title"}),
        ("h1", {"class": "news-title"}),
        ("h1", {}),
    )
    title = None
    for tag_name, attrs in title_selectors:
        found = soup.find(tag_name, attrs)
        if found:
            title = found.get_text(strip=True)
            break
    if not title:
        title = "Başlık bulunamadı"

    # Body lookup: prefer a known article container; if none exists, fall
    # back to every paragraph on the page.
    article_div = (
        soup.find("div", class_="articleBox")
        or soup.find("div", class_="news-content")
    )
    scope = article_div if article_div else soup
    texts = (p.get_text(strip=True) for p in scope.find_all("p"))
    content = "\n".join(text for text in texts if text)

    return {
        "title": title,
        "content": content.strip(),
    }