# haberler.com breaking-news scraper
| import requests | |
| from bs4 import BeautifulSoup | |
| from datetime import datetime | |
| import hashlib | |
def get_article_id(url):
    """Derive a stable, unique identifier for an article from its URL.

    Returns the 32-character hexadecimal MD5 digest of the UTF-8
    encoded URL, so the same link always maps to the same id.
    """
    digest = hashlib.md5(url.encode())
    return digest.hexdigest()
def clean_haberler_content(content):
    """Strip boilerplate/advertising lines from scraped article text.

    Any line containing one of the known site phrases (matched
    case-insensitively) is dropped; the surviving lines are re-joined
    with newlines and surrounding whitespace is trimmed.

    Args:
        content: Raw article text, lines separated by "\\n".

    Returns:
        The cleaned text (possibly empty).
    """
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da",
    ]
    # Fix: lowercase the blacklist once instead of re-lowercasing every
    # phrase for every line of every article (loop-invariant hoisting).
    needles = [phrase.lower() for phrase in blacklist_phrases]
    cleaned_lines = [
        line
        for line in content.split("\n")
        if not any(needle in line.lower() for needle in needles)
    ]
    return "\n".join(cleaned_lines).strip()
def _paragraph_text(node):
    """Join the texts of all non-empty <p> descendants of *node* with newlines."""
    return "\n".join(
        p.get_text(strip=True) for p in node.find_all("p") if p.get_text(strip=True)
    )


def extract_full_content(soup):
    """Robustly extract the article body from pages with varying layouts.

    Tries a prioritized list of known container tags/classes first, then
    falls back to every <p> on the page, then to concatenating all <div>
    texts. A candidate is accepted only when it yields more than 50
    characters, to skip empty shells and menu fragments.

    Args:
        soup: Parsed page (BeautifulSoup-like object exposing
              ``find``/``find_all``).

    Returns:
        The extracted text, or "" when nothing usable was found.
    """
    candidate_containers = [
        # Site-specific div classes
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        # Semantic HTML5 containers
        ("article", {}),
        ("section", {}),
        ("main", {}),
    ]
    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            # Duplicated join logic factored into _paragraph_text.
            content = _paragraph_text(container)
            if content and len(content.strip()) > 50:
                return content
    # Last resort: scan every <p> on the whole page.
    content = _paragraph_text(soup)
    if content and len(content.strip()) > 50:
        return content
    # Extra fallback: some articles are a single text blob inside <div>s.
    text_blobs = [
        div.get_text(strip=True) for div in soup.find_all("div") if div.get_text(strip=True)
    ]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content
    return ""  # No usable content found
def scrape_haberler():
    """Scrape breaking-news articles from haberler.com.

    Fetches the listing page, follows every article link (URLs containing
    "haberi"), extracts and cleans the body text, and collects articles
    longer than 50 characters. Failures on individual articles are logged
    and skipped so one bad page does not abort the whole run.

    Returns:
        list[dict]: one dict per article with keys
        "id", "title", "content", "url", "source", "timestamp".
    """
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    # Fix: the listing request previously had no timeout and could hang
    # forever on a stalled connection; use the same budget as the detail
    # requests below.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    seen = set()  # de-duplicate links that appear multiple times on the page
    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)
        # Article URLs on this site contain the substring "haberi".
        if not href or not text or "haberi" not in href:
            continue
        if not href.startswith("http"):
            href = "https://www.haberler.com" + href
        if href in seen:
            continue
        seen.add(href)
        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)
            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    # NOTE(review): datetime.utcnow() is deprecated since
                    # Python 3.12; consider datetime.now(timezone.utc) —
                    # confirm downstream consumers accept the "+00:00"
                    # suffix before switching.
                    "timestamp": datetime.utcnow().isoformat()
                }
                articles.append(article)
                print(f"{article['title']} → {href}")
            else:
                print(f"İçerik bulunamadı → {href}")
        except Exception as e:
            # Best-effort scraping: log the failure and move on.
            print(f"Hata ({href}): {e}")
    print(f"\nToplam {len(articles)} haber çekildi.")
    return articles
# Manual run: scrape and print a terminal summary of the fetched articles.
if __name__ == "__main__":
    print("Haberler.com sitesinden son dakika haberleri çekiliyor...\n")
    fetched = scrape_haberler()
    print("\nÇekilen Haber Özeti:")
    for order, item in enumerate(fetched, start=1):
        print(f"\n{order}. Haber")
        print(f"Başlık: {item['title']}")
        print(f"Link: {item['url']}")
        print(f"İçerik Uzunluğu: {len(item['content'])} karakter")
        print(f"Zaman Damgası: {item['timestamp']}")
        print(f"\nİçerik:\n{item['content']}")
        print("-" * 120)