# haberler.com breaking-news scraper
| import requests | |
| from bs4 import BeautifulSoup | |
| from datetime import datetime | |
| import hashlib | |
def get_article_id(url):
    """Derive a stable, unique identifier for an article from its URL.

    Returns the 32-character hexadecimal MD5 digest of the UTF-8
    encoded URL, so the same link always maps to the same id.
    """
    digest = hashlib.md5(url.encode())
    return digest.hexdigest()
def clean_haberler_content(content):
    """Strip boilerplate/advertising lines from scraped article text.

    Any line containing one of the known site phrases (matched
    case-insensitively) is dropped; the surviving lines are re-joined
    with newlines and surrounding whitespace is trimmed.

    Args:
        content: Raw article text, lines separated by "\\n".

    Returns:
        The cleaned text (possibly empty).
    """
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da",
    ]
    # Fix: lowercase the blacklist once instead of re-lowercasing every
    # phrase for every line of every article (loop-invariant hoisting).
    needles = [phrase.lower() for phrase in blacklist_phrases]
    cleaned_lines = [
        line
        for line in content.split("\n")
        if not any(needle in line.lower() for needle in needles)
    ]
    return "\n".join(cleaned_lines).strip()
def _paragraph_text(node):
    """Join the texts of all non-empty <p> descendants of *node* with newlines."""
    return "\n".join(
        p.get_text(strip=True) for p in node.find_all("p") if p.get_text(strip=True)
    )


def extract_full_content(soup):
    """Robustly extract the article body from pages with varying layouts.

    Tries a prioritized list of known container tags/classes first, then
    falls back to every <p> on the page, then to concatenating all <div>
    texts. A candidate is accepted only when it yields more than 50
    characters, to skip empty shells and menu fragments.

    Args:
        soup: Parsed page (BeautifulSoup-like object exposing
              ``find``/``find_all``).

    Returns:
        The extracted text, or "" when nothing usable was found.
    """
    candidate_containers = [
        # Site-specific div classes
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        # Semantic HTML5 containers
        ("article", {}),
        ("section", {}),
        ("main", {}),
    ]
    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            # Duplicated join logic factored into _paragraph_text.
            content = _paragraph_text(container)
            if content and len(content.strip()) > 50:
                return content
    # Last resort: scan every <p> on the whole page.
    content = _paragraph_text(soup)
    if content and len(content.strip()) > 50:
        return content
    # Extra fallback: some articles are a single text blob inside <div>s.
    text_blobs = [
        div.get_text(strip=True) for div in soup.find_all("div") if div.get_text(strip=True)
    ]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content
    return ""  # No usable content found
def scrape_haberler():
    """Scrape breaking-news articles from haberler.com.

    Fetches the listing page, follows every article link (URLs containing
    "haberi"), extracts and cleans the body text, and collects articles
    longer than 50 characters. Failures on individual articles are logged
    and skipped so one bad page does not abort the whole run.

    Returns:
        list[dict]: one dict per article with keys
        "id", "title", "content", "url", "source", "timestamp".
    """
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    # Fix: the listing request previously had no timeout and could hang
    # forever on a stalled connection; use the same budget as the detail
    # requests below.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    seen = set()  # de-duplicate links that appear multiple times on the page
    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)
        # Article URLs on this site contain the substring "haberi".
        if not href or not text or "haberi" not in href:
            continue
        if not href.startswith("http"):
            href = "https://www.haberler.com" + href
        if href in seen:
            continue
        seen.add(href)
        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)
            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    # NOTE(review): datetime.utcnow() is deprecated since
                    # Python 3.12; consider datetime.now(timezone.utc) —
                    # confirm downstream consumers accept the "+00:00"
                    # suffix before switching.
                    "timestamp": datetime.utcnow().isoformat()
                }
                articles.append(article)
                print(f"{article['title']} → {href}")
            else:
                print(f"İçerik bulunamadı → {href}")
        except Exception as e:
            # Best-effort scraping: log the failure and move on.
            print(f"Hata ({href}): {e}")
    print(f"\nToplam {len(articles)} haber çekildi.")
    return articles
# Manual run: scrape and print a terminal summary of the fetched articles.
if __name__ == "__main__":
    print("Haberler.com sitesinden son dakika haberleri çekiliyor...\n")
    fetched = scrape_haberler()
    print("\nÇekilen Haber Özeti:")
    for order, item in enumerate(fetched, start=1):
        print(f"\n{order}. Haber")
        print(f"Başlık: {item['title']}")
        print(f"Link: {item['url']}")
        print(f"İçerik Uzunluğu: {len(item['content'])} karakter")
        print(f"Zaman Damgası: {item['timestamp']}")
        print(f"\nİçerik:\n{item['content']}")
        print("-" * 120)