Spaces:

iamseyhmus7
/

Turkish-LLM-RAG-Chatbot

Sleeping

App Files Files Community

Turkish-LLM-RAG-Chatbot / RAG /scraper /milliyet.py

iamseyhmus7

Upload 17 files

70d956a verified 7 months ago

raw

history blame contribute delete

2.15 kB

	# milliyet_link_scraper.py
	import requests
	from bs4 import BeautifulSoup

	def get_sondakika_links(timeout=10):
	    """Collect unique article URLs from Milliyet's "son dakika" page.

	    Every ``<a>`` tag with an ``href`` on the page is inspected.
	    Site-relative paths (``/path``) are prefixed with the Milliyet host,
	    protocol-relative links (``//host/path``) are given an ``https:``
	    scheme, and any other href that does not start with ``http`` is
	    skipped. A link is kept when it contains both ``-737`` (the substring
	    this scraper uses to recognise article links) and ``milliyet.com.tr``.

	    Args:
	        timeout: Seconds to wait for the server; passed to ``requests.get``.

	    Returns:
	        list[str]: Absolute URLs in page order, without duplicates.

	    Raises:
	        requests.exceptions.RequestException: On connection errors,
	            timeouts, or (via ``raise_for_status``) an HTTP error status.
	    """
	    url = "https://www.milliyet.com.tr/son-dakika/"
	    base_url = "https://www.milliyet.com.tr"
	    headers = {"User-Agent": "Mozilla/5.0"}

	    # Without a timeout, requests may block forever on a stalled server.
	    response = requests.get(url, headers=headers, timeout=timeout)
	    response.raise_for_status()
	    soup = BeautifulSoup(response.text, "html.parser")

	    seen = set()
	    news_links = []

	    for a in soup.find_all("a", href=True):
	        href = a["href"].strip()

	        if href.startswith("//"):
	            # Protocol-relative link: the host is already present, only
	            # the scheme is missing. Prefixing base_url here would yield
	            # a bogus "https://www.milliyet.com.tr//host/..." URL.
	            href = "https:" + href
	        elif href.startswith("/"):
	            href = base_url + href
	        elif not href.startswith("http"):
	            # mailto:, javascript:, fragments, relative paths, ...
	            continue

	        # Keep only links that look like Milliyet news articles.
	        if "-737" not in href or "milliyet.com.tr" not in href:
	            continue
	        if href in seen:
	            continue

	        seen.add(href)
	        news_links.append(href)

	    return news_links
	def get_news_content(url, timeout=10):
	    """Download a news page and extract its title and body text.

	    The title is taken from the first non-empty match among
	    ``<h1 id="title">``, ``<h1 class="news-title">`` and any ``<h1>``.
	    The body is the text of all non-empty ``<p>`` elements inside
	    ``<div class="articleBox">`` or ``<div class="news-content">``; when
	    neither container exists, every ``<p>`` on the page is used instead.

	    Args:
	        url: Address of the article page to fetch.
	        timeout: Seconds to wait for the server; passed to ``requests.get``.

	    Returns:
	        dict: ``{"title": str, "content": str}``. ``title`` falls back to
	        ``"Başlık bulunamadı"`` when no heading text is found; ``content``
	        is newline-joined paragraph text and may be empty.

	    Raises:
	        requests.exceptions.RequestException: On connection errors,
	            timeouts, or (via ``raise_for_status``) an HTTP error status.
	    """
	    headers = {"User-Agent": "Mozilla/5.0"}
	    # Without a timeout, requests may block forever on a stalled server.
	    response = requests.get(url, headers=headers, timeout=timeout)
	    response.raise_for_status()
	    soup = BeautifulSoup(response.text, "html.parser")

	    # Locate the title, trying progressively less specific selectors to
	    # cope with different page layouts.
	    title = None
	    for tag_name, attrs in (
	        ("h1", {"id": "title"}),
	        ("h1", {"class": "news-title"}),
	        ("h1", {}),
	    ):
	        found = soup.find(tag_name, attrs)
	        if found is None:
	            continue
	        text = found.get_text(strip=True)
	        if text:
	            # An empty heading must not stop the search; a later selector
	            # may still yield a usable title.
	            title = text
	            break
	    if not title:
	        title = "Başlık bulunamadı"

	    # Locate the article body; fall back to the whole document when no
	    # known article container is present.
	    article_div = soup.find("div", class_="articleBox") or soup.find("div", class_="news-content")
	    scope = soup if article_div is None else article_div

	    paragraph_texts = (p.get_text(strip=True) for p in scope.find_all("p"))
	    content = "\n".join(text for text in paragraph_texts if text)

	    return {
	        "title": title,
	        "content": content.strip(),
	    }