Spaces:

Dyno1307
/

Translate

Configuration error

App Files Files Community

Translate / scripts /scrape_bbc_nepali.py

Dyno1307

Upload 48 files

b653f91 verified 2 months ago

raw

history blame contribute delete

3.17 kB

	# scripts/scrape_bbc_nepali.py

	import requests
	from bs4 import BeautifulSoup
	import datetime
	import os

	def scrape_bbc_nepali(timeout=10.0):
	    """
	    Scrape news articles from the BBC Nepali homepage and save them to a file.

	    The homepage is fetched once and every link on it that points at
	    ``/nepali/articles/...`` on www.bbc.com is collected. Each linked page is
	    then downloaded and the text of all of its ``<p>`` tags is joined with
	    newlines. The collected articles are written, separated by a
	    ``--- NEW ARTICLE ---`` delimiter, to
	    ``data/raw/bbc_nepali_articles_<YYYY-MM-DD>.txt`` (relative to the current
	    working directory).

	    Args:
	        timeout: Seconds to wait on each HTTP request before giving up.
	            Without a timeout ``requests`` can block forever on a stalled
	            connection.

	    Returns:
	        None. Progress and errors are reported with ``print``. A failure to
	        fetch the homepage aborts the scrape; a failure on a single article
	        is reported and that article is skipped.
	    """
	    # Imported here so the function does not depend on a file-level import.
	    from urllib.parse import urljoin, urlparse

	    # The base URL for BBC Nepali news
	    BASE_URL = "https://www.bbc.com"
	    START_URL = f"{BASE_URL}/nepali"
	    ARTICLE_PATH_PREFIX = "/nepali/articles/"
	    base_host = urlparse(BASE_URL).netloc

	    # Get the current date to create a unique filename
	    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
	    output_filename = f"bbc_nepali_articles_{current_date}.txt"

	    # Ensure the output directory exists
	    output_dir = "data/raw"
	    os.makedirs(output_dir, exist_ok=True)
	    output_path = os.path.join(output_dir, output_filename)

	    print(f"Starting scrape of {START_URL}")
	    print(f"Saving data to: {output_path}")

	    all_article_text = []

	    # A single session reuses the connection for the many requests that all
	    # go to the same host.
	    with requests.Session() as session:
	        # 1. Fetch the main homepage
	        try:
	            main_page = session.get(START_URL, timeout=timeout)
	            # Raise an exception for bad status codes (4xx or 5xx)
	            main_page.raise_for_status()
	        except requests.exceptions.RequestException as e:
	            print(f"Failed to fetch the main page {START_URL}: {e}")
	            return

	        main_soup = BeautifulSoup(main_page.content, "html.parser")

	        # 2. Find all links that lead to articles. hrefs are resolved against
	        # BASE_URL first so that relative ("/nepali/articles/...") and
	        # absolute ("https://www.bbc.com/nepali/articles/...") links are
	        # treated the same. A set removes duplicates.
	        article_links = set()
	        for a_tag in main_soup.find_all("a", href=True):
	            try:
	                full_url = urljoin(BASE_URL, a_tag["href"])
	                parts = urlparse(full_url)
	            except ValueError:
	                # Malformed href (e.g. a broken IPv6 literal); not an article.
	                continue
	            if parts.netloc == base_host and parts.path.startswith(ARTICLE_PATH_PREFIX):
	                article_links.add(full_url)

	        print(f"Found {len(article_links)} unique article links.")

	        # 3. Visit each article and extract its text. The links are sorted
	        # because set iteration order is arbitrary and would otherwise make
	        # the output file differ from run to run.
	        for i, link in enumerate(sorted(article_links), start=1):
	            print(f" Scraping ({i}/{len(article_links)}): {link}")
	            try:
	                article_page = session.get(link, timeout=timeout)
	                article_page.raise_for_status()

	                article_soup = BeautifulSoup(article_page.content, "html.parser")

	                # Paragraph tags (<p>) usually contain the article text
	                paragraphs = article_soup.find_all("p")
	                article_text = "\n".join(p.get_text() for p in paragraphs)
	            except requests.exceptions.RequestException as e:
	                print(f" Could not fetch article {link}: {e}")
	                continue
	            except Exception as e:
	                # Best-effort scrape: one unparsable page must not abort the run.
	                print(f" An error occurred while processing {link}: {e}")
	                continue

	            # Skip pages that yielded no text so the output has no empty
	            # sections between delimiters.
	            if article_text.strip():
	                all_article_text.append(article_text)

	    # Opening the file in "w" mode truncates it, so writing with nothing
	    # collected would replace an earlier successful scrape from the same day
	    # with an empty file.
	    if not all_article_text:
	        print("No article text was collected; nothing was written.")
	        return

	    # 4. Save the collected text to a file
	    with open(output_path, "w", encoding="utf-8") as f:
	        # Separate articles with a clear delimiter
	        f.write("\n\n--- NEW ARTICLE ---\n\n".join(all_article_text))

	    print(f"\nScraping complete. All text saved to {output_path}")

	# Run the scraper only when this file is executed as a script, so that
	# importing the module does not trigger any network or file activity.
	if __name__ == "__main__":
	    scrape_bbc_nepali()