|
|
|
|
|
|
|
|
import datetime
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
def _collect_article_links(soup, base_url):
    """Return the sorted, de-duplicated article URLs linked from *soup*."""
    article_prefix = f"{base_url}/nepali/articles/"
    links = set()
    for a_tag in soup.find_all("a", href=True):
        # urljoin resolves relative hrefs against the site root and leaves
        # absolute ones untouched, so both link styles are recognised.
        full_url = urljoin(f"{base_url}/", a_tag["href"].strip())
        if full_url.startswith(article_prefix):
            links.add(full_url)
    # Sorting makes the scrape order (and the output file) reproducible;
    # iterating a set of strings directly varies between interpreter runs.
    return sorted(links)


def _fetch_article_text(session, url, timeout):
    """Download *url* and return the text of its <p> elements, newline-joined."""
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return "\n".join(p.get_text() for p in soup.find_all("p"))


def scrape_bbc_nepali(output_dir="data/raw", timeout=10):
    """
    Scrapes news articles from the BBC Nepali homepage and saves them to a file.

    The homepage is fetched, every link pointing under ``/nepali/articles/``
    is collected, and the paragraph text of each linked article is written to
    ``<output_dir>/bbc_nepali_articles_<YYYY-MM-DD>.txt``, with articles
    separated by a ``--- NEW ARTICLE ---`` marker line.

    Failures are reported on stdout rather than raised: if the homepage cannot
    be fetched the function returns without writing anything, and an article
    that fails to download or parse is skipped.

    Args:
        output_dir: Directory for the output file; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.
    """
    base_url = "https://www.bbc.com"
    start_url = f"{base_url}/nepali"

    # The date in the filename uses the local clock of the machine running
    # the scrape.
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    output_filename = f"bbc_nepali_articles_{current_date}.txt"

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)

    print(f"Starting scrape of {start_url}")
    print(f"Saving data to: {output_path}")

    all_article_text = []

    # One session for the whole run so connections to the host are reused.
    with requests.Session() as session:
        try:
            # Without an explicit timeout, requests waits indefinitely on a
            # stalled connection.
            main_page = session.get(start_url, timeout=timeout)
            main_page.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch the main page {start_url}: {e}")
            return

        main_soup = BeautifulSoup(main_page.content, "html.parser")
        article_links = _collect_article_links(main_soup, base_url)
        total = len(article_links)
        print(f"Found {total} unique article links.")

        for position, link in enumerate(article_links, start=1):
            print(f" Scraping ({position}/{total}): {link}")
            try:
                article_text = _fetch_article_text(session, link, timeout)
            except requests.exceptions.RequestException as e:
                print(f" Could not fetch article {link}: {e}")
                continue
            except Exception as e:
                # Deliberate best-effort: one malformed page must not abort
                # the remaining articles.
                print(f" An error occurred while processing {link}: {e}")
                continue

            if not article_text.strip():
                # A page without paragraph text would only add an empty
                # section to the output.
                print(f" No paragraph text found in {link}; skipping.")
                continue
            all_article_text.append(article_text)

    if not all_article_text:
        # Opening the file in "w" mode would truncate an earlier successful
        # scrape from the same day, so leave the file alone.
        print("No article text was collected; nothing was written.")
        return

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n--- NEW ARTICLE ---\n\n".join(all_article_text))

    print(f"\nScraping complete. All text saved to {output_path}")
|
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    scrape_bbc_nepali()