Spaces:
Runtime error
Runtime error
Upload 6 files
Browse files- scripts/clean_text_data.py +62 -0
- scripts/create_sinhala_test_set.py +37 -0
- scripts/create_test_set.py +44 -0
- scripts/download_model.py +36 -0
- scripts/fetch_parallel_data.py +81 -0
- scripts/scrape_bbc_nepali.py +80 -0
scripts/clean_text_data.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# scripts/clean_text_data.py

import os
import datetime


def clean_data(min_words_per_line=5):
    """Read today's raw scraped text file, drop short/empty lines, and save
    the result under data/processed.

    Args:
        min_words_per_line (int): Lines with fewer whitespace-separated
            words than this are discarded. Defaults to 5 (the original
            hard-coded threshold).

    Returns:
        None. Prints a summary report, or returns early (with a message)
        if the raw file is missing.
    """
    # Filenames are date-stamped to match the scraper's output convention.
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    raw_filename = f"bbc_nepali_articles_{current_date}.txt"
    cleaned_filename = f"bbc_nepali_articles_{current_date}_cleaned.txt"

    # Project-structure paths (relative to the repo root / CWD).
    raw_file_path = os.path.join("data", "raw", raw_filename)
    processed_file_path = os.path.join("data", "processed", cleaned_filename)

    print("--- Starting data cleaning process ---")

    # Bail out (rather than raise) if the scraper has not run today.
    if not os.path.exists(raw_file_path):
        print(f"Error: Raw data file not found at '{raw_file_path}'")
        print("Please run the scraping script first.")
        return

    print(f"Reading raw data from: {raw_file_path}")
    with open(raw_file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Keep only non-empty lines that meet the minimum word count.
    cleaned_lines = [
        text
        for text in (line.strip() for line in lines)
        if text and len(text.split()) >= min_words_per_line
    ]

    print(f"Saving cleaned data to: {processed_file_path}")
    os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)
    with open(processed_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(cleaned_lines))

    # Summary report.
    print("\n--- Cleaning Summary ---")
    print(f"Total lines read: {len(lines)}")
    print(f"Lines after cleaning: {len(cleaned_lines)}")
    print(f"Lines discarded: {len(lines) - len(cleaned_lines)}")
    print("------------------------")


if __name__ == "__main__":
    clean_data()
|
scripts/create_sinhala_test_set.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# scripts/create_sinhala_test_set.py
import os
from datasets import load_dataset

# --- Configuration ---
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
DATASET_NAME = "Programmer-RD-AI/sinhala-english-singlish-translation"
NUM_TEST_LINES = 500
# ---


def _dump_pairs(si_path, en_path, examples):
    """Write one Sinhala/English sentence pair per line into the two files."""
    with open(si_path, "w", encoding="utf-8") as f_si, \
         open(en_path, "w", encoding="utf-8") as f_en:
        for row in examples:
            f_si.write(row['Sinhala'] + "\n")
            f_en.write(row['English'] + "\n")


print("--- Creating a held-back test set for Sinhalese ---")
os.makedirs(TEST_DIR, exist_ok=True)

# Pull the full parallel corpus from the Hugging Face Hub.
dataset = load_dataset(DATASET_NAME, split='train')

# Hold back the final NUM_TEST_LINES rows as the test split.
cutoff = len(dataset) - NUM_TEST_LINES
train_dataset = dataset.select(range(cutoff))
test_dataset = dataset.select(range(cutoff, len(dataset)))

# Training files (overwrites any previous versions).
_dump_pairs(os.path.join(DATA_DIR, "sinhala.si"),
            os.path.join(DATA_DIR, "sinhala.en"),
            train_dataset)

# Held-back test files.
_dump_pairs(os.path.join(TEST_DIR, "test.si"),
            os.path.join(TEST_DIR, "test.en"),
            test_dataset)

print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Sinhalese.")
print(f"The original training files in '{DATA_DIR}' have been updated.")
|
scripts/create_test_set.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# scripts/create_test_set.py
import os

# --- Configuration ---
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
SOURCE_FILE = os.path.join(DATA_DIR, "nepali.ne")
TARGET_FILE = os.path.join(DATA_DIR, "nepali.en")
NUM_TEST_LINES = 500
# ---

print("--- Creating a held-back test set for Nepali ---")
os.makedirs(TEST_DIR, exist_ok=True)

# Read the parallel corpus (one sentence per line, aligned by line number).
with open(SOURCE_FILE, "r", encoding="utf-8") as f:
    source_lines = f.readlines()
with open(TARGET_FILE, "r", encoding="utf-8") as f:
    target_lines = f.readlines()

# A length mismatch means the files are no longer aligned; refuse to
# continue. (An explicit raise, unlike `assert`, survives `python -O`.)
if len(source_lines) != len(target_lines):
    raise ValueError("Source and target files have different lengths!")
# With <= NUM_TEST_LINES lines the negative-slice split below would leave
# an empty (or wrong) training set, so fail loudly instead.
if len(source_lines) <= NUM_TEST_LINES:
    raise ValueError(
        f"Corpus has only {len(source_lines)} lines; need more than "
        f"{NUM_TEST_LINES} to hold back a test set."
    )

# Split: everything except the final NUM_TEST_LINES stays in training.
train_source_lines = source_lines[:-NUM_TEST_LINES]
test_source_lines = source_lines[-NUM_TEST_LINES:]

train_target_lines = target_lines[:-NUM_TEST_LINES]
test_target_lines = target_lines[-NUM_TEST_LINES:]

# Overwrite the training files with the reduced split.
with open(SOURCE_FILE, "w", encoding="utf-8") as f:
    f.writelines(train_source_lines)
with open(TARGET_FILE, "w", encoding="utf-8") as f:
    f.writelines(train_target_lines)

# Write the held-back test files.
with open(os.path.join(TEST_DIR, "test.ne"), "w", encoding="utf-8") as f:
    f.writelines(test_source_lines)
with open(os.path.join(TEST_DIR, "test.en"), "w", encoding="utf-8") as f:
    f.writelines(test_target_lines)

print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Nepali.")
print(f"The original training files in '{DATA_DIR}' have been updated.")
|
scripts/download_model.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

import os

def download_model():
    """Download the base NLLB model from the Hugging Face Hub into models/.

    Note: the original script referred to 'nllb-finetuned-nepali-en', which
    is not a public model; the base 'facebook/nllb-200-distilled-600M'
    checkpoint is downloaded instead. Fine-tune it on your own dataset to
    get the desired performance.

    Returns:
        None. Errors are reported to stdout rather than raised.
    """
    # Imported lazily so this module can be imported (e.g. by tooling)
    # even when huggingface_hub is not installed.
    from huggingface_hub import snapshot_download

    # --- Configuration ---
    model_name = "facebook/nllb-200-distilled-600M"

    # --- Path setup ---
    # Resolve the target directory relative to this script's location so
    # the download lands in <repo>/models/nllb-finetuned-nepali-en
    # regardless of the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    target_dir = os.path.abspath(os.path.join(script_dir, '..', 'models', 'nllb-finetuned-nepali-en'))

    print(f"Downloading model: {model_name}")
    print(f"Saving to: {target_dir}")

    # --- Download ---
    try:
        # exist_ok avoids a crash (or a check-then-create race) when the
        # directory already exists from a previous run.
        os.makedirs(target_dir, exist_ok=True)

        # local_dir_use_symlinks is deprecated and ignored by recent
        # huggingface_hub releases; real files are written to local_dir.
        snapshot_download(repo_id=model_name, local_dir=target_dir)
        print("Model downloaded successfully.")

    except Exception as e:
        # Script-level boundary: report the failure instead of a traceback.
        print(f"An error occurred during download: {e}")

if __name__ == "__main__":
    download_model()
|
scripts/fetch_parallel_data.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# scripts/fetch_parallel_data.py

from datasets import load_dataset
import os


# This corpus exposes flat 'Sinhala'/'English' columns instead of the usual
# nested 'translation' dict, so it needs schema-specific handling below.
_SINHALA_DATASET = "Programmer-RD-AI/sinhala-english-singlish-translation"


def fetch_and_save_parallel_data(lang_pair, dataset_name, output_name):
    """Download a parallel dataset and save it as two aligned text files
    (one per language, one sentence per line).

    Args:
        lang_pair (str): Language pair, e.g., "en-ne" for English-Nepali.
        dataset_name (str): The name of the dataset on Hugging Face Hub.
        output_name (str): The base name to use for the output files.

    Returns:
        None. Failures are reported to stdout rather than raised, so a
        caller can attempt multiple datasets in sequence.
    """
    source_lang, target_lang = lang_pair.split("-")
    output_dir = "data/processed"
    os.makedirs(output_dir, exist_ok=True)

    source_filepath = os.path.join(output_dir, f"{output_name}.{source_lang}")
    target_filepath = os.path.join(output_dir, f"{output_name}.{target_lang}")

    print(f"--- Starting download for {lang_pair} from {dataset_name} ---")

    try:
        is_sinhala_corpus = dataset_name == _SINHALA_DATASET

        # The Sinhala corpus has no per-pair config; others are loaded by
        # language-pair configuration name.
        if is_sinhala_corpus:
            dataset = load_dataset(dataset_name, split='train')
        else:
            dataset = load_dataset(dataset_name, lang_pair, split='train')
        print(f"Dataset loaded successfully. Total pairs: {len(dataset)}")

        # Decide the per-example extraction schema once, outside the loop,
        # instead of re-comparing the dataset name for every example.
        if is_sinhala_corpus:
            def _extract(example):
                return example['Sinhala'], example['English']
        else:
            def _extract(example):
                return (example['translation'][source_lang],
                        example['translation'][target_lang])

        print(f"Processing and saving files...")
        with open(source_filepath, "w", encoding="utf-8") as f_source, \
             open(target_filepath, "w", encoding="utf-8") as f_target:

            for example in dataset:
                source_sentence, target_sentence = _extract(example)

                # Skip pairs where either side is empty/missing.
                if source_sentence and target_sentence:
                    f_source.write(source_sentence.strip() + "\n")
                    f_target.write(target_sentence.strip() + "\n")

        print(f"Successfully saved data for {lang_pair}")
    except Exception as e:
        # Best-effort boundary: report and let the caller move on.
        print(f"An error occurred for {lang_pair}: {e}")
if __name__ == "__main__":
    # Nepali: standard OPUS-100 parallel corpus.
    print("Fetching Nepali data...")
    fetch_and_save_parallel_data(
        lang_pair="en-ne",
        dataset_name="Helsinki-NLP/opus-100",
        output_name="nepali",
    )

    # Sinhalese: community sinhala/english/singlish corpus.
    print("\nFetching Sinhalese data...")
    fetch_and_save_parallel_data(
        lang_pair="si-en",
        dataset_name="Programmer-RD-AI/sinhala-english-singlish-translation",
        output_name="sinhala",
    )

    # Sinhalese idioms: each record packs both sentences into a single
    # 'text' field separated by a newline, so it is parsed here and
    # appended to the main Sinhala training files.
    print("\nFetching Sinhalese idioms data...")
    output_dir = "data/processed"
    try:
        idioms_dataset = load_dataset(
            "Venuraa/English-Sinhala-Idioms-Parallel-Translations", split='train'
        )
        print(f"Idioms dataset loaded successfully. Total pairs: {len(idioms_dataset)}")

        si_path = os.path.join(output_dir, "sinhala.si")
        en_path = os.path.join(output_dir, "sinhala.en")
        with open(si_path, "a", encoding="utf-8") as f_source, \
             open(en_path, "a", encoding="utf-8") as f_target:
            for example in idioms_dataset:
                pieces = example['text'].split('\n')
                if len(pieces) != 2:
                    continue  # skip malformed rows
                f_target.write(pieces[0] + "\n")
                f_source.write(pieces[1] + "\n")
        print("Successfully appended idioms data.")
    except Exception as e:
        print(f"An error occurred while fetching idioms data: {e}")
    print("\nAll data fetching complete.")
|
scripts/scrape_bbc_nepali.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# scripts/scrape_bbc_nepali.py

import requests
from bs4 import BeautifulSoup
import datetime
import os

# Seconds to wait for any single HTTP response. Without a timeout,
# requests.get() can hang indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

def scrape_bbc_nepali():
    """Scrape news articles linked from the BBC Nepali homepage and save
    their text to a date-stamped file under data/raw.

    Failures on individual articles are reported and skipped; a failure to
    fetch the homepage itself aborts the run with a message.
    """
    # The base URL for BBC Nepali news.
    BASE_URL = "https://www.bbc.com"
    START_URL = f"{BASE_URL}/nepali"

    # Date-stamped filename so repeated runs don't clobber older scrapes
    # (and so the cleaning script can find today's file by convention).
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    output_filename = f"bbc_nepali_articles_{current_date}.txt"

    # Ensure the output directory exists.
    output_dir = "data/raw"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)

    print(f"Starting scrape of {START_URL}")
    print(f"Saving data to: {output_path}")

    try:
        # 1. Fetch the main homepage.
        main_page = requests.get(START_URL, timeout=REQUEST_TIMEOUT)
        main_page.raise_for_status()  # 4xx/5xx -> exception

        main_soup = BeautifulSoup(main_page.content, "html.parser")

        # 2. Collect hrefs that match the BBC article URL pattern. A set
        # removes duplicates (the homepage links stories more than once).
        article_links = set()
        for a_tag in main_soup.find_all("a", href=True):
            href = a_tag['href']
            if href.startswith("/nepali/articles/"):
                article_links.add(f"{BASE_URL}{href}")

        print(f"Found {len(article_links)} unique article links.")

        # 3. Visit each article and extract its text.
        all_article_text = []
        for i, link in enumerate(article_links):
            try:
                print(f"  Scraping ({i+1}/{len(article_links)}): {link}")
                article_page = requests.get(link, timeout=REQUEST_TIMEOUT)
                article_page.raise_for_status()

                article_soup = BeautifulSoup(article_page.content, "html.parser")

                # <p> tags usually hold the article body text.
                paragraphs = article_soup.find_all("p")
                all_article_text.append("\n".join(p.get_text() for p in paragraphs))

            except requests.exceptions.RequestException as e:
                print(f"  Could not fetch article {link}: {e}")
            except Exception as e:
                print(f"  An error occurred while processing {link}: {e}")

        # 4. Save everything, articles separated by a clear delimiter.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n\n--- NEW ARTICLE ---\n\n".join(all_article_text))

        print(f"\nScraping complete. All text saved to {output_path}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch the main page {START_URL}: {e}")

if __name__ == "__main__":
    scrape_bbc_nepali()
|