Dyno1307 committed on
Commit
fd2f49a
·
verified ·
1 Parent(s): 03b2ad1

Upload 6 files

Browse files
scripts/clean_text_data.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/clean_text_data.py

import os
import datetime


def clean_data(raw_file_path=None, processed_file_path=None, min_words_per_line=5):
    """
    Read a raw text file, drop empty/short lines, and save the cleaned result.

    Args:
        raw_file_path (str | None): Path to the raw input file. Defaults to
            ``data/raw/bbc_nepali_articles_<today>.txt`` — the filename the
            scraping script produces for today's date.
        processed_file_path (str | None): Path for the cleaned output file.
            Defaults to ``data/processed/bbc_nepali_articles_<today>_cleaned.txt``.
        min_words_per_line (int): Lines with fewer words than this are discarded.

    Returns:
        None. Prints a summary report; returns early if the input is missing.
    """
    if raw_file_path is None or processed_file_path is None:
        # Construct default filenames from today's date, matching the
        # scraper's output naming scheme.
        current_date = datetime.datetime.now().strftime("%Y-%m-%d")
        if raw_file_path is None:
            raw_file_path = os.path.join(
                "data", "raw", f"bbc_nepali_articles_{current_date}.txt"
            )
        if processed_file_path is None:
            processed_file_path = os.path.join(
                "data", "processed", f"bbc_nepali_articles_{current_date}_cleaned.txt"
            )

    print("--- Starting data cleaning process ---")

    # Fail gracefully when the scraper has not produced the input file yet.
    if not os.path.exists(raw_file_path):
        print(f"Error: Raw data file not found at '{raw_file_path}'")
        print("Please run the scraping script first.")
        return

    print(f"Reading raw data from: {raw_file_path}")

    with open(raw_file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Keep only non-empty lines with at least `min_words_per_line` words.
    cleaned_lines = [
        text
        for text in (line.strip() for line in lines)
        if text and len(text.split()) >= min_words_per_line
    ]

    print(f"Saving cleaned data to: {processed_file_path}")
    # Guard: dirname is empty when a bare filename is given; makedirs("")
    # would raise, so only create directories when there is one to create.
    out_dir = os.path.dirname(processed_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(processed_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(cleaned_lines))

    # Summary report.
    print("\n--- Cleaning Summary ---")
    print(f"Total lines read: {len(lines)}")
    print(f"Lines after cleaning: {len(cleaned_lines)}")
    print(f"Lines discarded: {len(lines) - len(cleaned_lines)}")
    print("------------------------")


if __name__ == "__main__":
    clean_data()
scripts/create_sinhala_test_set.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/create_sinhala_test_set.py
import os
from datasets import load_dataset

# --- Configuration ---
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
DATASET_NAME = "Programmer-RD-AI/sinhala-english-singlish-translation"
NUM_TEST_LINES = 500
# ---


def _write_pairs(examples, source_path, target_path):
    """Write the 'Sinhala'/'English' columns of `examples` to parallel text files."""
    with open(source_path, "w", encoding="utf-8") as f_source, \
         open(target_path, "w", encoding="utf-8") as f_target:
        for example in examples:
            f_source.write(example['Sinhala'] + "\n")
            f_target.write(example['English'] + "\n")


def main():
    """
    Hold back the last NUM_TEST_LINES examples of the Sinhala dataset as a
    test set; overwrite the training files with the remaining examples.
    """
    print("--- Creating a held-back test set for Sinhalese ---")
    os.makedirs(TEST_DIR, exist_ok=True)
    # Also ensure the training-data directory exists before writing into it.
    os.makedirs(DATA_DIR, exist_ok=True)

    # Load the dataset from Hugging Face.
    dataset = load_dataset(DATASET_NAME, split='train')

    # Split: everything before the cut is training, the tail is the test set.
    split_point = len(dataset) - NUM_TEST_LINES
    train_dataset = dataset.select(range(split_point))
    test_dataset = dataset.select(range(split_point, len(dataset)))

    # Overwrite the training files with the reduced training split.
    _write_pairs(train_dataset,
                 os.path.join(DATA_DIR, "sinhala.si"),
                 os.path.join(DATA_DIR, "sinhala.en"))

    # Write the held-back test files.
    _write_pairs(test_dataset,
                 os.path.join(TEST_DIR, "test.si"),
                 os.path.join(TEST_DIR, "test.en"))

    print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Sinhalese.")
    print(f"The original training files in '{DATA_DIR}' have been updated.")


if __name__ == "__main__":
    main()
scripts/create_test_set.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/create_test_set.py
import os


def create_test_set(data_dir="data/processed", test_dir="data/test_sets",
                    num_test_lines=500):
    """
    Hold back the last `num_test_lines` sentence pairs as a Nepali test set.

    Reads the parallel files ``nepali.ne`` / ``nepali.en`` from `data_dir`,
    overwrites them with the remaining training lines, and writes the
    held-back tail to ``test.ne`` / ``test.en`` in `test_dir`.

    Args:
        data_dir (str): Directory holding the parallel training files.
        test_dir (str): Directory the test files are written into.
        num_test_lines (int): Number of line pairs to hold back.

    Raises:
        ValueError: If the source and target files have different line counts.
    """
    source_file = os.path.join(data_dir, "nepali.ne")
    target_file = os.path.join(data_dir, "nepali.en")

    print("--- Creating a held-back test set for Nepali ---")
    os.makedirs(test_dir, exist_ok=True)

    # Read all lines from the original files.
    with open(source_file, "r", encoding="utf-8") as f:
        source_lines = f.readlines()
    with open(target_file, "r", encoding="utf-8") as f:
        target_lines = f.readlines()

    # A real exception, not `assert`: asserts are stripped under `python -O`,
    # and a misaligned corpus must never be split silently.
    if len(source_lines) != len(target_lines):
        raise ValueError("Source and target files have different lengths!")

    # Split off the tail as the test set.
    train_source_lines = source_lines[:-num_test_lines]
    test_source_lines = source_lines[-num_test_lines:]
    train_target_lines = target_lines[:-num_test_lines]
    test_target_lines = target_lines[-num_test_lines:]

    # Write the new, smaller training files (overwriting the old ones).
    with open(source_file, "w", encoding="utf-8") as f:
        f.writelines(train_source_lines)
    with open(target_file, "w", encoding="utf-8") as f:
        f.writelines(train_target_lines)

    # Write the new test files.
    with open(os.path.join(test_dir, "test.ne"), "w", encoding="utf-8") as f:
        f.writelines(test_source_lines)
    with open(os.path.join(test_dir, "test.en"), "w", encoding="utf-8") as f:
        f.writelines(test_target_lines)

    print(f"Successfully created a test set with {num_test_lines} lines for Nepali.")
    print(f"The original training files in '{data_dir}' have been updated.")


if __name__ == "__main__":
    create_test_set()
scripts/download_model.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

import os
from huggingface_hub import snapshot_download


def download_model():
    """
    Download the NLLB base model from the Hugging Face Hub into the
    project's ``models/nllb-finetuned-nepali-en`` directory.

    Prints progress/errors; never raises (download failures are reported,
    not propagated).
    """
    # --- Configuration ---
    # Note: The original script referred to 'nllb-finetuned-nepali-en', which
    # is not a public model. We download the base model
    # 'facebook/nllb-200-distilled-600M' instead. You may need to fine-tune
    # this model on your own dataset to get the desired performance.
    model_name = "facebook/nllb-200-distilled-600M"

    # --- Path setup ---
    # Resolve the target directory relative to this script's location so the
    # script works regardless of the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    target_dir = os.path.abspath(os.path.join(script_dir, '..', 'models', 'nllb-finetuned-nepali-en'))

    print(f"Downloading model: {model_name}")
    print(f"Saving to: {target_dir}")

    # --- Download ---
    try:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(target_dir, exist_ok=True)

        # The `local_dir_use_symlinks` argument is deprecated and ignored by
        # current huggingface_hub releases; snapshot_download now always
        # materializes real files into `local_dir`.
        snapshot_download(repo_id=model_name, local_dir=target_dir)
        print("Model downloaded successfully.")

    except Exception as e:
        # Broad catch is deliberate for this best-effort CLI script: report a
        # readable message instead of a traceback.
        print(f"An error occurred during download: {e}")


if __name__ == "__main__":
    download_model()
scripts/fetch_parallel_data.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/fetch_parallel_data.py

from datasets import load_dataset
import os

# Datasets whose rows use flat, named columns instead of the standard
# {"translation": {lang: text}} layout, mapped to (source, target) columns.
_FLAT_COLUMN_DATASETS = {
    "Programmer-RD-AI/sinhala-english-singlish-translation": ("Sinhala", "English"),
}


def fetch_and_save_parallel_data(lang_pair, dataset_name, output_name,
                                 source_key=None, target_key=None):
    """
    Download a parallel dataset and save it into two separate text files
    (one per language), one sentence per line.

    Args:
        lang_pair (str): Language pair, e.g. "en-ne" for English-Nepali.
            The first code names the source-file extension, the second the
            target-file extension.
        dataset_name (str): The name of the dataset on Hugging Face Hub.
        output_name (str): The base name to use for the output files.
        source_key (str | None): Column holding the source sentence, for
            datasets with flat columns. Defaults to the known mapping in
            `_FLAT_COLUMN_DATASETS`; when unset, the nested
            ``example['translation'][lang]`` layout is used.
        target_key (str | None): Column holding the target sentence
            (see `source_key`).
    """
    source_lang, target_lang = lang_pair.split("-")
    output_dir = "data/processed"
    os.makedirs(output_dir, exist_ok=True)

    source_filepath = os.path.join(output_dir, f"{output_name}.{source_lang}")
    target_filepath = os.path.join(output_dir, f"{output_name}.{target_lang}")

    # Resolve the column layout once, outside the per-example loop.
    if source_key is None and target_key is None and dataset_name in _FLAT_COLUMN_DATASETS:
        source_key, target_key = _FLAT_COLUMN_DATASETS[dataset_name]

    print(f"--- Starting download for {lang_pair} from {dataset_name} ---")

    try:
        # Flat-column datasets are loaded without a configuration name; the
        # standard layout needs `lang_pair` as the dataset configuration.
        if source_key and target_key:
            dataset = load_dataset(dataset_name, split='train')
        else:
            dataset = load_dataset(dataset_name, lang_pair, split='train')
        print(f"Dataset loaded successfully. Total pairs: {len(dataset)}")

        print("Processing and saving files...")
        with open(source_filepath, "w", encoding="utf-8") as f_source, \
             open(target_filepath, "w", encoding="utf-8") as f_target:

            for example in dataset:
                if source_key and target_key:
                    source_sentence = example[source_key]
                    target_sentence = example[target_key]
                else:
                    source_sentence = example['translation'][source_lang]
                    target_sentence = example['translation'][target_lang]

                # Skip pairs where either side is empty/missing.
                if source_sentence and target_sentence:
                    f_source.write(source_sentence.strip() + "\n")
                    f_target.write(target_sentence.strip() + "\n")

        print(f"Successfully saved data for {lang_pair}")
    except Exception as e:
        # Best-effort CLI script: report and let the caller move on to the
        # next dataset instead of aborting the whole run.
        print(f"An error occurred for {lang_pair}: {e}")


if __name__ == "__main__":
    # --- Fetch Nepali Data ---
    print("Fetching Nepali data...")
    fetch_and_save_parallel_data(lang_pair="en-ne", dataset_name="Helsinki-NLP/opus-100", output_name="nepali")

    # --- Fetch Sinhalese Data ---
    print("\nFetching Sinhalese data...")
    fetch_and_save_parallel_data(lang_pair="si-en", dataset_name="Programmer-RD-AI/sinhala-english-singlish-translation", output_name="sinhala")

    # --- Fetch Sinhalese Idioms Data ---
    print("\nFetching Sinhalese idioms data...")
    output_dir = "data/processed"
    try:
        idioms_dataset = load_dataset("Venuraa/English-Sinhala-Idioms-Parallel-Translations", split='train')
        print(f"Idioms dataset loaded successfully. Total pairs: {len(idioms_dataset)}")

        # Each usable row's 'text' field splits into exactly two lines; the
        # first is appended to the .en file and the second to the .si file.
        # Append mode so the idioms extend the files written above.
        with open(os.path.join(output_dir, "sinhala.si"), "a", encoding="utf-8") as f_source, \
             open(os.path.join(output_dir, "sinhala.en"), "a", encoding="utf-8") as f_target:
            for example in idioms_dataset:
                parts = example['text'].split('\n')
                if len(parts) == 2:
                    f_target.write(parts[0] + "\n")
                    f_source.write(parts[1] + "\n")
        print("Successfully appended idioms data.")
    except Exception as e:
        print(f"An error occurred while fetching idioms data: {e}")
    print("\nAll data fetching complete.")
scripts/scrape_bbc_nepali.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/scrape_bbc_nepali.py

import requests
from bs4 import BeautifulSoup
import datetime
import os

# Seconds to wait for any single HTTP request before giving up. Without a
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def scrape_bbc_nepali():
    """
    Scrape news articles from the BBC Nepali homepage and save them to a file.

    Writes all collected article text to
    ``data/raw/bbc_nepali_articles_<today>.txt``, with articles separated by
    a ``--- NEW ARTICLE ---`` delimiter. Per-article fetch errors are logged
    and skipped; only a failure to load the homepage aborts the scrape.
    """
    # The base URL for BBC Nepali news.
    BASE_URL = "https://www.bbc.com"
    START_URL = f"{BASE_URL}/nepali"

    # Date-stamped filename so repeated runs don't clobber older scrapes.
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    output_filename = f"bbc_nepali_articles_{current_date}.txt"

    # Ensure the output directory exists.
    output_dir = "data/raw"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)

    print(f"Starting scrape of {START_URL}")
    print(f"Saving data to: {output_path}")

    try:
        # 1. Fetch the main homepage (bounded wait via timeout).
        main_page = requests.get(START_URL, timeout=REQUEST_TIMEOUT)
        main_page.raise_for_status()  # Raise for 4xx/5xx status codes.

        main_soup = BeautifulSoup(main_page.content, "html.parser")

        # 2. Collect hrefs that match the pattern of internal BBC articles.
        article_links = set()  # Set to de-duplicate repeated links.
        for a_tag in main_soup.find_all("a", href=True):
            href = a_tag['href']
            if href.startswith("/nepali/articles/"):
                article_links.add(f"{BASE_URL}{href}")

        print(f"Found {len(article_links)} unique article links.")

        # 3. Visit each article and extract its text.
        all_article_text = []
        for i, link in enumerate(article_links):
            try:
                print(f"  Scraping ({i+1}/{len(article_links)}): {link}")
                article_page = requests.get(link, timeout=REQUEST_TIMEOUT)
                article_page.raise_for_status()

                article_soup = BeautifulSoup(article_page.content, "html.parser")

                # <p> tags usually hold the article body text.
                paragraphs = article_soup.find_all("p")
                all_article_text.append("\n".join(p.get_text() for p in paragraphs))

            except requests.exceptions.RequestException as e:
                # Network/HTTP problem with this one article: log and move on.
                print(f"  Could not fetch article {link}: {e}")
            except Exception as e:
                # Parsing problems shouldn't kill the whole scrape either.
                print(f"  An error occurred while processing {link}: {e}")

        # 4. Save the collected text, delimiting articles clearly.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n\n--- NEW ARTICLE ---\n\n".join(all_article_text))

        print(f"\nScraping complete. All text saved to {output_path}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch the main page {START_URL}: {e}")


if __name__ == "__main__":
    scrape_bbc_nepali()