Dyno1307 committed on
Commit
fd2f49a
·
verified ·
1 Parent(s): 03b2ad1

Upload 6 files

Browse files
scripts/clean_text_data.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/clean_text_data.py

import os
import datetime


def clean_data(raw_file_path=None, processed_file_path=None, min_words_per_line=5):
    """
    Read a raw text file, drop empty/short lines, and save the cleaned result.

    Args:
        raw_file_path (str | None): Path to the raw input file. Defaults to
            ``data/raw/bbc_nepali_articles_<today>.txt`` — the filename the
            scraping script produces for today's date.
        processed_file_path (str | None): Path for the cleaned output file.
            Defaults to ``data/processed/bbc_nepali_articles_<today>_cleaned.txt``.
        min_words_per_line (int): Lines with fewer words than this are discarded.

    Returns:
        None. Prints a summary report; returns early if the input is missing.
    """
    if raw_file_path is None or processed_file_path is None:
        # Construct default filenames from today's date, matching the
        # scraper's output naming scheme.
        current_date = datetime.datetime.now().strftime("%Y-%m-%d")
        if raw_file_path is None:
            raw_file_path = os.path.join(
                "data", "raw", f"bbc_nepali_articles_{current_date}.txt"
            )
        if processed_file_path is None:
            processed_file_path = os.path.join(
                "data", "processed", f"bbc_nepali_articles_{current_date}_cleaned.txt"
            )

    print("--- Starting data cleaning process ---")

    # Fail gracefully when the scraper has not produced the input file yet.
    if not os.path.exists(raw_file_path):
        print(f"Error: Raw data file not found at '{raw_file_path}'")
        print("Please run the scraping script first.")
        return

    print(f"Reading raw data from: {raw_file_path}")

    with open(raw_file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Keep only non-empty lines with at least `min_words_per_line` words.
    cleaned_lines = [
        text
        for text in (line.strip() for line in lines)
        if text and len(text.split()) >= min_words_per_line
    ]

    print(f"Saving cleaned data to: {processed_file_path}")
    # Guard: dirname is empty when a bare filename is given; makedirs("")
    # would raise, so only create directories when there is one to create.
    out_dir = os.path.dirname(processed_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(processed_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(cleaned_lines))

    # Summary report.
    print("\n--- Cleaning Summary ---")
    print(f"Total lines read: {len(lines)}")
    print(f"Lines after cleaning: {len(cleaned_lines)}")
    print(f"Lines discarded: {len(lines) - len(cleaned_lines)}")
    print("------------------------")


if __name__ == "__main__":
    clean_data()
scripts/create_sinhala_test_set.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/create_sinhala_test_set.py
import os
from datasets import load_dataset

# --- Configuration ---
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
DATASET_NAME = "Programmer-RD-AI/sinhala-english-singlish-translation"
NUM_TEST_LINES = 500
# ---


def _write_pairs(examples, source_path, target_path):
    """Write the 'Sinhala'/'English' columns of `examples` to parallel text files."""
    with open(source_path, "w", encoding="utf-8") as f_source, \
         open(target_path, "w", encoding="utf-8") as f_target:
        for example in examples:
            f_source.write(example['Sinhala'] + "\n")
            f_target.write(example['English'] + "\n")


def main():
    """
    Hold back the last NUM_TEST_LINES examples of the Sinhala dataset as a
    test set; overwrite the training files with the remaining examples.
    """
    print("--- Creating a held-back test set for Sinhalese ---")
    os.makedirs(TEST_DIR, exist_ok=True)
    # Also ensure the training-data directory exists before writing into it.
    os.makedirs(DATA_DIR, exist_ok=True)

    # Load the dataset from Hugging Face.
    dataset = load_dataset(DATASET_NAME, split='train')

    # Split: everything before the cut is training, the tail is the test set.
    split_point = len(dataset) - NUM_TEST_LINES
    train_dataset = dataset.select(range(split_point))
    test_dataset = dataset.select(range(split_point, len(dataset)))

    # Overwrite the training files with the reduced training split.
    _write_pairs(train_dataset,
                 os.path.join(DATA_DIR, "sinhala.si"),
                 os.path.join(DATA_DIR, "sinhala.en"))

    # Write the held-back test files.
    _write_pairs(test_dataset,
                 os.path.join(TEST_DIR, "test.si"),
                 os.path.join(TEST_DIR, "test.en"))

    print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Sinhalese.")
    print(f"The original training files in '{DATA_DIR}' have been updated.")


if __name__ == "__main__":
    main()
scripts/create_test_set.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/create_test_set.py
import os


def create_test_set(data_dir="data/processed", test_dir="data/test_sets",
                    num_test_lines=500):
    """
    Hold back the last `num_test_lines` sentence pairs as a Nepali test set.

    Reads the parallel files ``nepali.ne`` / ``nepali.en`` from `data_dir`,
    overwrites them with the remaining training lines, and writes the
    held-back tail to ``test.ne`` / ``test.en`` in `test_dir`.

    Args:
        data_dir (str): Directory holding the parallel training files.
        test_dir (str): Directory the test files are written into.
        num_test_lines (int): Number of line pairs to hold back.

    Raises:
        ValueError: If the source and target files have different line counts.
    """
    source_file = os.path.join(data_dir, "nepali.ne")
    target_file = os.path.join(data_dir, "nepali.en")

    print("--- Creating a held-back test set for Nepali ---")
    os.makedirs(test_dir, exist_ok=True)

    # Read all lines from the original files.
    with open(source_file, "r", encoding="utf-8") as f:
        source_lines = f.readlines()
    with open(target_file, "r", encoding="utf-8") as f:
        target_lines = f.readlines()

    # A real exception, not `assert`: asserts are stripped under `python -O`,
    # and a misaligned corpus must never be split silently.
    if len(source_lines) != len(target_lines):
        raise ValueError("Source and target files have different lengths!")

    # Split off the tail as the test set.
    train_source_lines = source_lines[:-num_test_lines]
    test_source_lines = source_lines[-num_test_lines:]
    train_target_lines = target_lines[:-num_test_lines]
    test_target_lines = target_lines[-num_test_lines:]

    # Write the new, smaller training files (overwriting the old ones).
    with open(source_file, "w", encoding="utf-8") as f:
        f.writelines(train_source_lines)
    with open(target_file, "w", encoding="utf-8") as f:
        f.writelines(train_target_lines)

    # Write the new test files.
    with open(os.path.join(test_dir, "test.ne"), "w", encoding="utf-8") as f:
        f.writelines(test_source_lines)
    with open(os.path.join(test_dir, "test.en"), "w", encoding="utf-8") as f:
        f.writelines(test_target_lines)

    print(f"Successfully created a test set with {num_test_lines} lines for Nepali.")
    print(f"The original training files in '{data_dir}' have been updated.")


if __name__ == "__main__":
    create_test_set()
scripts/download_model.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

import os
from huggingface_hub import snapshot_download


def download_model():
    """
    Download the NLLB base model from the Hugging Face Hub into the
    project's ``models/nllb-finetuned-nepali-en`` directory.

    Prints progress/errors; never raises (download failures are reported,
    not propagated).
    """
    # --- Configuration ---
    # Note: The original script referred to 'nllb-finetuned-nepali-en', which
    # is not a public model. We download the base model
    # 'facebook/nllb-200-distilled-600M' instead. You may need to fine-tune
    # this model on your own dataset to get the desired performance.
    model_name = "facebook/nllb-200-distilled-600M"

    # --- Path setup ---
    # Resolve the target directory relative to this script's location so the
    # script works regardless of the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    target_dir = os.path.abspath(os.path.join(script_dir, '..', 'models', 'nllb-finetuned-nepali-en'))

    print(f"Downloading model: {model_name}")
    print(f"Saving to: {target_dir}")

    # --- Download ---
    try:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(target_dir, exist_ok=True)

        # The `local_dir_use_symlinks` argument is deprecated and ignored by
        # current huggingface_hub releases; snapshot_download now always
        # materializes real files into `local_dir`.
        snapshot_download(repo_id=model_name, local_dir=target_dir)
        print("Model downloaded successfully.")

    except Exception as e:
        # Broad catch is deliberate for this best-effort CLI script: report a
        # readable message instead of a traceback.
        print(f"An error occurred during download: {e}")


if __name__ == "__main__":
    download_model()
scripts/fetch_parallel_data.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/fetch_parallel_data.py

from datasets import load_dataset
import os

# Datasets whose rows use flat, named columns instead of the standard
# {"translation": {lang: text}} layout, mapped to (source, target) columns.
_FLAT_COLUMN_DATASETS = {
    "Programmer-RD-AI/sinhala-english-singlish-translation": ("Sinhala", "English"),
}


def fetch_and_save_parallel_data(lang_pair, dataset_name, output_name,
                                 source_key=None, target_key=None):
    """
    Download a parallel dataset and save it into two separate text files
    (one per language), one sentence per line.

    Args:
        lang_pair (str): Language pair, e.g. "en-ne" for English-Nepali.
            The first code names the source-file extension, the second the
            target-file extension.
        dataset_name (str): The name of the dataset on Hugging Face Hub.
        output_name (str): The base name to use for the output files.
        source_key (str | None): Column holding the source sentence, for
            datasets with flat columns. Defaults to the known mapping in
            `_FLAT_COLUMN_DATASETS`; when unset, the nested
            ``example['translation'][lang]`` layout is used.
        target_key (str | None): Column holding the target sentence
            (see `source_key`).
    """
    source_lang, target_lang = lang_pair.split("-")
    output_dir = "data/processed"
    os.makedirs(output_dir, exist_ok=True)

    source_filepath = os.path.join(output_dir, f"{output_name}.{source_lang}")
    target_filepath = os.path.join(output_dir, f"{output_name}.{target_lang}")

    # Resolve the column layout once, outside the per-example loop.
    if source_key is None and target_key is None and dataset_name in _FLAT_COLUMN_DATASETS:
        source_key, target_key = _FLAT_COLUMN_DATASETS[dataset_name]

    print(f"--- Starting download for {lang_pair} from {dataset_name} ---")

    try:
        # Flat-column datasets are loaded without a configuration name; the
        # standard layout needs `lang_pair` as the dataset configuration.
        if source_key and target_key:
            dataset = load_dataset(dataset_name, split='train')
        else:
            dataset = load_dataset(dataset_name, lang_pair, split='train')
        print(f"Dataset loaded successfully. Total pairs: {len(dataset)}")

        print("Processing and saving files...")
        with open(source_filepath, "w", encoding="utf-8") as f_source, \
             open(target_filepath, "w", encoding="utf-8") as f_target:

            for example in dataset:
                if source_key and target_key:
                    source_sentence = example[source_key]
                    target_sentence = example[target_key]
                else:
                    source_sentence = example['translation'][source_lang]
                    target_sentence = example['translation'][target_lang]

                # Skip pairs where either side is empty/missing.
                if source_sentence and target_sentence:
                    f_source.write(source_sentence.strip() + "\n")
                    f_target.write(target_sentence.strip() + "\n")

        print(f"Successfully saved data for {lang_pair}")
    except Exception as e:
        # Best-effort CLI script: report and let the caller move on to the
        # next dataset instead of aborting the whole run.
        print(f"An error occurred for {lang_pair}: {e}")


if __name__ == "__main__":
    # --- Fetch Nepali Data ---
    print("Fetching Nepali data...")
    fetch_and_save_parallel_data(lang_pair="en-ne", dataset_name="Helsinki-NLP/opus-100", output_name="nepali")

    # --- Fetch Sinhalese Data ---
    print("\nFetching Sinhalese data...")
    fetch_and_save_parallel_data(lang_pair="si-en", dataset_name="Programmer-RD-AI/sinhala-english-singlish-translation", output_name="sinhala")

    # --- Fetch Sinhalese Idioms Data ---
    print("\nFetching Sinhalese idioms data...")
    output_dir = "data/processed"
    try:
        idioms_dataset = load_dataset("Venuraa/English-Sinhala-Idioms-Parallel-Translations", split='train')
        print(f"Idioms dataset loaded successfully. Total pairs: {len(idioms_dataset)}")

        # Each usable row's 'text' field splits into exactly two lines; the
        # first is appended to the .en file and the second to the .si file.
        # Append mode so the idioms extend the files written above.
        with open(os.path.join(output_dir, "sinhala.si"), "a", encoding="utf-8") as f_source, \
             open(os.path.join(output_dir, "sinhala.en"), "a", encoding="utf-8") as f_target:
            for example in idioms_dataset:
                parts = example['text'].split('\n')
                if len(parts) == 2:
                    f_target.write(parts[0] + "\n")
                    f_source.write(parts[1] + "\n")
        print("Successfully appended idioms data.")
    except Exception as e:
        print(f"An error occurred while fetching idioms data: {e}")
    print("\nAll data fetching complete.")
scripts/scrape_bbc_nepali.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scripts/scrape_bbc_nepali.py

import requests
from bs4 import BeautifulSoup
import datetime
import os

# Seconds to wait for any single HTTP request before giving up. Without a
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def scrape_bbc_nepali():
    """
    Scrape news articles from the BBC Nepali homepage and save them to a file.

    Writes all collected article text to
    ``data/raw/bbc_nepali_articles_<today>.txt``, with articles separated by
    a ``--- NEW ARTICLE ---`` delimiter. Per-article fetch errors are logged
    and skipped; only a failure to load the homepage aborts the scrape.
    """
    # The base URL for BBC Nepali news.
    BASE_URL = "https://www.bbc.com"
    START_URL = f"{BASE_URL}/nepali"

    # Date-stamped filename so repeated runs don't clobber older scrapes.
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    output_filename = f"bbc_nepali_articles_{current_date}.txt"

    # Ensure the output directory exists.
    output_dir = "data/raw"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)

    print(f"Starting scrape of {START_URL}")
    print(f"Saving data to: {output_path}")

    try:
        # 1. Fetch the main homepage (bounded wait via timeout).
        main_page = requests.get(START_URL, timeout=REQUEST_TIMEOUT)
        main_page.raise_for_status()  # Raise for 4xx/5xx status codes.

        main_soup = BeautifulSoup(main_page.content, "html.parser")

        # 2. Collect hrefs that match the pattern of internal BBC articles.
        article_links = set()  # Set to de-duplicate repeated links.
        for a_tag in main_soup.find_all("a", href=True):
            href = a_tag['href']
            if href.startswith("/nepali/articles/"):
                article_links.add(f"{BASE_URL}{href}")

        print(f"Found {len(article_links)} unique article links.")

        # 3. Visit each article and extract its text.
        all_article_text = []
        for i, link in enumerate(article_links):
            try:
                print(f"  Scraping ({i+1}/{len(article_links)}): {link}")
                article_page = requests.get(link, timeout=REQUEST_TIMEOUT)
                article_page.raise_for_status()

                article_soup = BeautifulSoup(article_page.content, "html.parser")

                # <p> tags usually hold the article body text.
                paragraphs = article_soup.find_all("p")
                all_article_text.append("\n".join(p.get_text() for p in paragraphs))

            except requests.exceptions.RequestException as e:
                # Network/HTTP problem with this one article: log and move on.
                print(f"  Could not fetch article {link}: {e}")
            except Exception as e:
                # Parsing problems shouldn't kill the whole scrape either.
                print(f"  An error occurred while processing {link}: {e}")

        # 4. Save the collected text, delimiting articles clearly.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n\n--- NEW ARTICLE ---\n\n".join(all_article_text))

        print(f"\nScraping complete. All text saved to {output_path}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch the main page {START_URL}: {e}")


if __name__ == "__main__":
    scrape_bbc_nepali()