Spaces:

Adityak204
/

BPE_Tokenizer_for_Devanagri

Sleeping

File size: 2,547 Bytes

4211f06

import os
import re
from tqdm import tqdm


def clean_devanagari_text(text):
    """
    Clean Devanagari text by dropping disallowed characters and normalizing spacing.

    Only the following characters are kept: ASCII digits, the Devanagari block
    (U+0900-U+097F, which includes the danda and double danda), the punctuation
    ``. , ! ? ; : ( ) ' " — -``, the rupee sign and whitespace. Everything else
    (English letters, other symbols, invisible formatting characters such as
    U+202A, U+202C and U+2061) is removed.

    Runs of Devanagari characters and each danda / double danda are then
    surrounded by spaces, and all whitespace runs (including newlines) are
    collapsed to a single space.

    Args:
        text (str): The input text containing Devanagari script.

    Returns:
        str: The cleaned text, with no leading or trailing whitespace.
    """
    # Remove every character outside the allowed set. The hyphen is escaped so
    # that it is a literal "-": left unescaped between the em dash and the rupee
    # sign it would define the range U+2014-U+20B9, which drops real hyphens
    # and lets unrelated symbols and bidi control characters through.
    disallowed = r'[^0-9\u0900-\u097F.,!?;:()\'"—\-₹\s]'
    text = re.sub(disallowed, "", text)

    # Separate each run of Devanagari characters from adjacent digits/punctuation
    text = re.sub(r"([\u0900-\u097F]+)", r" \1 ", text)

    # Make danda (।) and double danda (॥) stand alone as their own tokens
    text = re.sub(r"([\u0964\u0965])", r" \1 ", text)

    # Collapse whitespace runs (spaces, tabs, newlines) and trim both ends
    text = re.sub(r"\s+", " ", text).strip()

    return text


def read_txt_files(folder_path="", clean_text=True, data_percentage_limit=1.0):
    """
    Reads .txt files in the given folder and returns their content as a list.

    Args:
        folder_path (str): The path to the folder containing .txt files.
        clean_text (bool): If True, each file's content is passed through
            clean_devanagari_text before being stored; otherwise the raw
            content is stored.
        data_percentage_limit (float): Fraction of the folder's .txt files to
            read (1.0 reads all of them). The count is rounded down.

    Returns:
        list: A list containing the (optionally cleaned) content of each
        .txt file that was read, in sorted filename order.
    """
    # List the folder once and keep only .txt files, so that the limit is a
    # fraction of the .txt files rather than of every directory entry.
    # os.listdir returns entries in arbitrary order; sorting makes the chosen
    # subset reproducible across runs and platforms.
    txt_filenames = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )

    # Clamp at 0 so a negative limit reads nothing instead of slicing from the end
    num_files_to_read = max(0, int(len(txt_filenames) * data_percentage_limit))

    content_list = []
    for filename in tqdm(txt_filenames[:num_files_to_read]):
        file_path = os.path.join(folder_path, filename)

        # Read the whole file, then release the handle before cleaning
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        content_list.append(clean_devanagari_text(content) if clean_text else content)

    return content_list