|
|
import os |
|
|
import re |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
# Characters that survive cleaning: ASCII digits, the Devanagari block
# (U+0900-U+097F), basic punctuation, em dash, hyphen, rupee sign and
# whitespace. The hyphen is escaped so it is a literal; unescaped between the
# em dash and the rupee sign it would form the range U+2014..U+20B9, which
# drops the hyphen and lets through curly quotes, bullets, bidi controls, etc.
_DISALLOWED_CHARS = re.compile(
    r'[^0-9\u0900-\u097F.,!?;:()\'"\u2014\u20B9\s\-]'
)
# A maximal run of Devanagari code points (letters, signs, digits, dandas).
_DEVANAGARI_RUN = re.compile(r"([\u0900-\u097F]+)")
# Danda (U+0964) and double danda (U+0965) sentence terminators.
_DANDA = re.compile(r"([\u0964\u0965])")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_devanagari_text(text):
    """
    Clean Devanagari text by stripping unwanted characters and normalising spacing.

    Processing steps, in order:
      1. Remove every character outside the whitelist (ASCII digits,
         Devanagari block, ``. , ! ? ; : ( ) ' "``, em dash, hyphen, rupee
         sign, whitespace). This drops Latin letters as well as invisible
         formatting characters such as U+202A, U+202C and U+2061.
      2. Surround each run of Devanagari characters with spaces so that
         adjacent punctuation and ASCII digits become separate tokens.
      3. Surround danda / double danda with spaces so they are separated
         from the word they follow.
      4. Collapse all whitespace runs (including newlines) into single
         spaces and trim both ends.

    Args:
        text (str): The input text containing Devanagari script.

    Returns:
        str: The cleaned Devanagari text.
    """
    text = _DISALLOWED_CHARS.sub("", text)
    text = _DEVANAGARI_RUN.sub(r" \1 ", text)
    # Dandas are inside the Devanagari block, so after step 2 they are still
    # glued to the preceding word; split them off here.
    text = _DANDA.sub(r" \1 ", text)
    return _WHITESPACE_RUN.sub(" ", text).strip()
|
|
|
|
|
|
|
|
def read_txt_files(folder_path="", clean_text=True, data_percentage_limit=1.0):
    """
    Read .txt files from a folder and return their contents as a list.

    Only entries whose name ends with ``.txt`` are considered. They are
    processed in sorted name order so that a partial read
    (``data_percentage_limit < 1.0``) selects the same files on every run,
    independent of the arbitrary order returned by ``os.listdir``.

    Args:
        folder_path (str): The path to the folder containing .txt files.
        clean_text (bool): If True, each file's content is passed through
            ``clean_devanagari_text`` before being stored; otherwise the raw
            content is stored.
        data_percentage_limit (float): Fraction of the .txt files to read.
            The number of files is ``int(count * data_percentage_limit)``,
            clamped to the range ``[0, count]``.

    Returns:
        list: A list containing the (optionally cleaned) content of each
        .txt file that was read.
    """
    # Filter before applying the limit so that non-.txt entries do not eat
    # into the quota of files to read.
    txt_filenames = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )

    num_files_to_read = int(len(txt_filenames) * data_percentage_limit)
    # Clamp: a negative value would otherwise slice from the end of the list.
    num_files_to_read = max(0, min(num_files_to_read, len(txt_filenames)))

    content_list = []
    for filename in tqdm(txt_filenames[:num_files_to_read]):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
        content_list.append(clean_devanagari_text(content) if clean_text else content)

    return content_list
|
|
|