"""Clean and preprocess a text file: lowercase it, strip punctuation,
drop English stopwords, and lemmatize every remaining word."""

import os
import re

import chardet
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

nltk.download('stopwords')
nltk.download('wordnet')

# Hoisted out of the per-word loop: the original constructed a new
# WordNetLemmatizer for every word and called stopwords.words('english')
# (which builds an O(n) list, then scans it linearly) once per word,
# making cleaning effectively quadratic. Build both once, up front.
_LEMMATIZER = WordNetLemmatizer()
_STOPWORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Return *text* lowercased, stripped of non-alphanumeric characters,
    stopword-filtered, and lemmatized.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return ' '.join(
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOPWORDS
    )


# Specify the input and output file paths.
input_file = 'wstg-v4.2_cleaned.txt'
# BUG FIX: the original literal 'data\wstg-v4.2_cleaned.txt' contains the
# invalid escape sequence '\w' (DeprecationWarning; SyntaxWarning on 3.12)
# and a backslash path that is wrong on POSIX. os.path.join is portable
# and yields the same path on Windows.
output_file = os.path.join('data', 'wstg-v4.2_cleaned.txt')

# Detect the file's encoding from raw bytes before decoding it as text.
with open(input_file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read())

with open(input_file, 'r', encoding=result['encoding']) as file:
    book_text = file.read()

# Informational only — nothing below actually runs on the GPU. The print
# is kept because it is part of the original script's observable output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# NOTE(review): cleaning happens in a single call, so this bar jumps
# 0 -> 100% in one step; kept only for parity with the original output.
with tqdm(total=len(book_text), desc='Text Cleaning Progress', unit='char') as pbar:
    cleaned_text = clean_text(book_text)
    pbar.update(len(book_text))

# Robustness: create the output directory if it does not exist, so the
# write below cannot fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("Text cleaning and preprocessing complete.")