|
|
import re |
|
|
import chardet |
|
|
from nltk.corpus import stopwords |
|
|
from nltk.stem import WordNetLemmatizer |
|
|
import nltk |
|
|
from tqdm import tqdm |
|
|
import torch |
|
|
|
|
|
# One-time fetch of the NLTK data clean_text() depends on: the English
# stopword list and the WordNet data used by the lemmatizer.
# nltk.download() is a no-op (prints a notice) when already cached.
nltk.download('stopwords')

nltk.download('wordnet')
|
|
|
|
|
def clean_text(text):
    """Normalize *text*: lowercase, strip non-alphanumerics, drop English
    stopwords, and lemmatize each remaining word.

    Returns the cleaned text as a single space-joined string.
    """
    text = text.lower()
    # Keep only letters, digits, and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Hoisted out of the per-word loop: the original called
    # stopwords.words('english') (which builds a fresh list) and constructed
    # a new WordNetLemmatizer for EVERY word, making the pass quadratic on
    # large inputs. A set gives O(1) membership tests; one lemmatizer is
    # reused for the whole text.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
|
|
|
|
|
|
|
|
# Raw input text, expected in the current working directory.
input_file = 'wstg-v4.2_cleaned.txt'

# Cleaned output, written under data/. Forward slashes work on every
# platform; the original 'data\wstg...' relied on '\w' being an invalid
# escape that Python keeps literally (a SyntaxWarning on 3.12+) and was
# not a valid subdirectory path on POSIX systems.
output_file = 'data/wstg-v4.2_cleaned.txt'
|
|
|
|
|
|
|
|
# Sniff the file's character encoding from its raw bytes.
with open(input_file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read())

# chardet returns {'encoding': None, ...} when it cannot decide (e.g. an
# empty file). The original passed that None straight to open(), silently
# falling back to the locale-dependent default codec; prefer an explicit
# UTF-8 fallback so behavior is the same on every machine.
encoding = result['encoding'] or 'utf-8'

# NOTE(review): the file is read twice (once for detection, once decoded);
# acceptable for a one-shot script, but could decode the bytes in memory
# if input sizes grow.
with open(input_file, 'r', encoding=encoding) as file:
    book_text = file.read()
|
|
|
|
|
|
|
|
# Prefer a CUDA device when one is present; otherwise fall back to CPU.
# NOTE(review): `device` is only printed here — the cleaning pipeline in
# this script runs entirely on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')
|
|
|
|
|
|
|
|
# Run the cleaning pass under a tqdm progress bar sized in characters.
# NOTE(review): the bar is cosmetic — clean_text() processes the entire
# text in a single call, so the bar sits at 0% for the whole run and
# jumps straight to 100% on the update below. Real incremental progress
# would require chunking book_text and updating per chunk.
with tqdm(total=len(book_text), desc='Text Cleaning Progress', unit='char') as pbar:

    cleaned_text = clean_text(book_text)

    pbar.update(len(book_text))
|
|
|
|
|
|
|
|
# Create the target directory if needed: open(..., 'w') raises
# FileNotFoundError when the 'data' directory does not already exist.
# (`or '.'` guards the case where output_file has no directory part.)
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

# Always write the cleaned text as UTF-8, regardless of input encoding.
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("Text cleaning and preprocessing complete.")
|
|
|