File size: 1,394 Bytes
9d0cab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import re

import chardet
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Fetch the NLTK corpora required by clean_text().  'omw-1.4' is also
# required by WordNetLemmatizer on NLTK >= 3.6 — 'wordnet' alone raises a
# LookupError at lemmatization time on those versions.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Created once at module level: the original re-instantiated the lemmatizer
# and rebuilt the English stopword *list* for every word, which dominates
# runtime on large texts.  A set gives O(1) membership tests.
_LEMMATIZER = WordNetLemmatizer()
_STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Lowercase, strip non-alphanumerics, drop English stopwords, lemmatize.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces.
    """
    text = text.lower()
    # Keep only letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = [
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOPWORDS
    ]
    return ' '.join(tokens)

# Specify the input and output file paths.
# NOTE: the original used the literal 'data\wstg-v4.2_cleaned.txt'.  A
# backslash is not a path separator on POSIX, so the output landed in the
# current directory under a filename containing a literal backslash.
# os.path.join yields the intended data/ subdirectory on every platform.
input_file = 'wstg-v4.2_cleaned.txt'
output_file = os.path.join('data', 'wstg-v4.2_cleaned.txt')

# Sniff the file's encoding from its raw bytes, since the source text is
# not guaranteed to be UTF-8.
with open(input_file, 'rb') as raw_handle:
    result = chardet.detect(raw_handle.read())

# Re-open as text with the detected encoding and load it all into memory
# (text mode also gives us universal newline translation).
with open(input_file, 'r', encoding=result['encoding']) as text_handle:
    book_text = text_handle.read()

# Pick the compute device: prefer CUDA when available, else fall back to CPU.
# NOTE(review): `device` is never referenced later in this script — the
# cleaning is pure Python — so this only affects the printed message.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Initialize tqdm for progress bar.
# NOTE(review): this bar is cosmetic — clean_text() runs to completion
# before pbar.update() fires, so the bar jumps from 0% to 100% in a single
# step rather than tracking per-character progress.  Streaming real
# progress would require chunking the text through clean_text().
with tqdm(total=len(book_text), desc='Text Cleaning Progress', unit='char') as pbar:
    # Clean the text
    cleaned_text = clean_text(book_text)

    # Update tqdm
    pbar.update(len(book_text))

# Write the cleaned text to an output file.
# Create the destination directory first: open(..., 'w') raises
# FileNotFoundError when any parent directory is missing.
out_dir = os.path.dirname(output_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("Text cleaning and preprocessing complete.")