# CyberSecurity_LLM / Text_Cleaner_UTF8.py
# (Hugging Face upload-page residue converted to a comment so the file parses:
#  uploader "at0m-b0mb", commit 9d0cab4, "Upload Text_Cleaner_UTF8.py")
import os
import re

import chardet
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
# Fetch the NLTK corpora clean_text() depends on.  quiet=True suppresses the
# per-run download banner; already-cached corpora are detected and skipped.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
def clean_text(text):
    """Normalize raw text for NLP training.

    Lowercases, strips every character outside [a-z0-9 and whitespace],
    drops English stopwords, and lemmatizes the remaining tokens.

    Args:
        text: The raw input string.

    Returns:
        A single space-joined string of cleaned, lemmatized tokens.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Hoisted out of the per-word loop: the original re-instantiated
    # WordNetLemmatizer and re-fetched the stopword *list* for every token,
    # making membership tests O(len(stopwords)) each.  A set built once
    # gives O(1) lookups and a single lemmatizer instance.
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
# --- Script: detect encoding, clean the text, write the UTF-8 result. ---

# Input and output file paths.  os.path.join is used instead of the original
# 'data\wstg-v4.2_cleaned.txt': '\w' is an invalid escape sequence
# (SyntaxWarning on Python 3.12+) and on POSIX the backslash is a literal
# character, creating a file NAMED 'data\wstg...' rather than one inside data/.
input_file = 'wstg-v4.2_cleaned.txt'
output_file = os.path.join('data', 'wstg-v4.2_cleaned.txt')

# Detect the input file's encoding from its raw bytes.
with open(input_file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read())

# Read the text using the detected encoding.
with open(input_file, 'r', encoding=result['encoding']) as file:
    book_text = file.read()

# NOTE(review): the device is selected and printed but never used below —
# clean_text() is pure CPU/NLTK work.  Confirm whether torch is needed at all.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Progress bar around the cleaning call.  clean_text() runs in one shot, so
# the bar jumps 0 -> 100%; it is kept for parity with the original output.
with tqdm(total=len(book_text), desc='Text Cleaning Progress', unit='char') as pbar:
    cleaned_text = clean_text(book_text)
    pbar.update(len(book_text))

# Ensure the output directory exists (the original crashed with
# FileNotFoundError when data/ was missing), then write as UTF-8.
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("Text cleaning and preprocessing complete.")