# CyberSecurity_LLM / Text_Cleaner_UTF8.py
# (Hugging Face upload-page residue converted to a comment so the file parses:
#  uploader "at0m-b0mb", commit 9d0cab4, "Upload Text_Cleaner_UTF8.py")
import os
import re

import chardet
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
# Fetch the NLTK corpora clean_text() depends on.  quiet=True suppresses the
# per-run download banner; already-cached corpora are detected and skipped.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
def clean_text(text):
    """Normalize raw text for NLP training.

    Lowercases, strips every character outside [a-z0-9 and whitespace],
    drops English stopwords, and lemmatizes the remaining tokens.

    Args:
        text: The raw input string.

    Returns:
        A single space-joined string of cleaned, lemmatized tokens.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Hoisted out of the per-word loop: the original re-instantiated
    # WordNetLemmatizer and re-fetched the stopword *list* for every token,
    # making membership tests O(len(stopwords)) each.  A set built once
    # gives O(1) lookups and a single lemmatizer instance.
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
# --- Script: detect encoding, clean the text, write the UTF-8 result. ---

# Input and output file paths.  os.path.join is used instead of the original
# 'data\wstg-v4.2_cleaned.txt': '\w' is an invalid escape sequence
# (SyntaxWarning on Python 3.12+) and on POSIX the backslash is a literal
# character, creating a file NAMED 'data\wstg...' rather than one inside data/.
input_file = 'wstg-v4.2_cleaned.txt'
output_file = os.path.join('data', 'wstg-v4.2_cleaned.txt')

# Detect the input file's encoding from its raw bytes.
with open(input_file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read())

# Read the text using the detected encoding.
with open(input_file, 'r', encoding=result['encoding']) as file:
    book_text = file.read()

# NOTE(review): the device is selected and printed but never used below —
# clean_text() is pure CPU/NLTK work.  Confirm whether torch is needed at all.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Progress bar around the cleaning call.  clean_text() runs in one shot, so
# the bar jumps 0 -> 100%; it is kept for parity with the original output.
with tqdm(total=len(book_text), desc='Text Cleaning Progress', unit='char') as pbar:
    cleaned_text = clean_text(book_text)
    pbar.update(len(book_text))

# Ensure the output directory exists (the original crashed with
# FileNotFoundError when data/ was missing), then write as UTF-8.
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("Text cleaning and preprocessing complete.")