rag / test.py
poemsforaphrodite's picture
Upload folder using huggingface_hub
8e0205b verified
import re
def clean_text(text):
    """Normalize whitespace, punctuation spacing and sentence capitalization.

    Steps, in order:

    1. Drop every character that is not a word character, whitespace, or
       one of ``. , ; ? ! -``.
    2. Collapse each run of whitespace (including newlines) to one space.
    3. Remove a whitespace character that directly precedes ``? . ! , ;``.
    4. Strip leading/trailing whitespace.
    5. Upper-case the first letter of the text and of every sentence,
       leaving all other characters' case untouched.

    A sentence start is a letter that follows ``.``, ``?`` or ``!`` plus one
    space. Two heuristics inherited from the original pattern skip likely
    abbreviations: text shaped like ``e.g.`` (letter, dot, letter, any
    character) and a capital + lowercase letter + dot such as ``Dr.``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; an empty string if nothing remains.
    """
    # Remove unwanted characters *before* collapsing whitespace; otherwise
    # deleting a symbol that sat between two spaces leaves a double space.
    text = re.sub(r'[^\w\s.,;?!-]', '', text)
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    # Remove redundant spaces in front of punctuation
    text = re.sub(r'\s([?.!,;])', r'\1', text)
    # Strip first so that '^' below really is the first visible character.
    text = text.strip()
    # Capitalize the first letter of each sentence. Only the matched letter
    # is upper-cased; str.capitalize() would also lower-case the rest of the
    # text and destroy acronyms and proper nouns.
    text = re.sub(
        r'(?:^|(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s)[^\W\d_]',
        lambda match: match.group(0).upper(),
        text,
    )
    return text
def process_text_file(input_file, output_file):
    """Clean the text of one file and store the result in another.

    The input is read in full as UTF-8, passed through ``clean_text``, and
    the returned string is written as UTF-8 to ``output_file``, replacing
    any existing content.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write the cleaned text to.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # Clean before opening the destination, so a failure during cleaning
    # does not leave a truncated output file behind.
    result = clean_text(raw_text)

    with open(output_file, 'w', encoding='utf-8') as destination:
        destination.write(result)
# Usage
# The paths stay at module level so existing references to `input_file` and
# `output_file` keep working.
input_file = '/Users/push/pro/rag/data/books/test/- Hunter_s Tropical Medicine and Emerging Infectious Disease-Saunders (2012)(Z-Lib.io).txt'
output_file = './output.txt'

if __name__ == "__main__":
    # Run the conversion only when executed as a script, so importing this
    # module (e.g. to reuse clean_text) does not read or write any files.
    process_text_file(input_file, output_file)
    print(f"Cleaned text saved as {output_file}")