Spaces:
Paused
Paused
| import re | |
def clean_text(text):
    """Normalize whitespace and punctuation, then capitalize sentence starts.

    Steps, in order:
      1. Collapse every run of whitespace (including newlines) to one space.
      2. Drop every character that is not a word character, whitespace, or
         one of ``. , ; ? ! -``.
      3. Remove a whitespace character that directly precedes ``? . ! , ;``.
      4. Strip leading/trailing whitespace.
      5. Upper-case the first character of the text and the first word
         character after a sentence terminator (``.``, ``?`` or ``!``)
         followed by a space. All other characters keep their original case.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; an empty string if nothing remains.
    """
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    # Remove any unwanted special characters
    text = re.sub(r'[^\w\s.,;?!-]', '', text)
    # Remove redundant spaces before punctuation
    text = re.sub(r'\s([?.!,;])', r'\1', text)
    # Strip before capitalizing so the first real character is the one that
    # gets upper-cased, not a leading space.
    text = text.strip()
    # Capitalize the first letter of each sentence. The two negative
    # lookbehinds skip terminators that look like abbreviations ("e.g.",
    # "Dr."). Only the matched character is upper-cased; str.capitalize()
    # is deliberately avoided because it lower-cases the rest of the text.
    text = re.sub(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!]) (\w)',
        lambda match: ' ' + match.group(1).upper(),
        text,
    )
    return text[:1].upper() + text[1:]
def process_text_file(input_file, output_file):
    """Read a UTF-8 text file, clean its contents, and write the result.

    Args:
        input_file: Path of the file to read (opened as UTF-8 text).
        output_file: Path of the file to write (opened as UTF-8 text,
            replacing any existing contents).
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # Clean before opening the destination so the output file is only
    # touched once the cleaned text is ready.
    result = clean_text(raw_text)

    with open(output_file, 'w', encoding='utf-8') as destination:
        destination.write(result)
# Usage
# Default paths used when this file is executed as a script.
input_file = '/Users/push/pro/rag/data/books/test/- Hunter_s Tropical Medicine and Emerging Infectious Disease-Saunders (2012)(Z-Lib.io).txt'
output_file = './output.txt'

# Run the conversion only when executed directly, so importing this module
# (e.g. to reuse clean_text) does not read or write any files.
if __name__ == "__main__":
    process_text_file(input_file, output_file)
    print(f"Cleaned text saved as {output_file}")