""" Clean raw text: remove Gutenberg headers/footers, normalize whitespace, keep chapter markers. """ import re def clean_text(raw_path: str) -> str: """ Load raw text and return a cleaned string. # TODO hints: # - Strip front/back matter by searching for known separators. # - Normalize whitespace with regex; keep blank lines between paragraphs. # - Preserve CHAPTER markers if present. # Acceptance: # - Returns a non-empty cleaned string. """ with open(raw_path, 'r', encoding='utf-8') as file: text = file.read() # Find and extract content between Gutenberg markers # Pattern matches: *** START OF THE PROJECT GUTENBERG EBOOK [number] *** start_pattern = r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK \d+ \*\*\*' end_pattern = r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK \d+ \*\*\*' # Find start marker start_match = re.search(start_pattern, text, re.IGNORECASE) if start_match: # Extract text after the start marker text = text[start_match.end():] else: # Fallback: try alternative patterns alt_start = re.search(r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK \*\*\*', text, re.IGNORECASE) if alt_start: text = text[alt_start.end():] # Find end marker end_match = re.search(end_pattern, text, re.IGNORECASE) if end_match: # Extract text before the end marker text = text[:end_match.start()] else: # Fallback: try alternative patterns alt_end = re.search(r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK \*\*\*', text, re.IGNORECASE) if alt_end: text = text[:alt_end.start()] # Remove common Gutenberg metadata at the beginning (title, author, table of contents) # But keep the actual book content including chapter markers lines = text.split('\n') cleaned_lines = [] in_toc = False # Track if we're in table of contents content_started = False for line in lines: line_stripped = line.strip() # Skip empty lines at the very beginning if not content_started and not line_stripped: continue # Detect table of contents section if not content_started and line_stripped.lower() == 'contents': in_toc = True continue # Skip table of contents entries (simple chapter lists) if in_toc: if re.match(r'^CHAPTER [IVX]+\.?$', line_stripped, re.IGNORECASE): continue # End of TOC when we hit actual content if line_stripped and len(line_stripped) > 20: in_toc = False content_started = True else: continue # Skip simple title/author lines (very short, title-case only) if not content_started and line_stripped: # Skip if it's a simple title (2-4 words, all title case, no punctuation) if re.match(r'^[A-Z][a-z]+( [A-Z][a-z]+){1,3}$', line_stripped) and len(line_stripped) < 50: continue # Skip "by Author Name" lines if re.match(r'^by [A-Z][a-z]+ [A-Z]', line_stripped, re.IGNORECASE): continue # Once we hit substantial content, start keeping everything if len(line_stripped) > 20 or 'CHAPTER' in line_stripped.upper(): content_started = True cleaned_lines.append(line) text = '\n'.join(cleaned_lines) # Normalize whitespace: collapse multiple spaces/tabs but preserve newlines (paragraph breaks) text = re.sub(r'[ \t]+', ' ', text) # Collapse spaces/tabs to single space text = re.sub(r'[ \t]*\n[ \t]*', '\n', text) # Normalize line breaks text = re.sub(r'\n{3,}', '\n\n', text) # Collapse 3+ newlines to double newline text = text.strip() return text