"""
Clean raw text: remove Gutenberg headers/footers, normalize whitespace, keep chapter markers.
"""
import re
# Separators bounding the actual book text.  Project Gutenberg has used
# several formats over the years:
#   *** START OF THE PROJECT GUTENBERG EBOOK 1342 ***            (numbered)
#   *** START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE *** (titled)
#   *** START OF THIS PROJECT GUTENBERG EBOOK ... ***            (older)
# TH(?:E|IS) covers both article variants and [^\n]*? accepts any number or
# title while keeping the match confined to a single line.  Compiled once at
# module level so repeated clean_text() calls don't recompile.
_START_MARKER = re.compile(
    r'\*\*\* START OF TH(?:E|IS) PROJECT GUTENBERG EBOOK[^\n]*?\*\*\*',
    re.IGNORECASE,
)
_END_MARKER = re.compile(
    r'\*\*\* END OF TH(?:E|IS) PROJECT GUTENBERG EBOOK[^\n]*?\*\*\*',
    re.IGNORECASE,
)


def clean_text(raw_path: str) -> str:
    """Load a raw Project Gutenberg text file and return a cleaned string.

    Pipeline:
      1. Strip the Gutenberg license header/footer using the
         ``*** START/END OF ... ***`` separators (numbered, titled, and
         older "THIS PROJECT GUTENBERG" variants are all recognized).
      2. Drop residual front matter (short title line, "by Author" line,
         a simple table of contents) while preserving CHAPTER markers.
      3. Normalize whitespace, keeping blank lines between paragraphs.

    Args:
        raw_path: Path to the raw UTF-8 text file.

    Returns:
        The cleaned book text as a single string (non-empty for any input
        that contains real content between the markers).
    """
    with open(raw_path, 'r', encoding='utf-8') as file:
        text = file.read()
    text = _strip_gutenberg_boilerplate(text)
    text = _strip_front_matter(text)
    return _normalize_whitespace(text)


def _strip_gutenberg_boilerplate(text: str) -> str:
    """Return the text between the Gutenberg START/END markers (best effort).

    If a marker is absent the corresponding side is left untouched, so a
    file without Gutenberg boilerplate passes through unchanged.
    """
    start_match = _START_MARKER.search(text)
    if start_match:
        text = text[start_match.end():]
    end_match = _END_MARKER.search(text)
    if end_match:
        text = text[:end_match.start()]
    return text


def _strip_front_matter(text: str) -> str:
    """Drop leading title/author lines and a simple table of contents.

    Heuristic line scan: everything is kept once "substantial" content
    (a line longer than 20 chars, or any CHAPTER marker) has been seen.
    """
    cleaned_lines = []
    in_toc = False          # currently inside a "Contents" listing
    content_started = False  # first real content line has been seen
    for line in text.split('\n'):
        stripped = line.strip()
        # Drop blank lines before any content.
        if not content_started and not stripped:
            continue
        # A bare "Contents" heading opens the table-of-contents section.
        if not content_started and stripped.lower() == 'contents':
            in_toc = True
            continue
        if in_toc:
            # Skip bare chapter list entries ("CHAPTER IV." etc.).
            if re.match(r'^CHAPTER [IVX]+\.?$', stripped, re.IGNORECASE):
                continue
            # A long line means the TOC is over and real content begins.
            if stripped and len(stripped) > 20:
                in_toc = False
                content_started = True
            else:
                continue
        if not content_started and stripped:
            # Skip a short Title-Case line (likely the book title).
            if len(stripped) < 50 and re.match(
                    r'^[A-Z][a-z]+( [A-Z][a-z]+){1,3}$', stripped):
                continue
            # Skip a "by Author Name" byline.
            if re.match(r'^by [A-Z][a-z]+ [A-Z]', stripped, re.IGNORECASE):
                continue
        # Substantial text or a chapter heading starts the kept content.
        if len(stripped) > 20 or 'CHAPTER' in stripped.upper():
            content_started = True
        cleaned_lines.append(line)
    return '\n'.join(cleaned_lines)


def _normalize_whitespace(text: str) -> str:
    """Collapse space/tab runs and excess blank lines; strip both ends.

    Single newlines and paragraph breaks (double newlines) are preserved.
    """
    text = re.sub(r'[ \t]+', ' ', text)          # runs of spaces/tabs -> one space
    text = re.sub(r'[ \t]*\n[ \t]*', '\n', text)  # trim around line breaks
    text = re.sub(r'\n{3,}', '\n\n', text)        # 3+ newlines -> paragraph break
    return text.strip()