"""
Clean raw text: remove Gutenberg headers/footers, normalize whitespace, keep chapter markers.
"""
import re
# Separators bounding the actual book text.  Project Gutenberg has used
# several formats over the years:
#   *** START OF THE PROJECT GUTENBERG EBOOK 1342 ***            (numbered)
#   *** START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE *** (titled)
#   *** START OF THIS PROJECT GUTENBERG EBOOK ... ***            (older)
# TH(?:E|IS) covers both article variants and [^\n]*? accepts any number or
# title while keeping the match confined to a single line.  Compiled once at
# module level so repeated clean_text() calls don't recompile.
_START_MARKER = re.compile(
    r'\*\*\* START OF TH(?:E|IS) PROJECT GUTENBERG EBOOK[^\n]*?\*\*\*',
    re.IGNORECASE,
)
_END_MARKER = re.compile(
    r'\*\*\* END OF TH(?:E|IS) PROJECT GUTENBERG EBOOK[^\n]*?\*\*\*',
    re.IGNORECASE,
)


def clean_text(raw_path: str) -> str:
    """Load a raw Project Gutenberg text file and return a cleaned string.

    Pipeline:
      1. Strip the Gutenberg license header/footer using the
         ``*** START/END OF ... ***`` separators (numbered, titled, and
         older "THIS PROJECT GUTENBERG" variants are all recognized).
      2. Drop residual front matter (short title line, "by Author" line,
         a simple table of contents) while preserving CHAPTER markers.
      3. Normalize whitespace, keeping blank lines between paragraphs.

    Args:
        raw_path: Path to the raw UTF-8 text file.

    Returns:
        The cleaned book text as a single string (non-empty for any input
        that contains real content between the markers).
    """
    with open(raw_path, 'r', encoding='utf-8') as file:
        text = file.read()
    text = _strip_gutenberg_boilerplate(text)
    text = _strip_front_matter(text)
    return _normalize_whitespace(text)


def _strip_gutenberg_boilerplate(text: str) -> str:
    """Return the text between the Gutenberg START/END markers (best effort).

    If a marker is absent the corresponding side is left untouched, so a
    file without Gutenberg boilerplate passes through unchanged.
    """
    start_match = _START_MARKER.search(text)
    if start_match:
        text = text[start_match.end():]
    end_match = _END_MARKER.search(text)
    if end_match:
        text = text[:end_match.start()]
    return text


def _strip_front_matter(text: str) -> str:
    """Drop leading title/author lines and a simple table of contents.

    Heuristic line scan: everything is kept once "substantial" content
    (a line longer than 20 chars, or any CHAPTER marker) has been seen.
    """
    cleaned_lines = []
    in_toc = False          # currently inside a "Contents" listing
    content_started = False  # first real content line has been seen
    for line in text.split('\n'):
        stripped = line.strip()
        # Drop blank lines before any content.
        if not content_started and not stripped:
            continue
        # A bare "Contents" heading opens the table-of-contents section.
        if not content_started and stripped.lower() == 'contents':
            in_toc = True
            continue
        if in_toc:
            # Skip bare chapter list entries ("CHAPTER IV." etc.).
            if re.match(r'^CHAPTER [IVX]+\.?$', stripped, re.IGNORECASE):
                continue
            # A long line means the TOC is over and real content begins.
            if stripped and len(stripped) > 20:
                in_toc = False
                content_started = True
            else:
                continue
        if not content_started and stripped:
            # Skip a short Title-Case line (likely the book title).
            if len(stripped) < 50 and re.match(
                    r'^[A-Z][a-z]+( [A-Z][a-z]+){1,3}$', stripped):
                continue
            # Skip a "by Author Name" byline.
            if re.match(r'^by [A-Z][a-z]+ [A-Z]', stripped, re.IGNORECASE):
                continue
        # Substantial text or a chapter heading starts the kept content.
        if len(stripped) > 20 or 'CHAPTER' in stripped.upper():
            content_started = True
        cleaned_lines.append(line)
    return '\n'.join(cleaned_lines)


def _normalize_whitespace(text: str) -> str:
    """Collapse space/tab runs and excess blank lines; strip both ends.

    Single newlines and paragraph breaks (double newlines) are preserved.
    """
    text = re.sub(r'[ \t]+', ' ', text)          # runs of spaces/tabs -> one space
    text = re.sub(r'[ \t]*\n[ \t]*', '\n', text)  # trim around line breaks
    text = re.sub(r'\n{3,}', '\n\n', text)        # 3+ newlines -> paragraph break
    return text.strip()