File size: 4,686 Bytes
4e5fc16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""Text processing utilities for Francis Botcon project."""

import re
from pathlib import Path
from typing import List, Tuple
from src.logger import get_logger

logger = get_logger(__name__)


class TextCleaner:
    """Clean and preprocess texts from Project Gutenberg."""

    # Project Gutenberg start/end marker lines, e.g.
    # "*** START OF THE PROJECT GUTENBERG EBOOK ... ***"
    PG_HEADER_PATTERN = r"\*\*\*.*?START.*?PROJECT GUTENBERG.*?\*\*\*"
    PG_FOOTER_PATTERN = r"\*\*\*.*?END.*?PROJECT GUTENBERG.*?\*\*\*"

    @staticmethod
    def remove_pg_metadata(text: str) -> str:
        """Remove Project Gutenberg boilerplate surrounding the actual text.

        Everything up to and including the "*** START ... ***" marker and
        everything from the "*** END ... ***" marker onward is dropped.
        (Substituting only the marker lines away — the previous behavior —
        left the PG preamble and the trailing license text in the output.)

        Args:
            text: Raw text from Project Gutenberg.

        Returns:
            The text between the START and END markers. If a marker is
            missing, that side of the text is left untouched.
        """
        flags = re.DOTALL | re.IGNORECASE

        # Drop the preamble: everything through the end of the START marker.
        header = re.search(TextCleaner.PG_HEADER_PATTERN, text, flags=flags)
        if header:
            text = text[header.end():]

        # Drop the license: everything from the END marker onward.
        footer = re.search(TextCleaner.PG_FOOTER_PATTERN, text, flags=flags)
        if footer:
            text = text[:footer.start()]

        return text

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Normalize whitespace in text.

        Collapses runs of spaces to one space, runs of blank lines to a
        single blank line, and strips leading/trailing whitespace.

        Args:
            text: Input text.

        Returns:
            Text with normalized whitespace.
        """
        # Collapse runs of spaces (tabs/newlines intentionally untouched).
        text = re.sub(r' +', ' ', text)
        # Collapse 2+ consecutive newlines down to a single blank line.
        text = re.sub(r'\n\n+', '\n\n', text)
        return text.strip()

    @staticmethod
    def clean_text(text: str) -> str:
        """Apply the full cleaning pipeline.

        Args:
            text: Raw text.

        Returns:
            Text with PG boilerplate removed and whitespace normalized.
        """
        text = TextCleaner.remove_pg_metadata(text)
        text = TextCleaner.normalize_whitespace(text)
        return text


class TextSegmenter:
    """Segment text into meaningful chunks."""

    @staticmethod
    def segment_by_paragraphs(text: str, min_length: int = 100) -> List[str]:
        """Segment text into paragraphs split on blank lines.

        Args:
            text: Input text.
            min_length: Minimum paragraph length in characters; shorter
                paragraphs (after stripping) are dropped.

        Returns:
            List of stripped paragraph segments.
        """
        paragraphs = text.split('\n\n')
        return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]

    @staticmethod
    def segment_by_length(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
        """Segment text into roughly fixed-size chunks with word overlap.

        Args:
            text: Input text.
            chunk_size: Target size of each chunk in characters.
            overlap: Approximate overlap between consecutive chunks in
                characters; converted to ``overlap // 5`` words, assuming
                an average word length of ~5 characters.

        Returns:
            List of text chunks.
        """
        chunks: List[str] = []
        overlap_words = overlap // 5  # rough chars -> words conversion

        current_chunk: List[str] = []
        current_size = 0
        fresh_words = 0  # words added since the last emitted chunk

        for word in text.split():
            current_chunk.append(word)
            current_size += len(word) + 1  # +1 for the joining space
            fresh_words += 1

            if current_size >= chunk_size:
                chunks.append(' '.join(current_chunk))
                # Carry the tail of this chunk over as the overlap.
                # A non-positive word count must yield an EMPTY carry-over:
                # list[-0:] copies the whole list and would duplicate text.
                current_chunk = current_chunk[-overlap_words:] if overlap_words > 0 else []
                # Recount with the same +1-per-word rule used above.
                current_size = sum(len(w) + 1 for w in current_chunk)
                fresh_words = 0

        # Flush the remainder only if it holds words not already emitted;
        # otherwise we would append the overlap tail as a bogus final chunk.
        if fresh_words > 0:
            chunks.append(' '.join(current_chunk))

        return chunks

    @staticmethod
    def extract_title_and_author(text: str) -> Tuple[str, str]:
        """Extract title and author from the opening lines of a text.

        Heuristic: the first of the opening 50 lines containing the word
        "by" and the name "bacon" is taken as the author line, and the
        line directly above it as the title.

        Args:
            text: Input text.

        Returns:
            Tuple of (title, author); defaults to ("Unknown",
            "Francis Bacon") when no author line is found.
        """
        lines = text.split('\n')
        title = "Unknown"
        author = "Francis Bacon"

        for i, line in enumerate(lines[:50]):  # only scan the front matter
            # \b guards against substring hits such as "Nearby".
            if re.search(r'\bby\b', line, re.IGNORECASE) and 'bacon' in line.lower():
                author = line.strip()
                if i > 0:
                    title = lines[i - 1].strip()
                break

        return title, author


def process_raw_file(file_path: Path) -> Tuple[str, str]:
    """Process a raw Project Gutenberg file.

    Reads the file as UTF-8 (silently dropping undecodable bytes), runs
    the full cleaning pipeline, and pairs the result with the file's stem
    for use as an identifier.

    Args:
        file_path: Path to raw text file.

    Returns:
        Tuple of (cleaned_text, filename-without-extension).
    """
    raw_text = file_path.read_text(encoding='utf-8', errors='ignore')
    return TextCleaner.clean_text(raw_text), file_path.stem