File size: 4,686 Bytes
4e5fc16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""Text processing utilities for Francis Botcon project."""

import re
from pathlib import Path
from typing import List, Tuple
from src.logger import get_logger

logger = get_logger(__name__)


class TextCleaner:
    """Clean and preprocess texts from Project Gutenberg."""

    # Project Gutenberg start/end marker lines, e.g.
    # "*** START OF THE PROJECT GUTENBERG EBOOK ... ***"
    PG_HEADER_PATTERN = r"\*\*\*.*?START.*?PROJECT GUTENBERG.*?\*\*\*"
    PG_FOOTER_PATTERN = r"\*\*\*.*?END.*?PROJECT GUTENBERG.*?\*\*\*"

    @staticmethod
    def remove_pg_metadata(text: str) -> str:
        """Remove Project Gutenberg boilerplate surrounding the actual text.

        Everything up to and including the "*** START ... ***" marker and
        everything from the "*** END ... ***" marker onward is dropped.
        (Substituting only the marker lines away — the previous behavior —
        left the PG preamble and the trailing license text in the output.)

        Args:
            text: Raw text from Project Gutenberg.

        Returns:
            The text between the START and END markers. If a marker is
            missing, that side of the text is left untouched.
        """
        flags = re.DOTALL | re.IGNORECASE

        # Drop the preamble: everything through the end of the START marker.
        header = re.search(TextCleaner.PG_HEADER_PATTERN, text, flags=flags)
        if header:
            text = text[header.end():]

        # Drop the license: everything from the END marker onward.
        footer = re.search(TextCleaner.PG_FOOTER_PATTERN, text, flags=flags)
        if footer:
            text = text[:footer.start()]

        return text

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Normalize whitespace in text.

        Collapses runs of spaces to one space, runs of blank lines to a
        single blank line, and strips leading/trailing whitespace.

        Args:
            text: Input text.

        Returns:
            Text with normalized whitespace.
        """
        # Collapse runs of spaces (tabs/newlines intentionally untouched).
        text = re.sub(r' +', ' ', text)
        # Collapse 2+ consecutive newlines down to a single blank line.
        text = re.sub(r'\n\n+', '\n\n', text)
        return text.strip()

    @staticmethod
    def clean_text(text: str) -> str:
        """Apply the full cleaning pipeline.

        Args:
            text: Raw text.

        Returns:
            Text with PG boilerplate removed and whitespace normalized.
        """
        text = TextCleaner.remove_pg_metadata(text)
        text = TextCleaner.normalize_whitespace(text)
        return text


class TextSegmenter:
    """Segment text into meaningful chunks."""

    @staticmethod
    def segment_by_paragraphs(text: str, min_length: int = 100) -> List[str]:
        """Segment text into paragraphs split on blank lines.

        Args:
            text: Input text.
            min_length: Minimum paragraph length in characters; shorter
                paragraphs (after stripping) are dropped.

        Returns:
            List of stripped paragraph segments.
        """
        paragraphs = text.split('\n\n')
        return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]

    @staticmethod
    def segment_by_length(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
        """Segment text into roughly fixed-size chunks with word overlap.

        Args:
            text: Input text.
            chunk_size: Target size of each chunk in characters.
            overlap: Approximate overlap between consecutive chunks in
                characters; converted to ``overlap // 5`` words, assuming
                an average word length of ~5 characters.

        Returns:
            List of text chunks.
        """
        chunks: List[str] = []
        overlap_words = overlap // 5  # rough chars -> words conversion

        current_chunk: List[str] = []
        current_size = 0
        fresh_words = 0  # words added since the last emitted chunk

        for word in text.split():
            current_chunk.append(word)
            current_size += len(word) + 1  # +1 for the joining space
            fresh_words += 1

            if current_size >= chunk_size:
                chunks.append(' '.join(current_chunk))
                # Carry the tail of this chunk over as the overlap.
                # A non-positive word count must yield an EMPTY carry-over:
                # list[-0:] copies the whole list and would duplicate text.
                current_chunk = current_chunk[-overlap_words:] if overlap_words > 0 else []
                # Recount with the same +1-per-word rule used above.
                current_size = sum(len(w) + 1 for w in current_chunk)
                fresh_words = 0

        # Flush the remainder only if it holds words not already emitted;
        # otherwise we would append the overlap tail as a bogus final chunk.
        if fresh_words > 0:
            chunks.append(' '.join(current_chunk))

        return chunks

    @staticmethod
    def extract_title_and_author(text: str) -> Tuple[str, str]:
        """Extract title and author from the opening lines of a text.

        Heuristic: the first of the opening 50 lines containing the word
        "by" and the name "bacon" is taken as the author line, and the
        line directly above it as the title.

        Args:
            text: Input text.

        Returns:
            Tuple of (title, author); defaults to ("Unknown",
            "Francis Bacon") when no author line is found.
        """
        lines = text.split('\n')
        title = "Unknown"
        author = "Francis Bacon"

        for i, line in enumerate(lines[:50]):  # only scan the front matter
            # \b guards against substring hits such as "Nearby".
            if re.search(r'\bby\b', line, re.IGNORECASE) and 'bacon' in line.lower():
                author = line.strip()
                if i > 0:
                    title = lines[i - 1].strip()
                break

        return title, author


def process_raw_file(file_path: Path) -> Tuple[str, str]:
    """Process a raw Project Gutenberg file.

    Reads the file as UTF-8 (silently dropping undecodable bytes), runs
    the full cleaning pipeline, and pairs the result with the file's stem
    for use as an identifier.

    Args:
        file_path: Path to raw text file.

    Returns:
        Tuple of (cleaned_text, filename-without-extension).
    """
    raw_text = file_path.read_text(encoding='utf-8', errors='ignore')
    return TextCleaner.clean_text(raw_text), file_path.stem