Spaces:
Sleeping
Sleeping
| """Text processing utilities for Francis Botcon project.""" | |
| import re | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| from src.logger import get_logger | |
| logger = get_logger(__name__) | |
class TextCleaner:
    """Clean and preprocess texts from Project Gutenberg."""

    # Project Gutenberg boilerplate markers, e.g.
    # "*** START OF THE PROJECT GUTENBERG EBOOK ... ***".
    PG_HEADER_PATTERN = r"\*\*\*.*?START.*?PROJECT GUTENBERG.*?\*\*\*"
    PG_FOOTER_PATTERN = r"\*\*\*.*?END.*?PROJECT GUTENBERG.*?\*\*\*"

    # NOTE: these were plain functions in the class body (no `self`, no
    # decorator), which raises TypeError when called on an instance.
    # @staticmethod keeps `TextCleaner.method(text)` working and also
    # makes instance calls safe.
    @staticmethod
    def remove_pg_metadata(text: str) -> str:
        """Remove the Project Gutenberg header and footer.

        Args:
            text: Raw text from Project Gutenberg

        Returns:
            Text with the boilerplate header/footer removed
        """
        # DOTALL lets ``.*?`` span newlines — the markers are not
        # guaranteed to sit on a single line.
        text = re.sub(
            TextCleaner.PG_HEADER_PATTERN,
            "",
            text,
            flags=re.DOTALL | re.IGNORECASE,
        )
        text = re.sub(
            TextCleaner.PG_FOOTER_PATTERN,
            "",
            text,
            flags=re.DOTALL | re.IGNORECASE,
        )
        return text

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Normalize whitespace in text.

        Args:
            text: Input text

        Returns:
            Text with runs of spaces collapsed, blank-line runs reduced
            to a single blank line, and outer whitespace stripped
        """
        # Collapse runs of spaces (tabs/newlines untouched here).
        text = re.sub(r' +', ' ', text)
        # Collapse 2+ consecutive newlines to exactly one blank line,
        # preserving paragraph boundaries for later segmentation.
        text = re.sub(r'\n\n+', '\n\n', text)
        return text.strip()

    @staticmethod
    def clean_text(text: str) -> str:
        """Apply all cleaning operations in order.

        Args:
            text: Raw text

        Returns:
            Cleaned text (metadata removed, whitespace normalized)
        """
        text = TextCleaner.remove_pg_metadata(text)
        text = TextCleaner.normalize_whitespace(text)
        return text
class TextSegmenter:
    """Segment text into meaningful chunks."""

    @staticmethod
    def segment_by_paragraphs(text: str, min_length: int = 100) -> List[str]:
        """Segment text into paragraphs.

        Args:
            text: Input text
            min_length: Minimum paragraph length in characters

        Returns:
            List of stripped paragraphs at least ``min_length`` chars long
        """
        paragraphs = text.split('\n\n')
        # Drop very short paragraphs (headings, page numbers, noise).
        return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]

    @staticmethod
    def segment_by_length(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
        """Segment text into roughly fixed-size chunks with word overlap.

        Args:
            text: Input text
            chunk_size: Target size of each chunk in characters
            overlap: Approximate overlap between chunks in characters
                (converted to words assuming ~5 chars per word)

        Returns:
            List of text chunks
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0
        # Tracks whether current_chunk holds any word not yet emitted,
        # so we never emit a trailing chunk made purely of overlap.
        has_new_words = False
        # Approximate word count for the requested character overlap.
        overlap_words = overlap // 5

        for word in text.split():
            current_chunk.append(word)
            current_size += len(word) + 1  # +1 for the joining space
            has_new_words = True
            if current_size >= chunk_size:
                chunks.append(' '.join(current_chunk))
                # Seed the next chunk with the tail of this one.
                # Guard overlap_words == 0: list[-0:] is the WHOLE list,
                # which would make every chunk restart from scratch-plus-
                # everything instead of having no overlap.
                current_chunk = current_chunk[-overlap_words:] if overlap_words > 0 else []
                # Recount with the same +1-per-word space accounting
                # used in the loop, so thresholds stay consistent.
                current_size = sum(len(w) + 1 for w in current_chunk)
                has_new_words = False

        # Emit the remainder only if it contains unseen words.
        if current_chunk and has_new_words:
            chunks.append(' '.join(current_chunk))
        return chunks
def extract_title_and_author(text: str) -> Tuple[str, str]:
    """Extract title and author from the front matter of a text.

    Args:
        text: Input text

    Returns:
        Tuple of (title, author); defaults to ("Unknown", "Francis Bacon")
        when no byline is found in the first 50 lines.
    """
    lines = text.split('\n')
    title = "Unknown"
    author = "Francis Bacon"
    # Bylines appear near the top, so only scan the front matter.
    for i, line in enumerate(lines[:50]):
        lowered = line.lower()
        # \bby\b avoids false hits on words that merely contain "by"
        # (e.g. "lullaby"), which the old substring test matched.
        if 'bacon' in lowered and re.search(r'\bby\b', lowered):
            author = line.strip()
            # The line preceding the byline is conventionally the title.
            if i > 0:
                title = lines[i - 1].strip()
            break
    return title, author
def process_raw_file(file_path: Path) -> Tuple[str, str]:
    """Process a raw Project Gutenberg file.

    Args:
        file_path: Path to raw text file

    Returns:
        Tuple of (cleaned_text, filename stem without extension)
    """
    # errors='ignore' drops undecodable bytes — Gutenberg files
    # occasionally contain stray non-UTF-8 characters.
    text = file_path.read_text(encoding='utf-8', errors='ignore')
    cleaned_text = TextCleaner.clean_text(text)
    return cleaned_text, file_path.stem