# francis-botcon / src/text_processor.py
# Author: Rojaldo — "Initialize Francis Botcon Gradio Space with model files" (commit 4e5fc16)
"""Text processing utilities for Francis Botcon project."""
import re
from pathlib import Path
from typing import List, Tuple
from src.logger import get_logger
logger = get_logger(__name__)
class TextCleaner:
    """Clean and preprocess texts from Project Gutenberg."""

    # Project Gutenberg start/end marker lines, e.g.
    # "*** START OF THE PROJECT GUTENBERG EBOOK THE ESSAYS ***".
    # The markers sit on a single line, so the `.*?` wildcards are
    # deliberately used WITHOUT re.DOTALL (see remove_pg_metadata).
    PG_HEADER_PATTERN = r"\*\*\*.*?START.*?PROJECT GUTENBERG.*?\*\*\*"
    PG_FOOTER_PATTERN = r"\*\*\*.*?END.*?PROJECT GUTENBERG.*?\*\*\*"

    @staticmethod
    def remove_pg_metadata(text: str) -> str:
        """Remove Project Gutenberg boilerplate around the book text.

        Strips everything up to and including the START marker (the PG
        preamble) and everything from the END marker onward (the PG
        license). The marker patterns are matched without re.DOTALL:
        applying DOTALL to the whole pattern (as a naive implementation
        would) lets `\\*\\*\\*.*?END` begin at any "***" scene separator
        inside the book and swallow real text up to the END marker.
        Only the prefix/suffix wildcards are allowed to cross lines,
        via locally scoped `(?s:...)` groups.

        Args:
            text: Raw text from Project Gutenberg

        Returns:
            Cleaned text (unchanged if no marker is found)
        """
        # Drop the preamble plus the START marker line. `\A.*?` is
        # non-greedy so only the first START marker terminates it.
        text = re.sub(
            r"(?s:\A.*?)" + TextCleaner.PG_HEADER_PATTERN,
            "",
            text,
            flags=re.IGNORECASE,
        )
        # Drop the END marker line plus the trailing license text.
        text = re.sub(
            TextCleaner.PG_FOOTER_PATTERN + r"(?s:.*)\Z",
            "",
            text,
            flags=re.IGNORECASE,
        )
        return text

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Normalize whitespace in text.

        Collapses runs of spaces to one space, runs of blank lines to a
        single blank line, and strips leading/trailing whitespace.

        Args:
            text: Input text

        Returns:
            Text with normalized whitespace
        """
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n\n+', '\n\n', text)
        return text.strip()

    @staticmethod
    def clean_text(text: str) -> str:
        """Apply all cleaning operations (metadata removal, whitespace).

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        text = TextCleaner.remove_pg_metadata(text)
        text = TextCleaner.normalize_whitespace(text)
        return text
class TextSegmenter:
    """Segment text into meaningful chunks."""

    @staticmethod
    def segment_by_paragraphs(text: str, min_length: int = 100) -> List[str]:
        """Segment text into paragraphs.

        Args:
            text: Input text
            min_length: Minimum paragraph length in characters

        Returns:
            List of stripped paragraph segments, short ones dropped
        """
        stripped = (p.strip() for p in text.split('\n\n'))
        return [p for p in stripped if len(p) >= min_length]

    @staticmethod
    def segment_by_length(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
        """Segment text into roughly fixed-size chunks with overlap.

        Words are accumulated until the running character count reaches
        `chunk_size`; the last ~`overlap` characters (approximated as
        overlap // 5 words) are carried into the next chunk.

        Args:
            text: Input text
            chunk_size: Target size of each chunk in characters
            overlap: Approximate overlap between chunks in characters

        Returns:
            List of text chunks
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0
        # Number of words added since the last flush; prevents emitting
        # a trailing chunk that consists only of already-emitted overlap.
        fresh_words = 0
        # Guard: overlap < 5 would make `[-0:]` keep the WHOLE chunk,
        # so compute the carry count explicitly.
        keep = overlap // 5
        for word in text.split():
            current_chunk.append(word)
            current_size += len(word) + 1  # +1 for the joining space
            fresh_words += 1
            if current_size >= chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = current_chunk[-keep:] if keep else []
                current_size = sum(len(w) for w in current_chunk)
                fresh_words = 0
        # Flush the tail only if it holds words not yet emitted.
        if current_chunk and fresh_words:
            chunks.append(' '.join(current_chunk))
        return chunks

    @staticmethod
    def extract_title_and_author(text: str) -> Tuple[str, str]:
        """Extract title and author from the opening lines of a text.

        Scans the first 50 lines for a byline mentioning Bacon; the line
        preceding the byline is taken as the title.

        Args:
            text: Input text

        Returns:
            Tuple of (title, author); defaults to
            ("Unknown", "Francis Bacon") if no byline is found
        """
        lines = text.split('\n')
        title = "Unknown"
        author = "Francis Bacon"
        for i, line in enumerate(lines[:50]):
            low = line.lower()
            # \bby\b avoids false hits on words like "nearby" or "baby".
            if re.search(r'\bby\b', low) and 'bacon' in low:
                author = line.strip()
                if i > 0:
                    title = lines[i - 1].strip()
                break
        return title, author
def process_raw_file(file_path: Path) -> Tuple[str, str]:
    """Process a raw Project Gutenberg file.

    Reads the file as UTF-8 (undecodable bytes are ignored) and runs the
    full cleaning pipeline over its contents.

    Args:
        file_path: Path to raw text file

    Returns:
        Tuple of (cleaned_text, filename stem without extension)
    """
    raw = file_path.read_text(encoding='utf-8', errors='ignore')
    return TextCleaner.clean_text(raw), file_path.stem