# francis-botcon / src/text_processor.py
# Author: Rojaldo — "Initialize Francis Botcon Gradio Space with model files" (commit 4e5fc16)
"""Text processing utilities for Francis Botcon project."""
import re
from pathlib import Path
from typing import List, Tuple
from src.logger import get_logger
logger = get_logger(__name__)
class TextCleaner:
    """Clean and preprocess texts from Project Gutenberg."""

    # Project Gutenberg start/end marker lines, e.g.
    # "*** START OF THE PROJECT GUTENBERG EBOOK THE ESSAYS ***".
    # The markers sit on a single line, so the `.*?` wildcards are
    # deliberately used WITHOUT re.DOTALL (see remove_pg_metadata).
    PG_HEADER_PATTERN = r"\*\*\*.*?START.*?PROJECT GUTENBERG.*?\*\*\*"
    PG_FOOTER_PATTERN = r"\*\*\*.*?END.*?PROJECT GUTENBERG.*?\*\*\*"

    @staticmethod
    def remove_pg_metadata(text: str) -> str:
        """Remove Project Gutenberg boilerplate around the book text.

        Strips everything up to and including the START marker (the PG
        preamble) and everything from the END marker onward (the PG
        license). The marker patterns are matched without re.DOTALL:
        applying DOTALL to the whole pattern (as a naive implementation
        would) lets `\\*\\*\\*.*?END` begin at any "***" scene separator
        inside the book and swallow real text up to the END marker.
        Only the prefix/suffix wildcards are allowed to cross lines,
        via locally scoped `(?s:...)` groups.

        Args:
            text: Raw text from Project Gutenberg

        Returns:
            Cleaned text (unchanged if no marker is found)
        """
        # Drop the preamble plus the START marker line. `\A.*?` is
        # non-greedy so only the first START marker terminates it.
        text = re.sub(
            r"(?s:\A.*?)" + TextCleaner.PG_HEADER_PATTERN,
            "",
            text,
            flags=re.IGNORECASE,
        )
        # Drop the END marker line plus the trailing license text.
        text = re.sub(
            TextCleaner.PG_FOOTER_PATTERN + r"(?s:.*)\Z",
            "",
            text,
            flags=re.IGNORECASE,
        )
        return text

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Normalize whitespace in text.

        Collapses runs of spaces to one space, runs of blank lines to a
        single blank line, and strips leading/trailing whitespace.

        Args:
            text: Input text

        Returns:
            Text with normalized whitespace
        """
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n\n+', '\n\n', text)
        return text.strip()

    @staticmethod
    def clean_text(text: str) -> str:
        """Apply all cleaning operations (metadata removal, whitespace).

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        text = TextCleaner.remove_pg_metadata(text)
        text = TextCleaner.normalize_whitespace(text)
        return text
class TextSegmenter:
    """Segment text into meaningful chunks."""

    @staticmethod
    def segment_by_paragraphs(text: str, min_length: int = 100) -> List[str]:
        """Segment text into paragraphs.

        Args:
            text: Input text
            min_length: Minimum paragraph length in characters

        Returns:
            List of stripped paragraph segments, short ones dropped
        """
        stripped = (p.strip() for p in text.split('\n\n'))
        return [p for p in stripped if len(p) >= min_length]

    @staticmethod
    def segment_by_length(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
        """Segment text into roughly fixed-size chunks with overlap.

        Words are accumulated until the running character count reaches
        `chunk_size`; the last ~`overlap` characters (approximated as
        overlap // 5 words) are carried into the next chunk.

        Args:
            text: Input text
            chunk_size: Target size of each chunk in characters
            overlap: Approximate overlap between chunks in characters

        Returns:
            List of text chunks
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0
        # Number of words added since the last flush; prevents emitting
        # a trailing chunk that consists only of already-emitted overlap.
        fresh_words = 0
        # Guard: overlap < 5 would make `[-0:]` keep the WHOLE chunk,
        # so compute the carry count explicitly.
        keep = overlap // 5
        for word in text.split():
            current_chunk.append(word)
            current_size += len(word) + 1  # +1 for the joining space
            fresh_words += 1
            if current_size >= chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = current_chunk[-keep:] if keep else []
                current_size = sum(len(w) for w in current_chunk)
                fresh_words = 0
        # Flush the tail only if it holds words not yet emitted.
        if current_chunk and fresh_words:
            chunks.append(' '.join(current_chunk))
        return chunks

    @staticmethod
    def extract_title_and_author(text: str) -> Tuple[str, str]:
        """Extract title and author from the opening lines of a text.

        Scans the first 50 lines for a byline mentioning Bacon; the line
        preceding the byline is taken as the title.

        Args:
            text: Input text

        Returns:
            Tuple of (title, author); defaults to
            ("Unknown", "Francis Bacon") if no byline is found
        """
        lines = text.split('\n')
        title = "Unknown"
        author = "Francis Bacon"
        for i, line in enumerate(lines[:50]):
            low = line.lower()
            # \bby\b avoids false hits on words like "nearby" or "baby".
            if re.search(r'\bby\b', low) and 'bacon' in low:
                author = line.strip()
                if i > 0:
                    title = lines[i - 1].strip()
                break
        return title, author
def process_raw_file(file_path: Path) -> Tuple[str, str]:
    """Process a raw Project Gutenberg file.

    Reads the file as UTF-8 (undecodable bytes are ignored) and runs the
    full cleaning pipeline over its contents.

    Args:
        file_path: Path to raw text file

    Returns:
        Tuple of (cleaned_text, filename stem without extension)
    """
    raw = file_path.read_text(encoding='utf-8', errors='ignore')
    return TextCleaner.clean_text(raw), file_path.stem