| """
|
| AI Research Paper Helper - Text Processor
|
| Handles text cleaning, section segmentation, and chunking.
|
| """
|
|
|
| import re
|
| from typing import List, Dict, Optional
|
| from dataclasses import dataclass
|
| import logging
|
|
|
| from config import settings
|
|
|
| logger = logging.getLogger(__name__)
|
|
|
|
|
@dataclass
class TextChunk:
    """Represents a chunk of text with metadata.

    Produced by TextProcessor.chunk_text; the character offsets refer to
    positions in the cleaned (whitespace-normalized) text the chunk was
    cut from.
    """
    id: str  # unique id of the form "{prefix}_{index}", e.g. "chunk_0_1"
    text: str  # the chunk's cleaned text
    section: Optional[str]  # originating section title, if known
    start_idx: int  # character offset of the chunk start in the cleaned text
    end_idx: int  # character offset just past the chunk end
    token_count: int  # number of words in the chunk (words, not model tokens)
|
|
|
|
|
@dataclass
class Section:
    """Represents a document section, as built by
    TextProcessor.extract_sections."""
    title: str  # raw heading line (or supplied 'title' value)
    content: str  # cleaned section body text
    section_type: str  # canonical key from TextProcessor.SECTION_PATTERNS, or 'other'
    level: int  # heading level; defaults to 2 when not supplied
|
|
|
|
|
| class TextProcessor:
|
| """Handles all text processing operations."""
|
|
|
|
|
| SECTION_PATTERNS = {
|
| 'abstract': r'^abstract\s*$',
|
| 'introduction': r'^(1\.?\s*)?introduction\s*$',
|
| 'related_work': r'^(2\.?\s*)?(related\s+work|background|literature\s+review)\s*$',
|
| 'methods': r'^(3\.?\s*)?(method(s|ology)?|approach|model)\s*$',
|
| 'experiments': r'^(4\.?\s*)?(experiment(s)?|evaluation|results)\s*$',
|
| 'results': r'^(5\.?\s*)?(result(s)?|finding(s)?)\s*$',
|
| 'discussion': r'^(6\.?\s*)?discussion\s*$',
|
| 'conclusion': r'^(7\.?\s*)?(conclusion(s)?|summary)\s*$',
|
| 'references': r'^references?\s*$',
|
| 'appendix': r'^appendix\s*'
|
| }
|
|
|
| def __init__(self):
|
| self.chunk_size = settings.chunk_size
|
| self.chunk_overlap = settings.chunk_overlap
|
|
|
| def clean_text(self, text: str) -> str:
|
| """Clean and normalize text."""
|
| if not text:
|
| return ""
|
|
|
|
|
| text = re.sub(r'\s+', ' ', text)
|
|
|
|
|
| text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
|
|
|
|
|
| text = text.replace('"', '"').replace('"', '"')
|
| text = text.replace(''', "'").replace(''', "'")
|
|
|
|
|
| text = re.sub(r' {2,}', ' ', text)
|
|
|
| return text.strip()
|
|
|
| def extract_sections(self, content: str, sections_data: List[Dict] = None) -> List[Section]:
|
| """Extract sections from document content."""
|
| if sections_data:
|
|
|
| return [
|
| Section(
|
| title=s.get('title', ''),
|
| content=self.clean_text(s.get('content', '')),
|
| section_type=self._classify_section(s.get('title', '')),
|
| level=s.get('level', 2)
|
| )
|
| for s in sections_data
|
| ]
|
|
|
|
|
| sections = []
|
| lines = content.split('\n')
|
| current_section = None
|
| current_content = []
|
|
|
| for line in lines:
|
| line = line.strip()
|
| if not line:
|
| continue
|
|
|
|
|
| section_type = self._classify_section(line)
|
| if section_type != 'other' and len(line) < 100:
|
|
|
| if current_section is not None:
|
| sections.append(Section(
|
| title=current_section,
|
| content=self.clean_text(' '.join(current_content)),
|
| section_type=self._classify_section(current_section),
|
| level=2
|
| ))
|
|
|
| current_section = line
|
| current_content = []
|
| else:
|
| current_content.append(line)
|
|
|
|
|
| if current_section is not None:
|
| sections.append(Section(
|
| title=current_section,
|
| content=self.clean_text(' '.join(current_content)),
|
| section_type=self._classify_section(current_section),
|
| level=2
|
| ))
|
|
|
| return sections
|
|
|
| def _classify_section(self, title: str) -> str:
|
| """Classify a section based on its title."""
|
| title_lower = title.lower().strip()
|
|
|
| for section_type, pattern in self.SECTION_PATTERNS.items():
|
| if re.match(pattern, title_lower, re.IGNORECASE):
|
| return section_type
|
|
|
| return 'other'
|
|
|
| def chunk_text(
|
| self,
|
| text: str,
|
| section: Optional[str] = None,
|
| chunk_id_prefix: str = "chunk"
|
| ) -> List[TextChunk]:
|
| """
|
| Split text into overlapping chunks.
|
|
|
| Uses word-based chunking to respect word boundaries.
|
| Overlap ensures context is preserved across chunks.
|
| """
|
| if not text:
|
| return []
|
|
|
| text = self.clean_text(text)
|
| words = text.split()
|
|
|
| if not words:
|
| return []
|
|
|
| chunks = []
|
| chunk_idx = 0
|
| word_idx = 0
|
|
|
|
|
| words_per_chunk = self.chunk_size
|
| overlap_words = self.chunk_overlap
|
|
|
| while word_idx < len(words):
|
|
|
| end_idx = min(word_idx + words_per_chunk, len(words))
|
| chunk_words = words[word_idx:end_idx]
|
| chunk_text = ' '.join(chunk_words)
|
|
|
|
|
| start_char = len(' '.join(words[:word_idx])) + (1 if word_idx > 0 else 0)
|
| end_char = start_char + len(chunk_text)
|
|
|
| chunks.append(TextChunk(
|
| id=f"{chunk_id_prefix}_{chunk_idx}",
|
| text=chunk_text,
|
| section=section,
|
| start_idx=start_char,
|
| end_idx=end_char,
|
| token_count=len(chunk_words)
|
| ))
|
|
|
| chunk_idx += 1
|
| word_idx = end_idx - overlap_words
|
|
|
|
|
| if word_idx >= len(words) or end_idx >= len(words):
|
| break
|
| if word_idx <= end_idx - words_per_chunk:
|
| word_idx = end_idx
|
|
|
| return chunks
|
|
|
| def chunk_document(
|
| self,
|
| content: str,
|
| abstract: Optional[str] = None,
|
| sections: List[Dict] = None
|
| ) -> List[TextChunk]:
|
| """
|
| Chunk an entire document, respecting section boundaries.
|
|
|
| Strategy:
|
| 1. Abstract gets its own chunk(s)
|
| 2. Each section is chunked separately
|
| 3. If no sections, chunk the entire content
|
| """
|
| all_chunks = []
|
| chunk_counter = 0
|
|
|
|
|
| if abstract:
|
| abstract_chunks = self.chunk_text(
|
| abstract,
|
| section="abstract",
|
| chunk_id_prefix=f"chunk_{chunk_counter}"
|
| )
|
| all_chunks.extend(abstract_chunks)
|
| chunk_counter += len(abstract_chunks)
|
|
|
|
|
| if sections:
|
| parsed_sections = self.extract_sections(content, sections)
|
|
|
| for section in parsed_sections:
|
| if section.section_type == 'references':
|
| continue
|
|
|
| section_chunks = self.chunk_text(
|
| section.content,
|
| section=section.title,
|
| chunk_id_prefix=f"chunk_{chunk_counter}"
|
| )
|
| all_chunks.extend(section_chunks)
|
| chunk_counter += len(section_chunks)
|
| else:
|
|
|
| content_chunks = self.chunk_text(
|
| content,
|
| section=None,
|
| chunk_id_prefix=f"chunk_{chunk_counter}"
|
| )
|
| all_chunks.extend(content_chunks)
|
|
|
| return all_chunks
|
|
|
| def extract_sentences(self, text: str) -> List[str]:
|
| """Extract sentences from text."""
|
|
|
| sentences = re.split(r'(?<=[.!?])\s+', text)
|
| return [s.strip() for s in sentences if s.strip()]
|
|
|
| def truncate_to_limit(self, text: str, max_words: int = 500) -> str:
|
| """Truncate text to a maximum number of words."""
|
| words = text.split()
|
| if len(words) <= max_words:
|
| return text
|
| return ' '.join(words[:max_words]) + '...'
|
|
|
|
|
|
|
# Lazily-created process-wide TextProcessor instance.
_text_processor = None


def get_text_processor() -> TextProcessor:
    """Return the singleton TextProcessor, constructing it on first use."""
    global _text_processor
    if _text_processor is not None:
        return _text_processor
    _text_processor = TextProcessor()
    return _text_processor
|
|
|