# MedSearchPro / embeddings / text_chunking.py
# (paulhemb — "Initial Backend Deployment", commit 1367957)
# embeddings/text_chunking.py
"""
Multiple text chunking strategies for research papers
"""
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
class ChunkingStrategy(ABC):
    """Interface implemented by every text chunking strategy.

    A strategy turns one raw text string into a list of chunk dicts,
    each carrying at least 'text', 'start_char', 'end_char' and
    'chunk_size'.
    """

    @abstractmethod
    def chunk_text(self, text: str, **kwargs) -> List[Dict[str, Any]]:
        """Split *text* into a list of chunk dictionaries."""
        ...
class FixedSizeChunking(ChunkingStrategy):
    """Fixed-size chunking with character overlap between consecutive chunks.

    Chunk boundaries are nudged back to the nearest sentence or word
    boundary when one exists in the second half of the window, so chunks
    avoid cutting sentences or words in half.
    """

    def chunk_text(self, text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Dict[str, Any]]:
        """Split *text* into overlapping, roughly fixed-size chunks.

        Args:
            text: Raw text to split.
            chunk_size: Target maximum chunk length in characters.
            chunk_overlap: Characters of overlap between consecutive chunks.

        Returns:
            List of dicts with 'text', 'start_char', 'end_char', 'chunk_size'.
            Empty input yields an empty list.
        """
        chunks: List[Dict[str, Any]] = []
        start = 0
        text_length = len(text)
        while start < text_length:
            end = min(start + chunk_size, text_length)
            if end < text_length:
                # Prefer ending on a sentence boundary ('. '), but only if it
                # lies in the second half of the window so chunks stay sizable.
                sentence_end = text.rfind('. ', start, end)
                if sentence_end != -1 and sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
                else:
                    # Otherwise fall back to the nearest word boundary.
                    word_end = text.rfind(' ', start, end)
                    if word_end != -1 and word_end > start + chunk_size // 2:
                        end = word_end
            chunk = text[start:end].strip()
            if chunk:
                chunks.append({
                    'text': chunk,
                    'start_char': start,
                    'end_char': end,
                    'chunk_size': len(chunk)
                })
            # BUG FIX: stop once the end of the text has been reached.
            # Previously the overlap step (start = end - chunk_overlap)
            # re-entered the loop and emitted a tiny duplicate chunk
            # containing only the last `chunk_overlap` characters.
            if end >= text_length:
                break
            # Step forward with overlap, guaranteeing progress even when the
            # boundary adjustment pulled `end` back close to `start`.
            start = end - chunk_overlap if end - chunk_overlap > start else end
        return chunks
class SemanticChunking(ChunkingStrategy):
    """Semantic chunking that packs whole paragraphs into chunks.

    Paragraphs (blank-line separated) are greedily accumulated until adding
    another paragraph would exceed `max_chunk_size`; paragraphs are never
    split internally.
    """

    def chunk_text(self, text: str, max_chunk_size: int = 512) -> List[Dict[str, Any]]:
        """Split *text* into paragraph-aligned chunks.

        Args:
            text: Raw text to split.
            max_chunk_size: Soft cap on chunk length in characters. A single
                paragraph longer than this still becomes one oversized chunk.

        Returns:
            List of dicts with 'text', 'start_char', 'end_char',
            'chunk_size' and 'type' == 'semantic'.
        """
        chunks: List[Dict[str, Any]] = []
        # Split by paragraphs first, dropping empty segments.
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        current_chunk = ""
        current_start = 0
        search_pos = 0  # Where to resume locating paragraphs in the original text.
        for paragraph in paragraphs:
            # BUG FIX: locate each paragraph from `search_pos` onward so that
            # repeated paragraphs map to their own occurrence. The old code
            # used text.find(paragraph) (always the first occurrence) and
            # updated current_start by len("") right after resetting the
            # buffer, so start/end offsets were wrong after the first chunk.
            para_pos = text.find(paragraph, search_pos)
            if para_pos == -1:  # Defensive: a stripped paragraph should always be found.
                para_pos = search_pos
            search_pos = para_pos + len(paragraph)
            # Flush the current chunk if this paragraph would overflow it.
            if current_chunk and len(current_chunk) + len(paragraph) > max_chunk_size:
                chunks.append({
                    'text': current_chunk.strip(),
                    'start_char': current_start,
                    'end_char': current_start + len(current_chunk),
                    'chunk_size': len(current_chunk),
                    'type': 'semantic'
                })
                current_chunk = ""
            # Add the paragraph to the (possibly fresh) current chunk.
            if current_chunk:
                current_chunk += "\n\n" + paragraph
            else:
                current_chunk = paragraph
                current_start = para_pos
        # Flush the final, partially filled chunk.
        if current_chunk:
            chunks.append({
                'text': current_chunk.strip(),
                'start_char': current_start,
                'end_char': current_start + len(current_chunk),
                'chunk_size': len(current_chunk),
                'type': 'semantic'
            })
        return chunks
class ResearchPaperChunker:
    """Chunks research papers (title + abstract) using a pluggable strategy."""

    def __init__(self, strategy: str = "semantic"):
        """Create a paper chunker.

        Args:
            strategy: Name of the chunking strategy: "fixed" or "semantic".
                Unknown names silently fall back to "semantic" at chunk time.
        """
        self.strategy = strategy
        # Registry of available strategy implementations.
        self.chunkers: Dict[str, ChunkingStrategy] = {
            "fixed": FixedSizeChunking(),
            "semantic": SemanticChunking()
        }

    def chunk_paper(self, paper: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Chunk one paper dict into metadata-enriched chunk dicts.

        Args:
            paper: Dict with 'title' and 'abstract'; 'id', 'source', 'domain',
                'publication_date' and 'authors' are copied into each chunk
                when present (missing keys default to empty values).

        Returns:
            Chunk dicts from the active strategy, each augmented with the
            paper's metadata and the strategy name used.
        """
        paper_id = paper.get('id', 'unknown')
        title = paper.get('title', '')
        abstract = paper.get('abstract', '')
        # Only title + abstract are chunked; full body text is not available here.
        full_text = f"Title: {title}\n\nAbstract: {abstract}"
        # Unknown strategy names fall back to semantic chunking.
        chunker = self.chunkers.get(self.strategy, self.chunkers["semantic"])
        chunks = chunker.chunk_text(full_text)
        # Attach paper-level metadata to every chunk.
        for chunk in chunks:
            chunk.update({
                'paper_id': paper_id,
                'paper_title': title,
                'source': paper.get('source', ''),
                'domain': paper.get('domain', ''),
                'publication_date': paper.get('publication_date', ''),
                'authors': paper.get('authors', []),
                'chunk_strategy': self.strategy
            })
        return chunks

    def batch_chunk_papers(self, papers: List[Dict], strategy: Optional[str] = None) -> List[Dict[str, Any]]:
        """Chunk many papers, skipping any that raise.

        Args:
            papers: Paper dicts (see `chunk_paper`).
            strategy: Optional strategy override. NOTE: this permanently
                switches `self.strategy` for subsequent calls as well
                (pre-existing behavior, kept for compatibility).

        Returns:
            Flat list of chunk dicts from all successfully chunked papers.
        """
        if strategy:
            self.strategy = strategy
        all_chunks: List[Dict[str, Any]] = []
        for paper in papers:
            try:
                all_chunks.extend(self.chunk_paper(paper))
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                print(f"❌ Error chunking paper {paper.get('id', 'unknown')}: {e}")
                continue
        print(f"✅ Chunked {len(papers)} papers into {len(all_chunks)} chunks")
        return all_chunks
# Quick test
def test_chunking_strategies():
    """Smoke-test each chunking strategy on a sample paper, printing results."""
    test_paper = {
        'id': 'test_001',
        'title': 'Deep Learning for Medical Image Analysis',
        'abstract': 'This paper explores the application of deep learning techniques in medical image analysis. We propose a novel transformer-based architecture that achieves state-of-the-art performance on multiple benchmark datasets. Our method improves accuracy by 15% compared to previous approaches. The model is evaluated on CT, MRI, and X-ray datasets showing consistent improvements across modalities.',
        'source': 'test',
        'domain': 'medical_imaging'
    }
    print("🧪 Testing Chunking Strategies")
    print("=" * 50)
    for strategy in ["fixed", "semantic"]:
        print(f"\n🔬 Strategy: {strategy}")
        # BUG FIX: the loop variable was previously unused — one chunker built
        # before the loop always ran its default ("semantic") strategy, so the
        # "fixed" strategy was never actually exercised.
        chunker = ResearchPaperChunker(strategy=strategy)
        chunks = chunker.chunk_paper(test_paper)
        print(f"   Number of chunks: {len(chunks)}")
        for i, chunk in enumerate(chunks):
            print(f"   Chunk {i + 1}: {chunk['chunk_size']} chars - {chunk['text'][:80]}...")
# Allow running this module directly as a quick manual smoke test.
if __name__ == "__main__":
    test_chunking_strategies()