# embeddings/text_chunking.py
"""
Multiple text chunking strategies for research papers
"""
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
class ChunkingStrategy(ABC):
    """Interface that every text-chunking strategy must implement."""

    @abstractmethod
    def chunk_text(self, text: str, **kwargs) -> List[Dict[str, Any]]:
        """Split *text* into a list of chunk dictionaries (one dict per chunk)."""
        pass
class FixedSizeChunking(ChunkingStrategy):
    """Fixed-size chunking with overlap, preferring sentence/word boundaries."""

    def chunk_text(self, text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Dict[str, Any]]:
        """Split *text* into overlapping chunks of roughly *chunk_size* characters.

        Each chunk dict carries 'text', 'start_char', 'end_char' and 'chunk_size'.

        Raises:
            ValueError: if chunk_size <= 0 (the scan window would never advance,
                looping forever) or chunk_overlap < 0 (the cursor would jump
                past `end` and silently skip text).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be non-negative")
        chunks: List[Dict[str, Any]] = []
        start = 0
        text_length = len(text)
        while start < text_length:
            end = min(start + chunk_size, text_length)
            if end < text_length:
                # Prefer ending on a sentence boundary, but only if it falls in
                # the second half of the window (avoids tiny chunks).
                sentence_end = text.rfind('. ', start, end)
                if sentence_end != -1 and sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1  # keep the period with this chunk
                else:
                    # Otherwise back up to a word boundary so words aren't split.
                    word_end = text.rfind(' ', start, end)
                    if word_end != -1 and word_end > start + chunk_size // 2:
                        end = word_end
            chunk = text[start:end].strip()
            if chunk:
                chunks.append({
                    'text': chunk,
                    'start_char': start,
                    'end_char': end,
                    'chunk_size': len(chunk),
                })
            # Step forward with overlap; the guard guarantees forward progress
            # even when chunk_overlap >= the distance actually covered.
            start = end - chunk_overlap if end - chunk_overlap > start else end
        return chunks
class SemanticChunking(ChunkingStrategy):
    """Semantic chunking: groups whole paragraphs up to a character budget."""

    def chunk_text(self, text: str, max_chunk_size: int = 512) -> List[Dict[str, Any]]:
        """Group blank-line-separated paragraphs into chunks of <= *max_chunk_size* chars.

        A single paragraph larger than the budget still becomes its own chunk.
        Each chunk dict carries 'text', 'start_char', 'end_char', 'chunk_size'
        and 'type' == 'semantic'. 'end_char' is approximate when the original
        separator was not exactly one blank line.
        """
        chunks: List[Dict[str, Any]] = []
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        current_chunk = ""
        current_start = 0
        # Forward-only search cursor: without it, text.find(paragraph) maps a
        # repeated paragraph back to its FIRST occurrence. (The original code
        # also updated current_start only after current_chunk was cleared,
        # making that update a no-op.)
        search_from = 0
        for paragraph in paragraphs:
            # Flush the current chunk if adding this paragraph would exceed the budget.
            if current_chunk and len(current_chunk) + len(paragraph) > max_chunk_size:
                chunks.append(self._build_chunk(current_chunk, current_start))
                current_chunk = ""
            if current_chunk:
                current_chunk += "\n\n" + paragraph
            else:
                current_chunk = paragraph
                pos = text.find(paragraph, search_from)
                current_start = pos if pos != -1 else search_from
            search_from = current_start + len(current_chunk)
        # Add the last chunk
        if current_chunk:
            chunks.append(self._build_chunk(current_chunk, current_start))
        return chunks

    @staticmethod
    def _build_chunk(chunk_body: str, start: int) -> Dict[str, Any]:
        """Build one chunk record from the accumulated paragraph text."""
        return {
            'text': chunk_body.strip(),
            'start_char': start,
            'end_char': start + len(chunk_body),
            'chunk_size': len(chunk_body),
            'type': 'semantic',
        }
class ResearchPaperChunker:
    """Specialized chunker for research papers (title + abstract)."""

    def __init__(self, strategy: str = "semantic"):
        """strategy: 'fixed' or 'semantic'; unknown names fall back to 'semantic'."""
        self.strategy = strategy
        self.chunkers: Dict[str, ChunkingStrategy] = {
            "fixed": FixedSizeChunking(),
            "semantic": SemanticChunking(),
        }

    def chunk_paper(self, paper: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Chunk a research paper into manageable pieces.

        Only 'title' and 'abstract' are chunked; the other paper fields
        ('source', 'domain', 'publication_date', 'authors') are copied onto
        every chunk as metadata. Missing fields default to empty values.
        """
        paper_id = paper.get('id', 'unknown')
        title = paper.get('title', '')
        abstract = paper.get('abstract', '')
        # Combine title and abstract into one text to chunk.
        full_text = f"Title: {title}\n\nAbstract: {abstract}"
        # Unknown strategy names silently fall back to semantic chunking.
        chunker = self.chunkers.get(self.strategy, self.chunkers["semantic"])
        chunks = chunker.chunk_text(full_text)
        # Attach paper metadata to each chunk.
        for chunk in chunks:
            chunk.update({
                'paper_id': paper_id,
                'paper_title': title,
                'source': paper.get('source', ''),
                'domain': paper.get('domain', ''),
                'publication_date': paper.get('publication_date', ''),
                'authors': paper.get('authors', []),
                'chunk_strategy': self.strategy,
            })
        return chunks

    def batch_chunk_papers(self, papers: List[Dict], strategy: Optional[str] = None) -> List[Dict[str, Any]]:
        """Chunk multiple papers into one flat chunk list.

        NOTE: passing *strategy* permanently switches self.strategy for later
        calls too (existing behavior, kept for compatibility). Papers that
        raise during chunking are skipped best-effort with a printed error.
        """
        if strategy:
            self.strategy = strategy
        all_chunks: List[Dict[str, Any]] = []
        for paper in papers:
            try:
                all_chunks.extend(self.chunk_paper(paper))
            except Exception as e:  # best-effort batch: report and continue
                print(f"❌ Error chunking paper {paper.get('id', 'unknown')}: {e}")
                continue
        print(f"✅ Chunked {len(papers)} papers into {len(all_chunks)} chunks")
        return all_chunks
# Quick test
def test_chunking_strategies():
    """Smoke-test both chunking strategies on a sample paper and print results."""
    test_paper = {
        'id': 'test_001',
        'title': 'Deep Learning for Medical Image Analysis',
        'abstract': 'This paper explores the application of deep learning techniques in medical image analysis. We propose a novel transformer-based architecture that achieves state-of-the-art performance on multiple benchmark datasets. Our method improves accuracy by 15% compared to previous approaches. The model is evaluated on CT, MRI, and X-ray datasets showing consistent improvements across modalities.',
        'source': 'test',
        'domain': 'medical_imaging'
    }
    print("🧪 Testing Chunking Strategies")
    print("=" * 50)
    for strategy in ["fixed", "semantic"]:
        print(f"\n🔬 Strategy: {strategy}")
        # BUG FIX: the loop variable was previously never applied — a single
        # chunker built outside the loop chunked with the default ('semantic')
        # strategy on every iteration.
        chunker = ResearchPaperChunker(strategy=strategy)
        chunks = chunker.chunk_paper(test_paper)
        print(f"   Number of chunks: {len(chunks)}")
        for i, chunk in enumerate(chunks):
            print(f"   Chunk {i + 1}: {chunk['chunk_size']} chars - {chunk['text'][:80]}...")


if __name__ == "__main__":
    test_chunking_strategies()