Spaces:
Running
Running
| # embeddings/text_chunking.py | |
| """ | |
| Multiple text chunking strategies for research papers | |
| """ | |
| import re | |
| from typing import List, Dict, Any | |
| from abc import ABC, abstractmethod | |
class ChunkingStrategy(ABC):
    """Abstract base class for text chunking strategies.

    Subclasses implement ``chunk_text`` and return a list of chunk dicts,
    each containing at least 'text', 'start_char', 'end_char', 'chunk_size'.
    """

    @abstractmethod
    def chunk_text(self, text: str, **kwargs) -> List[Dict[str, Any]]:
        """Split *text* into chunks; must be overridden by subclasses.

        Bug fix: ``abstractmethod`` was imported but never applied, so the
        base class was instantiable and its ``chunk_text`` silently returned
        ``None``. Decorating it makes instantiation of the base a TypeError.
        """
        raise NotImplementedError
class FixedSizeChunking(ChunkingStrategy):
    """Fixed-size chunking with character overlap between consecutive chunks."""

    def chunk_text(self, text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Dict[str, Any]]:
        """Split *text* into ~chunk_size character pieces.

        Where possible, each piece ends on a sentence boundary ('. '), or
        failing that a word boundary, provided the boundary lies past the
        midpoint of the window. Consecutive pieces overlap by up to
        *chunk_overlap* characters; the cursor always advances, so the loop
        terminates even when the overlap would not move it forward.
        """
        chunks: List[Dict[str, Any]] = []
        total = len(text)
        cursor = 0
        while cursor < total:
            stop = self._pick_break(text, cursor, chunk_size)
            piece = text[cursor:stop].strip()
            if piece:
                chunks.append({
                    'text': piece,
                    'start_char': cursor,
                    'end_char': stop,
                    'chunk_size': len(piece)
                })
            overlapped = stop - chunk_overlap
            # Guarantee forward progress: only back up for overlap when the
            # overlapped position is still ahead of where this piece began.
            cursor = overlapped if overlapped > cursor else stop
        return chunks

    @staticmethod
    def _pick_break(text: str, start: int, chunk_size: int) -> int:
        """Choose an end index near start+chunk_size for one chunk.

        Prefers the last sentence boundary in the window, then the last word
        boundary, but only when the boundary lies past the window midpoint;
        otherwise cuts at the raw size limit. The final chunk ends at the
        end of the text unadjusted.
        """
        end = min(start + chunk_size, len(text))
        if end >= len(text):
            return end
        midpoint = start + chunk_size // 2
        sentence = text.rfind('. ', start, end)
        if sentence != -1 and sentence > midpoint:
            return sentence + 1
        word = text.rfind(' ', start, end)
        if word != -1 and word > midpoint:
            return word
        return end
class SemanticChunking(ChunkingStrategy):
    """Semantic chunking based on paragraph boundaries."""

    def chunk_text(self, text: str, max_chunk_size: int = 512) -> List[Dict[str, Any]]:
        """Group blank-line-separated paragraphs into chunks.

        Paragraphs are accumulated until adding the next one would push the
        chunk past *max_chunk_size* characters; a single oversized paragraph
        still becomes its own chunk.

        Bug fixes vs. the previous version:
        - 'start_char' now records where the chunk's FIRST paragraph begins.
          Previously it was overwritten with ``text.find(paragraph)`` for
          every paragraph appended, and the post-flush update ran after
          ``current_chunk`` was reset (a no-op).
        - Paragraphs are located with a forward-moving search cursor, so
          duplicate paragraph text no longer resolves to its first occurrence.
        """
        chunks: List[Dict[str, Any]] = []
        current_chunk = ""
        current_start = 0
        search_from = 0  # forward-only cursor for locating paragraphs in `text`

        for raw in text.split('\n\n'):
            paragraph = raw.strip()
            if not paragraph:
                continue
            para_start = text.find(paragraph, search_from)
            if para_start == -1:
                # Defensive: stripped paragraph should always be findable.
                para_start = search_from
            search_from = para_start + len(paragraph)

            # Flush the current chunk when this paragraph would overflow it.
            if current_chunk and len(current_chunk) + len(paragraph) > max_chunk_size:
                chunks.append(self._make_chunk(current_chunk, current_start))
                current_chunk = ""
            if current_chunk:
                current_chunk += "\n\n" + paragraph
            else:
                current_chunk = paragraph
                current_start = para_start

        # Flush the trailing chunk, if any.
        if current_chunk:
            chunks.append(self._make_chunk(current_chunk, current_start))
        return chunks

    @staticmethod
    def _make_chunk(chunk_text: str, start: int) -> Dict[str, Any]:
        """Build the chunk record shared by the overflow flush and final flush."""
        return {
            'text': chunk_text.strip(),
            'start_char': start,
            'end_char': start + len(chunk_text),
            'chunk_size': len(chunk_text),
            'type': 'semantic'
        }
class ResearchPaperChunker:
    """Specialized chunker for research papers.

    Dispatches to a registered ChunkingStrategy implementation and attaches
    paper-level metadata to every chunk produced.
    """

    def __init__(self, strategy: str = "semantic"):
        # Name of the active strategy; unknown names fall back to "semantic"
        # at chunking time (see chunk_paper).
        self.strategy = strategy
        self.chunkers = {
            "fixed": FixedSizeChunking(),
            "semantic": SemanticChunking()
        }

    def chunk_paper(self, paper: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Chunk a research paper into manageable pieces.

        Only the title and abstract are chunked — the full body text is not
        available here. Each returned chunk dict is augmented with the
        paper's identifying metadata.
        """
        paper_id = paper.get('id', 'unknown')
        title = paper.get('title', '')
        abstract = paper.get('abstract', '')
        # Combine title and abstract for chunking
        full_text = f"Title: {title}\n\nAbstract: {abstract}"
        # Unrecognized strategy names fall back to semantic chunking.
        chunker = self.chunkers.get(self.strategy, self.chunkers["semantic"])
        chunks = chunker.chunk_text(full_text)
        # Add paper metadata to each chunk
        for chunk in chunks:
            chunk.update({
                'paper_id': paper_id,
                'paper_title': title,
                'source': paper.get('source', ''),
                'domain': paper.get('domain', ''),
                'publication_date': paper.get('publication_date', ''),
                'authors': paper.get('authors', []),
                'chunk_strategy': self.strategy
            })
        return chunks

    def batch_chunk_papers(self, papers: List[Dict], strategy: str = None) -> List[Dict[str, Any]]:
        """Chunk multiple papers, skipping any paper that raises.

        Bug fix: the optional *strategy* override now applies only to this
        call — previously it permanently mutated ``self.strategy`` as a
        hidden side effect. The configured strategy is restored afterwards.
        """
        previous_strategy = self.strategy
        if strategy:
            self.strategy = strategy
        all_chunks: List[Dict[str, Any]] = []
        try:
            for paper in papers:
                try:
                    all_chunks.extend(self.chunk_paper(paper))
                except Exception as e:
                    # Best-effort batch: report the failure and continue.
                    print(f"❌ Error chunking paper {paper.get('id', 'unknown')}: {e}")
        finally:
            self.strategy = previous_strategy
        print(f"✅ Chunked {len(papers)} papers into {len(all_chunks)} chunks")
        return all_chunks
# Quick test
def test_chunking_strategies():
    """Smoke-test each registered chunking strategy on a sample paper."""
    test_paper = {
        'id': 'test_001',
        'title': 'Deep Learning for Medical Image Analysis',
        'abstract': 'This paper explores the application of deep learning techniques in medical image analysis. We propose a novel transformer-based architecture that achieves state-of-the-art performance on multiple benchmark datasets. Our method improves accuracy by 15% compared to previous approaches. The model is evaluated on CT, MRI, and X-ray datasets showing consistent improvements across modalities.',
        'source': 'test',
        'domain': 'medical_imaging'
    }
    print("🧪 Testing Chunking Strategies")
    print("=" * 50)
    for strategy in ["fixed", "semantic"]:
        # Bug fix: the strategy under test must actually be selected.
        # Previously one chunker was built before the loop and both
        # iterations ran with its default ("semantic") strategy.
        chunker = ResearchPaperChunker(strategy=strategy)
        print(f"\n🔬 Strategy: {strategy}")
        chunks = chunker.chunk_paper(test_paper)
        print(f" Number of chunks: {len(chunks)}")
        for i, chunk in enumerate(chunks):
            print(f" Chunk {i + 1}: {chunk['chunk_size']} chars - {chunk['text'][:80]}...")


if __name__ == "__main__":
    test_chunking_strategies()