Spaces:
Sleeping
Sleeping
| """ | |
| Document chunking with hierarchical approach. | |
| Creates overlapping chunks while preserving context. | |
| """ | |
| import json | |
| from pathlib import Path | |
| from typing import List, Dict, Tuple | |
| import PyPDF2 | |
| import tiktoken | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class ChunkProcessor:
    """Process PDFs into hierarchical, overlapping chunks for embedding."""

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """Initialize chunk processor.

        Args:
            chunk_size: Target size for chunks in tokens.
            overlap: Number of tokens to overlap between consecutive chunks.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        # gpt-4 tokenizer so token counts match the downstream model's.
        self.encoding = tiktoken.encoding_for_model("gpt-4")

    def count_tokens(self, text: str) -> int:
        """Count tokens in text using the configured tokenizer."""
        return len(self.encoding.encode(text))

    def extract_text_from_pdf(self, pdf_path: Path) -> List[Tuple[str, int]]:
        """Extract text from a PDF as (text, 1-based page number) pairs.

        Pages with no extractable text are skipped. On a read error the
        partial result gathered so far is returned (best-effort).
        """
        text_pages: List[Tuple[str, int]] = []
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages, 1):
                    text = page.extract_text()
                    # extract_text() may yield None/"" for image-only pages;
                    # guard before strip() so one bad page does not abort the
                    # whole PDF via the broad except below.
                    if text and text.strip():
                        text_pages.append((text, page_num))
        except Exception as e:
            logger.error(f"Error reading PDF {pdf_path}: {e}")
        return text_pages

    def chunk_text(self, text: str, source: str, page: int) -> List[Dict]:
        """Split text into overlapping chunks of ~``chunk_size`` tokens.

        Args:
            text: Raw page text to split.
            source: Source document name recorded in chunk metadata.
            page: 1-based page number recorded in chunk metadata.

        Returns:
            List of ``{"text": ..., "metadata": {...}}`` dicts.
        """
        chunks: List[Dict] = []

        # Naive sentence split on ". "; rstrip('.') before re-adding the
        # period so the final sentence does not end up with a doubled "..".
        sentences = text.replace('\n', ' ').split('. ')
        sentences = [s.strip().rstrip('.') + '.' for s in sentences if s.strip()]

        current_chunk: List[str] = []
        current_tokens = 0
        chunk_index = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # Flush the current chunk once adding this sentence would overflow.
            # (A single oversized sentence still becomes its own chunk.)
            if current_tokens + sentence_tokens > self.chunk_size and current_chunk:
                chunks.append(self._make_chunk(
                    current_chunk, source, page, chunk_index, current_tokens))

                # Seed the next chunk with trailing sentences, newest first
                # kept in order, up to `overlap` tokens.
                overlap_tokens = 0
                overlap_sentences: List[str] = []
                for sent in reversed(current_chunk):
                    sent_tokens = self.count_tokens(sent)
                    if overlap_tokens + sent_tokens <= self.overlap:
                        overlap_sentences.insert(0, sent)
                        overlap_tokens += sent_tokens
                    else:
                        break
                current_chunk = overlap_sentences
                current_tokens = overlap_tokens
                chunk_index += 1

            current_chunk.append(sentence)
            current_tokens += sentence_tokens

        # Flush whatever remains.
        if current_chunk:
            chunks.append(self._make_chunk(
                current_chunk, source, page, chunk_index, current_tokens))
        return chunks

    @staticmethod
    def _make_chunk(sentences: List[str], source: str, page: int,
                    chunk_index: int, token_count: int) -> Dict:
        """Build one chunk dict from accumulated sentences."""
        return {
            "text": ' '.join(sentences),
            "metadata": {
                "source": source,
                "page": page,
                "chunk_index": chunk_index,
                "token_count": token_count
            }
        }

    def chunk_pdf(self, pdf_path: Path) -> List[Dict]:
        """Process an entire PDF into chunks, page by page."""
        all_chunks: List[Dict] = []
        for text, page_num in self.extract_text_from_pdf(pdf_path):
            all_chunks.extend(self.chunk_text(text, pdf_path.name, page_num))
        logger.info(f"Created {len(all_chunks)} chunks from {pdf_path.name}")
        return all_chunks

    def process_directory(self, pdf_dir: Path) -> Dict[str, List[Dict]]:
        """Process all PDFs in a directory, grouping chunks by version.

        The version bucket is inferred from the filename
        (e.g. harmony_1_8_guide.pdf -> "harmony_1_8").
        """
        version_chunks: Dict[str, List[Dict]] = {}
        for pdf_path in pdf_dir.glob("*.pdf"):
            version = self._infer_version(pdf_path.stem.lower())
            chunks = self.chunk_pdf(pdf_path)
            version_chunks.setdefault(version, []).extend(chunks)
        return version_chunks

    @staticmethod
    def _infer_version(filename: str) -> str:
        """Map a lower-cased filename stem to a version bucket."""
        if "harmony" in filename:
            # Both '1_8' and '1.8' spellings are accepted.
            for tag, version in (("1_8", "harmony_1_8"),
                                 ("1_6", "harmony_1_6"),
                                 ("1_5", "harmony_1_5"),
                                 ("1_2", "harmony_1_2")):
                if tag in filename or tag.replace("_", ".") in filename:
                    return version
            return "harmony_general"
        if "chorus" in filename:
            return "chorus_1_1"
        return "general_faq"

    def save_chunks(self, chunks: List[Dict], output_path: Path):
        """Save chunks (plus the chunking parameters used) to a JSON file."""
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # utf-8 + ensure_ascii=False keeps non-ASCII document text readable
        # in the output instead of \uXXXX escapes.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump({
                "chunks": chunks,
                "chunk_size": self.chunk_size,
                "overlap": self.overlap
            }, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(chunks)} chunks to {output_path}")