"""
utils.py - Helper functions for text processing and chunking
This module contains utility functions for:
- Text cleaning (removing extra whitespace, headers/footers)
- Token counting using tiktoken
- Text chunking with overlap
- Confidence score normalization
"""
import re
from typing import List, Dict, Tuple
import json
# Try to use tiktoken for accurate token counting, fallback to word count.
# USE_TIKTOKEN is read by count_tokens() below to pick the counting strategy.
try:
    import tiktoken
    # Load the cl100k_base encoding once at import time so count_tokens()
    # doesn't pay the lookup cost per call.
    TOKENIZER = tiktoken.get_encoding("cl100k_base")
    USE_TIKTOKEN = True
except ImportError:
    USE_TIKTOKEN = False
    print("WARNING: tiktoken not available, using word count approximation")
def count_tokens(text: str) -> int:
    """Return the (approximate) number of tokens in *text*.

    Uses tiktoken's cl100k_base encoder when it was importable at module
    load; otherwise estimates from the whitespace-delimited word count.

    Args:
        text: Input text string.

    Returns:
        Token count (exact with tiktoken, approximate otherwise).
    """
    if not USE_TIKTOKEN:
        # Fallback heuristic: English prose averages ~1.3 tokens per word.
        return int(len(text.split()) * 1.3)
    return len(TOKENIZER.encode(text))
def clean_text(text: str) -> str:
    """Normalize raw PDF-extracted text.

    Collapses runs of horizontal whitespace, caps consecutive blank lines,
    strips common page-number artifacts, and trims every line.

    Args:
        text: Raw text from PDF extraction.

    Returns:
        Cleaned text string.
    """
    # Collapse runs of spaces/tabs into a single space.
    text = re.sub(r'[ \t]+', ' ', text)
    # Keep at most one blank line between paragraphs.
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Drop "Page N" markers (any case) and "- N -" style page numbers.
    text = re.sub(r'(?i)page\s*\d+', '', text)
    text = re.sub(r'-\s*\d+\s*-', '', text)
    # A line that is nothing but a number is almost certainly a page number.
    # (Simple heuristic — tune for the specific PDF if needed.)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Trim every individual line, then the whole string.
    text = '\n'.join(part.strip() for part in text.split('\n'))
    return text.strip()
def chunk_text(
    text: str,
    page_number: int,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    source: str = "Ebook-Agentic-AI.pdf"
) -> List[Dict]:
    """
    Split text into overlapping chunks with metadata.

    Splits on sentence boundaries and accumulates sentences until a chunk
    would exceed chunk_size tokens, then carries up to chunk_overlap tokens
    of trailing sentences into the next chunk for context continuity.

    Args:
        text: Text to chunk (from one page)
        page_number: Page number for metadata
        chunk_size: Target size in tokens (default 500)
        chunk_overlap: Overlap between chunks in tokens (default 50)
        source: Source document name

    Returns:
        List of chunk dicts with id, page, text, start_char, end_char,
        source. Empty or whitespace-only input yields an empty list.
        NOTE: start_char/end_char are approximate — they assume one joining
        space per sentence, which can drift from the original text.
    """
    chunks: List[Dict] = []

    # Short input: emit at most one chunk covering the whole text.
    if not text or count_tokens(text) <= chunk_size:
        if text.strip():
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_0",
                "page": page_number,
                "text": text.strip(),
                "start_char": 0,
                "end_char": len(text),
                "source": source
            })
        return chunks

    # Naive sentence split: terminal punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Bugfix vs. original: the per-chunk joined string was stored in a local
    # named `chunk_text`, shadowing this function's own name.
    current_sentences: List[str] = []   # sentences of the chunk being built
    current_tokens = 0                  # token total for current_sentences
    current_start = 0                   # approx. char offset of chunk start
    chunk_index = 0
    char_position = 0                   # running offset (sentence + 1 space)

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        # Flush the current chunk before it would exceed chunk_size.
        if current_tokens + sentence_tokens > chunk_size and current_sentences:
            body = ' '.join(current_sentences)
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
                "page": page_number,
                "text": body,
                "start_char": current_start,
                "end_char": current_start + len(body),
                "source": source
            })
            chunk_index += 1

            # Seed the next chunk with trailing sentences that fit in the
            # overlap budget (collected newest-first, kept in original order).
            overlap_tokens = 0
            overlap_sentences: List[str] = []
            for prev in reversed(current_sentences):
                prev_tokens = count_tokens(prev)
                if overlap_tokens + prev_tokens > chunk_overlap:
                    break
                overlap_sentences.insert(0, prev)
                overlap_tokens += prev_tokens

            current_sentences = overlap_sentences
            current_tokens = overlap_tokens
            current_start = char_position - sum(len(s) + 1 for s in overlap_sentences)

        current_sentences.append(sentence)
        current_tokens += sentence_tokens
        char_position += len(sentence) + 1  # +1 for the joining space

    # Emit whatever remains as the final chunk.
    if current_sentences:
        body = ' '.join(current_sentences)
        chunks.append({
            "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
            "page": page_number,
            "text": body,
            "start_char": current_start,
            "end_char": current_start + len(body),
            "source": source
        })
    return chunks
def normalize_score(score: float) -> float:
    """Map a cosine similarity score from [-1, 1] onto [0, 1].

    Applies (score + 1) / 2, then clamps so that out-of-range inputs can
    never produce a value outside [0, 1].

    Args:
        score: Raw similarity score from Pinecone.

    Returns:
        Normalized score between 0.0 and 1.0.
    """
    shifted = (score + 1.0) / 2.0
    # Clamp defensively — scores should already lie in [-1, 1].
    if shifted < 0.0:
        return 0.0
    if shifted > 1.0:
        return 1.0
    return shifted
def compute_confidence(scores: List[float], method: str = "max") -> float:
    """Aggregate raw retrieval scores into a single confidence value.

    Every score is normalized to [0, 1] first. The aggregate is the
    arithmetic mean for method == "mean"; "max" (and any unrecognized
    method string) takes the maximum.

    Args:
        scores: List of raw similarity scores from retrieval.
        method: "max" for maximum score, "mean" for average.

    Returns:
        Confidence score rounded to 3 decimal places (0.0 for no scores).
    """
    if not scores:
        return 0.0
    normalized = [normalize_score(s) for s in scores]
    if method == "mean":
        confidence = sum(normalized) / len(normalized)
    else:
        # "max" and unknown methods both fall through to the maximum.
        confidence = max(normalized)
    return round(confidence, 3)
def save_chunks_to_jsonl(chunks: List[Dict], filepath: str, include_embeddings: bool = False):
    """Write chunks to *filepath* in JSONL format (one JSON object per line).

    Args:
        chunks: List of chunk dictionaries.
        filepath: Output file path.
        include_embeddings: Keep the 'embedding' field if True; dropping it
            (the default) keeps the backup file small.
    """
    with open(filepath, 'w', encoding='utf-8') as out:
        for chunk in chunks:
            # Shallow-copy so stripping the embedding never mutates the caller's dict.
            record = dict(chunk)
            if not include_embeddings:
                record.pop('embedding', None)
            out.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"Saved {len(chunks)} chunks to {filepath}")
def load_chunks_from_jsonl(filepath: str) -> List[Dict]:
    """Read chunk dictionaries back from a JSONL file.

    Blank lines are skipped; every other line must be valid JSON.

    Args:
        filepath: Input file path.

    Returns:
        List of chunk dictionaries, in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as src:
        chunks = [json.loads(row) for row in src if row.strip()]
    print(f"Loaded {len(chunks)} chunks from {filepath}")
    return chunks
def format_chunks_for_llm(chunks: List[Dict]) -> str:
    """Render retrieved chunks as a single context string for the LLM.

    Each chunk becomes a "[Source: <id>, Page <page>]" header followed by
    its text; chunks are separated by "---" dividers. Missing keys fall
    back to 'unknown' (page), '' (text), and 'chunk_<index>' (id).

    Args:
        chunks: List of chunk dictionaries with 'text' and 'page' keys.

    Returns:
        Formatted string with source markers for the LLM.
    """
    sections = []
    for index, chunk in enumerate(chunks):
        header = (
            f"[Source: {chunk.get('id', f'chunk_{index}')}, "
            f"Page {chunk.get('page', 'unknown')}]"
        )
        sections.append(f"{header}\n{chunk.get('text', '')}")
    return "\n\n---\n\n".join(sections)
if __name__ == "__main__":
    # Manual smoke test: exercise each helper with tiny inputs and print
    # the results for eyeball verification (no assertions are made despite
    # the closing "All tests passed!" message).
    print("Testing utils.py functions...")
    # Test token counting
    test_text = "This is a test sentence for token counting."
    print(f"Token count for '{test_text}': {count_tokens(test_text)}")
    # Test text cleaning (extra spaces, blank-line runs, a "Page N" artifact)
    dirty_text = "  This has extra  spaces  \n\n\n\nAnd too many newlines    Page 123"
    clean = clean_text(dirty_text)
    print(f"Cleaned text: '{clean}'")
    # Test score normalization across the full [-1, 1] cosine range
    test_scores = [-1.0, 0.0, 0.5, 1.0]
    for score in test_scores:
        print(f"Score {score} -> normalized: {normalize_score(score)}")
    # Test confidence computation with both aggregation methods
    scores = [0.8, 0.6, 0.7]
    print(f"Confidence (max): {compute_confidence(scores, 'max')}")
    print(f"Confidence (mean): {compute_confidence(scores, 'mean')}")
    print("\nAll tests passed!")