File size: 4,703 Bytes
c54dcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""Utility functions for RAG system."""

import re
import hashlib
from typing import List, Dict, Any, Optional
import json
from pathlib import Path


def load_hierarchy(hierarchy_name: str) -> Dict[str, Any]:
    """
    Load a hierarchy definition from its JSON file.

    The file is looked up under the sibling ``hierarchies/`` directory,
    two levels up from this module.

    Args:
        hierarchy_name: Name of the hierarchy (hospital, bank, fluid_simulation)

    Returns:
        Dictionary containing hierarchy definition

    Raises:
        FileNotFoundError: If no JSON file exists for the given name.
    """
    base_dir = Path(__file__).parent.parent
    hierarchy_path = base_dir / "hierarchies" / f"{hierarchy_name}.json"

    if not hierarchy_path.exists():
        raise FileNotFoundError(f"Hierarchy file not found: {hierarchy_path}")

    with open(hierarchy_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def generate_doc_id(content: str) -> str:
    """
    Generate a unique document ID by hashing the document content.

    Args:
        content: Document content

    Returns:
        Hexadecimal MD5 hash string (32 lowercase hex chars)
    """
    digest = hashlib.md5(content.encode('utf-8'))
    return digest.hexdigest()


def generate_chunk_id(doc_id: str, chunk_index: int) -> str:
    """
    Generate a unique chunk ID from a document ID and a chunk position.

    Args:
        doc_id: Parent document ID
        chunk_index: Index of chunk within document

    Returns:
        Formatted chunk ID string of the form ``<doc_id>_chunk_<index>``
    """
    return "{}_chunk_{}".format(doc_id, chunk_index)


def mask_pii(text: str) -> str:
    """
    Basic PII masking for sensitive data.

    Replaces email addresses, phone numbers, and US SSNs with the
    placeholders ``[EMAIL]``, ``[PHONE]``, and ``[SSN]`` respectively.
    Patterns are intentionally simple and best-effort; they will not
    catch every format.

    Args:
        text: Input text potentially containing PII

    Returns:
        Text with masked PII
    """
    # Email addresses.
    # Fix: the TLD class was previously [A-Z|a-z], which wrongly matched a
    # literal '|' character; '|' has no alternation meaning inside [...].
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)

    # Phone numbers (simple 3-3-4 digit pattern, optional - or . separators)
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)

    # SSN pattern (3-2-4 digits; not matched by the 10-digit phone pattern)
    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)

    return text


def detect_language(text: str) -> str:
    """
    Simple language detection (English vs Japanese).

    Counts hiragana, katakana, and CJK ideograph characters; if they make
    up more than 10% of the text, the text is classified as Japanese.

    Args:
        text: Input text

    Returns:
        Language code ('en' or 'ja')
    """
    if not text:
        # Empty input defaults to English.
        return "en"

    jp_count = sum(
        1 for _ in re.finditer(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]', text)
    )
    return "ja" if jp_count / len(text) > 0.1 else "en"


def chunk_by_tokens(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into chunks by approximate token count.

    Chunks are cut near a target character length (1 token ≈ 4 chars) and,
    when possible, snapped back to the last sentence/newline boundary in
    the second half of the chunk. Consecutive chunks overlap by roughly
    ``overlap`` tokens of trailing text.

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in tokens (approximate)
        overlap: Number of overlapping tokens between chunks

    Returns:
        List of non-empty, stripped text chunks
    """
    # Approximate: 1 token ≈ 4 characters
    chars_per_chunk = chunk_size * 4
    overlap_chars = overlap * 4

    chunks = []
    start = 0

    while start < len(text):
        end = start + chars_per_chunk
        chunk = text[start:end]

        # Try to break at a sentence boundary (last '.' or newline), but
        # only if that keeps at least half of the target chunk length.
        if end < len(text):
            last_period = chunk.rfind('.')
            last_newline = chunk.rfind('\n')
            break_point = max(last_period, last_newline)

            if break_point > chars_per_chunk * 0.5:  # Only if we're past halfway
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        chunks.append(chunk.strip())

        # Fix: the original `start = end - overlap_chars` could fail to
        # advance (overlap >= chunk size, or an early sentence break),
        # causing an infinite loop. Guarantee strict forward progress.
        next_start = end - overlap_chars
        start = next_start if next_start > start else end

    return [c for c in chunks if c]  # Remove empty chunks


def save_json(data: Any, filepath: str) -> None:
    """
    Save data to a JSON file, creating parent directories as needed.

    Output is UTF-8, pretty-printed with 2-space indent, and keeps
    non-ASCII characters unescaped.

    Args:
        data: Data to save (must be JSON-serializable)
        filepath: Output file path
    """
    out_path = Path(filepath)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, indent=2, ensure_ascii=False)


def load_json(filepath: str) -> Any:
    """
    Load data from a JSON file (UTF-8).

    Args:
        filepath: Input file path

    Returns:
        Loaded data

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    content = Path(filepath).read_text(encoding='utf-8')
    return json.loads(content)


def format_metadata(metadata: Dict[str, Any]) -> str:
    """
    Format a metadata dictionary for display, one ``key: value`` per line.

    The 'embedding' and 'text' fields are omitted because they are too
    large to display usefully.

    Args:
        metadata: Metadata dictionary

    Returns:
        Formatted string representation
    """
    skipped = ('embedding', 'text')  # Skip large fields
    return "\n".join(
        f"{key}: {value}"
        for key, value in metadata.items()
        if key not in skipped
    )