"""Utility functions for RAG system."""

import re
import hashlib
from typing import List, Dict, Any, Optional
import json
from pathlib import Path

# Rough heuristic used by chunk_by_tokens: one token is about four characters.
_CHARS_PER_TOKEN = 4

# Metadata keys omitted from format_metadata output because they are large.
_HIDDEN_METADATA_KEYS = frozenset({'embedding', 'text'})

# PII patterns, compiled once at import time.
# NOTE: the TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept
# a literal '|' and let the match run across a pipe character.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_PHONE_RE = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b')
_SSN_RE = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')

# Hiragana, Katakana and CJK Unified Ideographs.
_JAPANESE_CHAR_RE = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]')


def load_hierarchy(hierarchy_name: str) -> Dict[str, Any]:
    """
    Load hierarchy definition from JSON file.

    The file is looked up as ``<hierarchy_name>.json`` inside the
    ``hierarchies`` directory located two levels above this module.

    Args:
        hierarchy_name: Name of the hierarchy (hospital, bank, fluid_simulation)

    Returns:
        Dictionary containing hierarchy definition

    Raises:
        FileNotFoundError: If the hierarchy file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    hierarchy_path = Path(__file__).parent.parent / "hierarchies" / f"{hierarchy_name}.json"

    if not hierarchy_path.exists():
        raise FileNotFoundError(f"Hierarchy file not found: {hierarchy_path}")

    with open(hierarchy_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def generate_doc_id(content: str) -> str:
    """
    Generate unique document ID from content.

    The ID is the MD5 digest of the UTF-8 encoded content, so identical
    content always maps to the same ID. MD5 serves only as a content
    fingerprint here, not as a security measure.

    Args:
        content: Document content

    Returns:
        Hexadecimal hash string
    """
    return hashlib.md5(content.encode('utf-8')).hexdigest()


def generate_chunk_id(doc_id: str, chunk_index: int) -> str:
    """
    Generate unique chunk ID.

    Args:
        doc_id: Parent document ID
        chunk_index: Index of chunk within document

    Returns:
        Formatted chunk ID string of the form ``<doc_id>_chunk_<chunk_index>``
    """
    return f"{doc_id}_chunk_{chunk_index}"


def mask_pii(text: str) -> str:
    """
    Basic PII masking for sensitive data.

    Replaces email addresses with ``[EMAIL]``, 10-digit phone numbers
    (optionally separated by ``-`` or ``.``) with ``[PHONE]`` and
    ``ddd-dd-dddd`` patterns with ``[SSN]``. This is pattern matching only;
    it is not a complete PII scrubber.

    Args:
        text: Input text potentially containing PII

    Returns:
        Text with masked PII
    """
    text = _EMAIL_RE.sub('[EMAIL]', text)
    text = _PHONE_RE.sub('[PHONE]', text)
    text = _SSN_RE.sub('[SSN]', text)
    return text


def detect_language(text: str) -> str:
    """
    Simple language detection (English vs Japanese).

    Args:
        text: Input text

    Returns:
        Language code ('en' or 'ja'). Empty text is reported as 'en'.
    """
    japanese_chars = len(_JAPANESE_CHAR_RE.findall(text))

    # If more than 10% Japanese characters, classify as Japanese
    if text and japanese_chars / len(text) > 0.1:
        return "ja"
    return "en"


def chunk_by_tokens(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into chunks by approximate token count.

    Token counts are approximated as four characters per token. Each chunk
    that is not the last one is cut back to the last '.' or newline when
    that boundary lies past the halfway point of the window. Consecutive
    chunks share roughly ``overlap`` tokens of text.

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in tokens (approximate); must be > 0
        overlap: Number of overlapping tokens between chunks; must satisfy
            0 <= overlap < chunk_size

    Returns:
        List of stripped, non-empty text chunks

    Raises:
        ValueError: If chunk_size or overlap is out of range. Such values
            could previously make the loop run forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chars_per_chunk = chunk_size * _CHARS_PER_TOKEN
    overlap_chars = overlap * _CHARS_PER_TOKEN
    text_length = len(text)

    chunks = []
    start = 0

    while start < text_length:
        end = start + chars_per_chunk
        chunk = text[start:end]

        if end >= text_length:
            # This window reaches the end of the text. Stop here: stepping
            # back by the overlap would only emit a redundant tail chunk
            # that repeats text already contained in this one.
            chunks.append(chunk.strip())
            break

        # Try to break at sentence boundary
        break_point = max(chunk.rfind('.'), chunk.rfind('\n'))
        if break_point > chars_per_chunk * 0.5:  # Only if we're past halfway
            chunk = chunk[:break_point + 1]
            end = start + break_point + 1

        chunks.append(chunk.strip())

        # Step back by the overlap, but always move forward. When a chunk
        # was shortened at a sentence boundary the overlap can be larger
        # than the chunk itself; in that case continue without overlap.
        next_start = end - overlap_chars
        start = next_start if next_start > start else end

    return [c for c in chunks if c]  # Remove empty chunks


def save_json(data: Any, filepath: str) -> None:
    """
    Save data to JSON file.

    Missing parent directories are created. Output is UTF-8, indented by
    two spaces, with non-ASCII characters written unescaped.

    Args:
        data: Data to save
        filepath: Output file path
    """
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def load_json(filepath: str) -> Any:
    """
    Load data from JSON file.

    Args:
        filepath: Input file path (read as UTF-8)

    Returns:
        Loaded data
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def format_metadata(metadata: Dict[str, Any]) -> str:
    """
    Format metadata dictionary for display.

    Each entry is rendered as ``key: value`` on its own line, in dictionary
    order. The 'embedding' and 'text' keys are skipped because they are large.

    Args:
        metadata: Metadata dictionary

    Returns:
        Formatted string representation
    """
    return "\n".join(
        f"{key}: {value}"
        for key, value in metadata.items()
        if key not in _HIDDEN_METADATA_KEYS
    )