Spaces:
Sleeping
Sleeping
| """Utility functions for RAG system.""" | |
| import re | |
| import hashlib | |
| from typing import List, Dict, Any, Optional | |
| import json # Changed from yaml | |
| from pathlib import Path | |
def load_hierarchy(hierarchy_name: str) -> Dict[str, Any]:
    """
    Load a hierarchy definition from its JSON file.

    Args:
        hierarchy_name: Name of the hierarchy (hospital, bank, fluid_simulation)

    Returns:
        Dictionary containing the hierarchy definition

    Raises:
        FileNotFoundError: If no JSON file exists for the given name
    """
    # Hierarchy files live in <repo>/hierarchies/<name>.json, two levels up
    # from this module.
    hierarchy_path = Path(__file__).parent.parent / "hierarchies" / f"{hierarchy_name}.json"

    if hierarchy_path.exists():
        with open(hierarchy_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    raise FileNotFoundError(f"Hierarchy file not found: {hierarchy_path}")
def generate_doc_id(content: str) -> str:
    """
    Derive a stable, unique document ID from the document's content.

    Args:
        content: Document content

    Returns:
        Hexadecimal hash string (MD5 of the UTF-8 encoded content)
    """
    digest = hashlib.md5(content.encode('utf-8'))
    return digest.hexdigest()
def generate_chunk_id(doc_id: str, chunk_index: int) -> str:
    """
    Build a unique chunk ID from its parent document and position.

    Args:
        doc_id: Parent document ID
        chunk_index: Index of the chunk within the document

    Returns:
        Chunk ID of the form "<doc_id>_chunk_<chunk_index>"
    """
    return "_".join((doc_id, "chunk", str(chunk_index)))
def mask_pii(text: str) -> str:
    """
    Basic PII masking for sensitive data.

    Replaces email addresses, North-American-style phone numbers, and
    US SSNs with placeholder tokens. Patterns are intentionally simple
    and are not exhaustive.

    Args:
        text: Input text potentially containing PII

    Returns:
        Text with masked PII
    """
    # Email addresses.
    # Bug fix: the TLD class was [A-Z|a-z], which included a literal '|'
    # and so matched malformed "TLDs" containing pipes; [A-Za-z] is correct.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)
    # Phone numbers (simple 3-3-4 pattern with optional - or . separators)
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)
    # SSN pattern (3-2-4 with dashes)
    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
    return text
def detect_language(text: str) -> str:
    """
    Heuristic language detection: English vs Japanese.

    Counts hiragana, katakana, and CJK ideograph characters; text with
    more than 10% such characters is classified as Japanese.

    Args:
        text: Input text

    Returns:
        Language code ('en' or 'ja')
    """
    if not text:
        return "en"
    japanese_chars = re.findall(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]', text)
    ratio = len(japanese_chars) / len(text)
    return "ja" if ratio > 0.1 else "en"
def chunk_by_tokens(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks by approximate token count.

    Uses the rough heuristic of 1 token ≈ 4 characters and tries to end
    each chunk at a sentence boundary ('.' or newline) when one falls in
    the second half of the chunk.

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in tokens (approximate)
        overlap: Number of overlapping tokens between chunks

    Returns:
        List of non-empty, stripped text chunks
    """
    # Approximate: 1 token ≈ 4 characters
    chars_per_chunk = chunk_size * 4
    overlap_chars = overlap * 4

    chunks = []
    start = 0
    while start < len(text):
        end = start + chars_per_chunk
        chunk = text[start:end]

        # Try to break at a sentence boundary, but only if it lies past
        # the halfway point so chunks don't become too small.
        if end < len(text):
            last_period = chunk.rfind('.')
            last_newline = chunk.rfind('\n')
            break_point = max(last_period, last_newline)
            if break_point > chars_per_chunk * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        chunks.append(chunk.strip())
        # Bug fix: guarantee forward progress. The original
        # `start = end - overlap_chars` loops forever when
        # overlap_chars >= the effective chunk length (e.g. overlap >=
        # chunk_size). Behavior is unchanged for all sane parameters.
        start = max(end - overlap_chars, start + 1)

    return [c for c in chunks if c]  # Remove empty chunks
def save_json(data: Any, filepath: str) -> None:
    """
    Serialize data to a JSON file, creating parent directories as needed.

    Args:
        data: Data to save (must be JSON-serializable)
        filepath: Output file path
    """
    out_path = Path(filepath)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pretty-printed, with non-ASCII characters written as-is.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    out_path.write_text(payload, encoding='utf-8')
def load_json(filepath: str) -> Any:
    """
    Read and deserialize a JSON file.

    Args:
        filepath: Input file path

    Returns:
        The deserialized data
    """
    raw = Path(filepath).read_text(encoding='utf-8')
    return json.loads(raw)
def format_metadata(metadata: Dict[str, Any]) -> str:
    """
    Render a metadata dictionary as human-readable "key: value" lines.

    Large fields ('embedding', 'text') are skipped to keep output compact.

    Args:
        metadata: Metadata dictionary

    Returns:
        Newline-joined string of "key: value" pairs
    """
    skipped = ('embedding', 'text')
    return "\n".join(
        f"{key}: {value}"
        for key, value in metadata.items()
        if key not in skipped
    )