# Hierarchical RAG system utilities (deployment commit c54dcef, author hh786)
"""Utility functions for RAG system."""
import re
import hashlib
from typing import List, Dict, Any, Optional
import json # Changed from yaml
from pathlib import Path
def load_hierarchy(hierarchy_name: str) -> Dict[str, Any]:
    """
    Load a hierarchy definition from its JSON file.

    The file is resolved as ``<package root>/hierarchies/<hierarchy_name>.json``,
    relative to this module's parent directory.

    Args:
        hierarchy_name: Name of the hierarchy (hospital, bank, fluid_simulation)

    Returns:
        Dictionary containing hierarchy definition

    Raises:
        FileNotFoundError: If no JSON file exists for the given name.
    """
    hierarchy_path = Path(__file__).parent.parent / "hierarchies" / f"{hierarchy_name}.json"
    if not hierarchy_path.exists():
        raise FileNotFoundError(f"Hierarchy file not found: {hierarchy_path}")
    with open(hierarchy_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def generate_doc_id(content: str) -> str:
    """
    Derive a stable, unique document ID from the document text.

    Args:
        content: Document content

    Returns:
        32-character hexadecimal MD5 digest of the UTF-8 encoded content
    """
    # MD5 here is a fast content fingerprint for deduplication, not a
    # security primitive.
    digest = hashlib.md5(content.encode('utf-8'))
    return digest.hexdigest()
def generate_chunk_id(doc_id: str, chunk_index: int) -> str:
    """
    Build a deterministic chunk ID from the parent document ID and position.

    Args:
        doc_id: Parent document ID
        chunk_index: Index of chunk within document

    Returns:
        ID string of the form "<doc_id>_chunk_<chunk_index>"
    """
    return doc_id + "_chunk_" + str(chunk_index)
def mask_pii(text: str) -> str:
    """
    Basic PII masking for sensitive data.

    Replaces email addresses, US-style phone numbers, and SSNs with
    placeholder tokens. Best-effort regex matching — not exhaustive.

    Args:
        text: Input text potentially containing PII

    Returns:
        Text with masked PII
    """
    # Email addresses. Fixed TLD class: original used [A-Z|a-z], which
    # wrongly treated a literal '|' as a valid TLD character.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)
    # Phone numbers (simple 3-3-4 pattern with optional - or . separators)
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)
    # SSN pattern (3-2-4 digit groups, dashes required, so the phone
    # pattern above cannot consume it first)
    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
    return text
def detect_language(text: str) -> str:
    """
    Simple language detection (English vs Japanese).

    Classifies as Japanese when more than 10% of the characters fall in
    the hiragana, katakana, or CJK-ideograph Unicode ranges.

    Args:
        text: Input text

    Returns:
        Language code ('en' or 'ja')
    """
    if not text:
        return "en"
    jp_pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]')
    jp_count = sum(1 for _ in jp_pattern.finditer(text))
    return "ja" if jp_count / len(text) > 0.1 else "en"
def chunk_by_tokens(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into chunks by approximate token count.

    Uses the rough heuristic of 1 token ≈ 4 characters, and prefers to end
    a chunk at a sentence boundary (period or newline) when one falls in
    the second half of the chunk.

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in tokens (approximate); must be positive
        overlap: Number of overlapping tokens between chunks; must be
            smaller than chunk_size

    Returns:
        List of non-empty, whitespace-stripped text chunks

    Raises:
        ValueError: If chunk_size is not positive or overlap >= chunk_size
            (either would prevent the scan from making forward progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    # Approximate: 1 token ≈ 4 characters
    chars_per_chunk = chunk_size * 4
    overlap_chars = overlap * 4
    chunks = []
    start = 0
    while start < len(text):
        end = start + chars_per_chunk
        chunk = text[start:end]
        # Try to break at a sentence boundary, but only if it lies past
        # the halfway point so chunks don't collapse to tiny fragments.
        if end < len(text):
            break_point = max(chunk.rfind('.'), chunk.rfind('\n'))
            if break_point > chars_per_chunk * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1
        chunks.append(chunk.strip())
        # Guarantee forward progress: a sentence break can shrink the chunk
        # below the overlap length, in which case `end - overlap_chars`
        # would not advance (the original code could loop forever here).
        start = max(end - overlap_chars, start + 1)
    return [c for c in chunks if c]  # Remove empty chunks
def save_json(data: Any, filepath: str) -> None:
    """
    Write data to a file as pretty-printed UTF-8 JSON.

    Missing parent directories are created automatically.

    Args:
        data: Data to save (must be JSON-serializable)
        filepath: Output file path
    """
    path = Path(filepath)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open('w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
def load_json(filepath: str) -> Any:
    """
    Read and parse a UTF-8 JSON file.

    Args:
        filepath: Input file path

    Returns:
        Loaded data
    """
    with Path(filepath).open('r', encoding='utf-8') as f:
        return json.load(f)
def format_metadata(metadata: Dict[str, Any]) -> str:
    """
    Render a metadata dictionary as human-readable "key: value" lines.

    The large 'embedding' and 'text' fields are omitted from the output.

    Args:
        metadata: Metadata dictionary

    Returns:
        Newline-joined string, one "key: value" entry per line
    """
    skipped_fields = ('embedding', 'text')
    return "\n".join(
        f"{key}: {value}"
        for key, value in metadata.items()
        if key not in skipped_fields
    )