""" Domain Knowledge Base - Dynamic domain-specific aspect prototypes and skill mapping """ import os import json import logging from typing import Dict, List, Optional, Tuple from pathlib import Path logger = logging.getLogger(__name__) class DomainConfig: """Single domain configuration""" def __init__(self, config_data: Dict): self.domain_id = config_data.get('domain_id', 'unknown') self.display_name = config_data.get('display_name', 'Unknown Domain') self.description = config_data.get('description', '') self.core_skills = config_data.get('core_skills', []) self.aspect_prototypes = config_data.get('aspect_prototypes', {}) self.industry_benchmarks = config_data.get('industry_benchmarks', {}) self.skill_gaps_mapping = config_data.get('skill_gaps_mapping', {}) self.detection_keywords = config_data.get('detection_keywords', []) def get_aspect_seeds(self, aspect: str) -> List[str]: """Get seed phrases for a specific aspect""" return self.aspect_prototypes.get(aspect, []) def get_all_aspect_seeds(self) -> Dict[str, List[str]]: """Get all aspect prototypes""" return self.aspect_prototypes.copy() def get_skill_gap_info(self, skill: str) -> Optional[Dict]: """Get skill gap information including courses and certs""" return self.skill_gaps_mapping.get(skill) def get_benchmark(self, key: str, default=None): """Get industry benchmark value""" return self.industry_benchmarks.get(key, default) class DomainKnowledgeBase: """ Domain Knowledge Base - loads and manages domain configurations Provides domain-specific aspect prototypes for the Fidelity Criteria Transformer """ def __init__(self, domains_dir: str = None): """ Initialize DKB with domain configs from directory Args: domains_dir: Path to directory containing domain JSON files Defaults to ./domains/ relative to this file """ if domains_dir is None: domains_dir = os.path.join(os.path.dirname(__file__), 'domains') self.domains_dir = Path(domains_dir) self.domains: Dict[str, DomainConfig] = {} self._keyword_index: Dict[str, str] = {} # keyword -> domain_id self._load_all_domains() self._build_keyword_index() logger.info(f"DomainKnowledgeBase initialized with {len(self.domains)} domains") def _load_all_domains(self): """Load all domain configs from directory""" if not self.domains_dir.exists(): logger.warning(f"Domains directory not found: {self.domains_dir}") return for json_file in self.domains_dir.glob('*.json'): try: with open(json_file, 'r', encoding='utf-8') as f: config_data = json.load(f) domain_config = DomainConfig(config_data) self.domains[domain_config.domain_id] = domain_config logger.info(f"Loaded domain config: {domain_config.display_name}") except Exception as e: logger.error(f"Failed to load domain config {json_file}: {e}") def _build_keyword_index(self): """Build keyword -> domain mapping for detection""" for domain_id, config in self.domains.items(): for keyword in config.detection_keywords: self._keyword_index[keyword.lower()] = domain_id def get_domain(self, domain_id: str) -> Optional[DomainConfig]: """Get domain config by ID""" return self.domains.get(domain_id) def list_domains(self) -> List[str]: """List all available domain IDs""" return list(self.domains.keys()) def detect_domain(self, text: str, skills: List[str] = None) -> Tuple[str, float]: """ Detect most likely domain from text and/or skills Args: text: Text content (career goals, descriptions, etc.) skills: List of skill keywords Returns: (domain_id, confidence) tuple """ if not text and not skills: return ('general', 0.0) text_lower = (text or '').lower() skills_lower = [s.lower() for s in (skills or [])] domain_scores = {} for domain_id, config in self.domains.items(): score = 0.0 # Keyword matching from text for keyword in config.detection_keywords: if keyword.lower() in text_lower: score += 0.1 # Skill matching core_skills_lower = [s.lower() for s in config.core_skills] skill_matches = sum(1 for s in skills_lower if s in core_skills_lower) score += skill_matches * 0.15 domain_scores[domain_id] = min(score, 1.0) if not domain_scores: return ('general', 0.0) # Return domain with highest score best_domain = max(domain_scores, key=domain_scores.get) confidence = domain_scores[best_domain] # Minimum confidence threshold if confidence < 0.2: return ('general', confidence) return (best_domain, confidence) def get_aspect_prototypes_for_domain(self, domain_id: str) -> Dict[str, List[str]]: """Get all aspect prototypes for a domain""" config = self.domains.get(domain_id) if config: return config.get_all_aspect_seeds() return {} def get_merged_prototypes(self, detected_domain: str, base_aspects: Dict[str, List[str]]) -> Dict[str, List[str]]: """ Merge domain-specific prototypes with base aspects Domain-specific seeds are added to base seeds Args: detected_domain: Domain ID from detection base_aspects: Base aspect seeds (from TextModuleV2 defaults) Returns: Merged aspect seeds dictionary """ merged = {k: list(v) for k, v in base_aspects.items()} # Deep copy domain_config = self.domains.get(detected_domain) if not domain_config: return merged # Merge domain-specific prototypes for aspect, seeds in domain_config.aspect_prototypes.items(): if aspect in merged: # Prepend domain-specific seeds (higher priority) merged[aspect] = seeds + merged[aspect] else: merged[aspect] = seeds return merged def analyze_skill_gaps(self, student_skills: List[str], domain_id: str) -> List[Dict]: """ Analyze skill gaps for a student in a given domain Args: student_skills: List of skills the student has domain_id: Target domain Returns: List of skill gap objects with recommendations """ config = self.domains.get(domain_id) if not config: return [] student_skills_lower = [s.lower() for s in student_skills] gaps = [] for skill, gap_info in config.skill_gaps_mapping.items(): skill_lower = skill.lower() # Check if student has this skill has_skill = any(skill_lower in s or s in skill_lower for s in student_skills_lower) if not has_skill: gaps.append({ 'skill': skill, 'demand_score': gap_info.get('demand_score', 0.5), 'recommended_courses': gap_info.get('courses', []), 'certifications': gap_info.get('certifications', []), 'priority': 'high' if gap_info.get('demand_score', 0) > 0.7 else 'medium' }) # Sort by demand score gaps.sort(key=lambda x: x['demand_score'], reverse=True) return gaps def get_domain_summary(self, domain_id: str) -> Optional[Dict]: """Get summary of a domain for reporting""" config = self.domains.get(domain_id) if not config: return None return { 'domain_id': config.domain_id, 'display_name': config.display_name, 'description': config.description, 'core_skills_count': len(config.core_skills), 'aspects_count': len(config.aspect_prototypes), 'benchmarks': config.industry_benchmarks } # Singleton instance _dkb_instance: Optional[DomainKnowledgeBase] = None def get_domain_knowledge_base(domains_dir: str = None) -> DomainKnowledgeBase: """Get or create singleton DomainKnowledgeBase instance""" global _dkb_instance if _dkb_instance is None: _dkb_instance = DomainKnowledgeBase(domains_dir) return _dkb_instance