# File size: 9,216 Bytes
# 3d015cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
"""
Domain Knowledge Base - Dynamic domain-specific aspect prototypes and skill mapping
"""
import os
import json
import logging
from typing import Dict, List, Optional, Tuple
from pathlib import Path

logger = logging.getLogger(__name__)


class DomainConfig:
    """Single domain configuration

    Wraps one parsed domain JSON object and exposes its fields as attributes.
    Missing keys fall back to defaults; list/dict fields that are present but
    empty or null are stored as empty containers.
    """

    def __init__(self, config_data: Dict):
        """
        Build the configuration from a parsed JSON object

        Args:
            config_data: Mapping read from a domain JSON file
        """
        self.domain_id = config_data.get('domain_id', 'unknown')
        self.display_name = config_data.get('display_name', 'Unknown Domain')
        self.description = config_data.get('description', '')
        # `or` instead of a .get() default: an explicit JSON null would
        # otherwise leave None in a field that other code iterates over.
        self.core_skills = config_data.get('core_skills') or []
        self.aspect_prototypes = config_data.get('aspect_prototypes') or {}
        self.industry_benchmarks = config_data.get('industry_benchmarks') or {}
        self.skill_gaps_mapping = config_data.get('skill_gaps_mapping') or {}
        self.detection_keywords = config_data.get('detection_keywords') or []

    def get_aspect_seeds(self, aspect: str) -> List[str]:
        """Get seed phrases for a specific aspect

        Returns a new list (empty when the aspect is unknown), so callers can
        modify the result without changing this configuration.
        """
        return list(self.aspect_prototypes.get(aspect) or [])

    def get_all_aspect_seeds(self) -> Dict[str, List[str]]:
        """Get all aspect prototypes

        Both the dict and each seed list are copied, so callers can modify the
        result without changing this configuration.
        """
        return {
            aspect: list(seeds or [])
            for aspect, seeds in self.aspect_prototypes.items()
        }

    def get_skill_gap_info(self, skill: str) -> Optional[Dict]:
        """Get skill gap information for a skill, or None when it is not mapped"""
        return self.skill_gaps_mapping.get(skill)

    def get_benchmark(self, key: str, default=None):
        """Get industry benchmark value, or `default` when the key is absent"""
        return self.industry_benchmarks.get(key, default)


class DomainKnowledgeBase:
    """
    Domain Knowledge Base - loads and manages domain configurations
    Provides domain-specific aspect prototypes for the Fidelity Criteria Transformer

    Every ``*.json`` file in the domains directory is parsed into a
    DomainConfig and stored in ``self.domains`` under its ``domain_id``.
    """

    def __init__(self, domains_dir: Optional[str] = None):
        """
        Initialize DKB with domain configs from directory

        Args:
            domains_dir: Path to directory containing domain JSON files
                         Defaults to ./domains/ relative to this file
        """
        if domains_dir is None:
            domains_dir = os.path.join(os.path.dirname(__file__), 'domains')

        self.domains_dir = Path(domains_dir)
        self.domains: Dict[str, DomainConfig] = {}
        self._keyword_index: Dict[str, str] = {}  # keyword -> domain_id

        self._load_all_domains()
        self._build_keyword_index()

        logger.info(f"DomainKnowledgeBase initialized with {len(self.domains)} domains")

    def _load_all_domains(self):
        """
        Load all domain configs from directory

        A missing directory is logged and leaves ``self.domains`` empty. A file
        that fails to load is logged and skipped, so one bad config does not
        stop the remaining files from loading.
        """
        if not self.domains_dir.exists():
            logger.warning(f"Domains directory not found: {self.domains_dir}")
            return

        # Sorted so that which file wins for a duplicate domain_id or keyword
        # does not depend on the order the filesystem lists the directory.
        for json_file in sorted(self.domains_dir.glob('*.json')):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    config_data = json.load(f)

                domain_config = DomainConfig(config_data)
                if domain_config.domain_id in self.domains:
                    logger.warning(
                        f"Duplicate domain id '{domain_config.domain_id}' in "
                        f"{json_file}; replacing previously loaded config"
                    )
                self.domains[domain_config.domain_id] = domain_config
                logger.info(f"Loaded domain config: {domain_config.display_name}")

            except Exception as e:
                # Deliberately broad: loading is best-effort per file.
                logger.error(f"Failed to load domain config {json_file}: {e}")

    def _build_keyword_index(self):
        """
        Build keyword -> domain mapping for detection

        Keywords are lower-cased. When several domains list the same keyword,
        the domain processed last keeps it.
        """
        for domain_id, config in self.domains.items():
            for keyword in config.detection_keywords:
                self._keyword_index[keyword.lower()] = domain_id

    def get_domain(self, domain_id: str) -> Optional["DomainConfig"]:
        """Get domain config by ID, or None when the ID is not loaded"""
        return self.domains.get(domain_id)

    def list_domains(self) -> List[str]:
        """List all available domain IDs"""
        return list(self.domains.keys())

    def detect_domain(self, text: str, skills: Optional[List[str]] = None) -> Tuple[str, float]:
        """
        Detect most likely domain from text and/or skills

        Each detection keyword found in the text (case-insensitive substring
        match) adds 0.1 and each skill equal to a core skill (case-insensitive)
        adds 0.15; the per-domain score is capped at 1.0.

        Args:
            text: Text content (career goals, descriptions, etc.)
            skills: List of skill keywords

        Returns:
            (domain_id, confidence) tuple. The domain is 'general' when there
            is no input, no domain is loaded, or the best score is below 0.2.
        """
        if not text and not skills:
            return ('general', 0.0)

        text_lower = (text or '').lower()
        skills_lower = [s.lower() for s in (skills or [])]

        domain_scores = {}

        for domain_id, config in self.domains.items():
            score = 0.0

            # Keyword matching from text. An empty keyword is skipped because
            # '' is a substring of every text and would always add to the score.
            for keyword in config.detection_keywords:
                if keyword and keyword.lower() in text_lower:
                    score += 0.1

            # Skill matching (set gives constant-time membership tests)
            core_skills_lower = {s.lower() for s in config.core_skills}
            skill_matches = sum(1 for s in skills_lower if s in core_skills_lower)
            score += skill_matches * 0.15

            domain_scores[domain_id] = min(score, 1.0)

        if not domain_scores:
            return ('general', 0.0)

        # Return domain with highest score (first one wins on a tie)
        best_domain = max(domain_scores, key=domain_scores.get)
        confidence = domain_scores[best_domain]

        # Minimum confidence threshold
        if confidence < 0.2:
            return ('general', confidence)

        return (best_domain, confidence)

    def get_aspect_prototypes_for_domain(self, domain_id: str) -> Dict[str, List[str]]:
        """Get all aspect prototypes for a domain (empty dict if the domain is unknown)"""
        config = self.domains.get(domain_id)
        if config:
            return config.get_all_aspect_seeds()
        return {}

    def get_merged_prototypes(self, detected_domain: str,
                              base_aspects: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """
        Merge domain-specific prototypes with base aspects
        Domain-specific seeds are placed before the base seeds of the same aspect

        Args:
            detected_domain: Domain ID from detection
            base_aspects: Base aspect seeds (from TextModuleV2 defaults)

        Returns:
            Merged aspect seeds dictionary. Every list in it is a new object,
            so modifying the result changes neither `base_aspects` nor the
            stored domain configuration.
        """
        merged = {k: list(v) for k, v in base_aspects.items()}  # Deep copy

        domain_config = self.domains.get(detected_domain)
        if not domain_config:
            return merged

        # Merge domain-specific prototypes
        for aspect, seeds in domain_config.aspect_prototypes.items():
            # Copy the domain seeds so the result never shares a list with the
            # domain config; prepend them (higher priority) when the aspect
            # already has base seeds.
            merged[aspect] = list(seeds) + merged.get(aspect, [])

        return merged

    def analyze_skill_gaps(self, student_skills: Optional[List[str]],
                          domain_id: str) -> List[Dict]:
        """
        Analyze skill gaps for a student in a given domain

        A mapped skill counts as present when it contains, or is contained in,
        any of the student's skills (case-insensitive).

        Args:
            student_skills: List of skills the student has (None or blank
                            entries are ignored)
            domain_id: Target domain

        Returns:
            List of skill gap objects with recommendations, highest demand
            score first. Empty when the domain is unknown.
        """
        config = self.domains.get(domain_id)
        if not config:
            return []

        # Blank entries are dropped: '' is a substring of every skill name and
        # would otherwise mark every skill as already acquired.
        student_skills_lower = [
            s.lower() for s in (student_skills or []) if s and s.strip()
        ]
        gaps = []

        for skill, gap_info in config.skill_gaps_mapping.items():
            skill_lower = skill.lower()

            # Check if student has this skill
            has_skill = any(skill_lower in s or s in skill_lower
                            for s in student_skills_lower)
            if has_skill:
                continue

            # Single default so the reported score and the priority agree
            demand_score = gap_info.get('demand_score', 0.5)
            gaps.append({
                'skill': skill,
                'demand_score': demand_score,
                'recommended_courses': gap_info.get('courses', []),
                'certifications': gap_info.get('certifications', []),
                'priority': 'high' if demand_score > 0.7 else 'medium'
            })

        # Sort by demand score
        gaps.sort(key=lambda x: x['demand_score'], reverse=True)
        return gaps

    def get_domain_summary(self, domain_id: str) -> Optional[Dict]:
        """Get summary of a domain for reporting, or None when the domain is unknown"""
        config = self.domains.get(domain_id)
        if not config:
            return None

        return {
            'domain_id': config.domain_id,
            'display_name': config.display_name,
            'description': config.description,
            'core_skills_count': len(config.core_skills),
            'aspects_count': len(config.aspect_prototypes),
            'benchmarks': config.industry_benchmarks
        }


# Singleton instance (created lazily by get_domain_knowledge_base)
_dkb_instance: Optional[DomainKnowledgeBase] = None


def get_domain_knowledge_base(domains_dir: Optional[str] = None) -> DomainKnowledgeBase:
    """
    Get or create singleton DomainKnowledgeBase instance

    Args:
        domains_dir: Directory of domain JSON files. Only the call that
                     creates the instance uses it; later calls return the
                     existing instance unchanged.

    Returns:
        The shared DomainKnowledgeBase instance
    """
    global _dkb_instance

    if _dkb_instance is None:
        _dkb_instance = DomainKnowledgeBase(domains_dir)
    elif domains_dir is not None and Path(domains_dir) != _dkb_instance.domains_dir:
        # The existing instance is kept; make the ignored argument visible
        # instead of silently serving configs from a different directory.
        logger.warning(
            f"DomainKnowledgeBase already initialized from "
            f"{_dkb_instance.domains_dir}; ignoring domains_dir={domains_dir}"
        )

    return _dkb_instance