# FCT / services / domain_knowledge_base.py
# Parthnuwal7
# Adding analytical content
# 3d015cd
"""
Domain Knowledge Base - Dynamic domain-specific aspect prototypes and skill mapping
"""
import os
import json
import logging
from typing import Dict, List, Optional, Tuple
from pathlib import Path
logger = logging.getLogger(__name__)
class DomainConfig:
    """Single domain configuration built from one parsed config mapping.

    Every field is read from ``config_data`` with a fallback default, so a
    partially filled configuration still yields a usable object.
    """

    def __init__(self, config_data: Dict):
        """
        Populate the configuration from a parsed config mapping.

        Args:
            config_data: Mapping holding the domain fields. Keys that are
                missing, or present with a ``None`` value (JSON ``null``),
                fall back to the defaults used below.
        """
        def _value(key: str, default):
            # An explicit null must behave like a missing key; otherwise
            # list/dict attributes end up as None and break later iteration.
            value = config_data.get(key)
            return default if value is None else value

        self.domain_id = _value('domain_id', 'unknown')
        self.display_name = _value('display_name', 'Unknown Domain')
        self.description = _value('description', '')
        self.core_skills = _value('core_skills', [])
        self.aspect_prototypes = _value('aspect_prototypes', {})
        self.industry_benchmarks = _value('industry_benchmarks', {})
        self.skill_gaps_mapping = _value('skill_gaps_mapping', {})
        self.detection_keywords = _value('detection_keywords', [])

    def get_aspect_seeds(self, aspect: str) -> List[str]:
        """
        Get seed phrases for a specific aspect.

        Args:
            aspect: Aspect name to look up in ``aspect_prototypes``.

        Returns:
            A new list with the aspect's seeds (empty when the aspect is
            unknown). It is a copy, so callers may modify it freely.
        """
        return list(self.aspect_prototypes.get(aspect) or [])

    def get_all_aspect_seeds(self) -> Dict[str, List[str]]:
        """
        Get all aspect prototypes.

        Returns:
            A new dict mapping each aspect to a new list of its seeds.
            Both levels are copied so that editing the result cannot alter
            this configuration.
        """
        return {
            aspect: list(seeds or [])
            for aspect, seeds in self.aspect_prototypes.items()
        }

    def get_skill_gap_info(self, skill: str) -> Optional[Dict]:
        """
        Get the skill gap entry stored for ``skill``.

        Args:
            skill: Exact key to look up in ``skill_gaps_mapping``.

        Returns:
            The stored entry, or ``None`` when the skill has no entry.
        """
        return self.skill_gaps_mapping.get(skill)

    def get_benchmark(self, key: str, default=None):
        """
        Get an industry benchmark value.

        Args:
            key: Benchmark name to look up in ``industry_benchmarks``.
            default: Value returned when the benchmark is not defined.

        Returns:
            The stored benchmark value, or ``default``.
        """
        return self.industry_benchmarks.get(key, default)
class DomainKnowledgeBase:
    """
    Domain Knowledge Base - loads and manages domain configurations.

    Provides domain-specific aspect prototypes for the Fidelity Criteria
    Transformer, plus keyword/skill based domain detection and skill gap
    analysis on top of the loaded configurations.
    """

    # Scoring constants used by detect_domain().
    KEYWORD_WEIGHT = 0.1          # added per detection keyword found in the text
    SKILL_WEIGHT = 0.15           # added per supplied skill that is a core skill
    MIN_CONFIDENCE = 0.2          # below this the fallback domain is reported
    FALLBACK_DOMAIN = 'general'   # domain id reported when nothing matches well

    def __init__(self, domains_dir: Optional[str] = None):
        """
        Initialize DKB with domain configs from directory.

        Args:
            domains_dir: Path to directory containing domain JSON files.
                Defaults to ./domains/ relative to this file.
        """
        if domains_dir is None:
            domains_dir = os.path.join(os.path.dirname(__file__), 'domains')
        self.domains_dir = Path(domains_dir)
        self.domains: Dict[str, "DomainConfig"] = {}
        self._keyword_index: Dict[str, str] = {}  # keyword -> domain_id
        self._load_all_domains()
        self._build_keyword_index()
        logger.info(f"DomainKnowledgeBase initialized with {len(self.domains)} domains")

    @staticmethod
    def _lowered(items) -> List[str]:
        """Lower-case the string entries of ``items``, dropping non-strings and blanks."""
        return [
            item.lower()
            for item in (items or [])
            if isinstance(item, str) and item.strip()
        ]

    def _load_all_domains(self):
        """
        Load all domain configs from the domains directory.

        A file that cannot be read or parsed is logged and skipped so that
        one broken config does not prevent the others from loading.
        """
        if not self.domains_dir.exists():
            logger.warning(f"Domains directory not found: {self.domains_dir}")
            return
        # Sort the paths: glob order is filesystem dependent, and load order
        # decides both detection tie-breaks and which duplicate id wins.
        for json_file in sorted(self.domains_dir.glob('*.json')):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    config_data = json.load(f)
                domain_config = DomainConfig(config_data)
                if domain_config.domain_id in self.domains:
                    logger.warning(
                        f"Duplicate domain_id '{domain_config.domain_id}' in "
                        f"{json_file}; replacing previously loaded config"
                    )
                self.domains[domain_config.domain_id] = domain_config
                logger.info(f"Loaded domain config: {domain_config.display_name}")
            except Exception as e:
                # Best effort by design: report the file and carry on.
                logger.error(f"Failed to load domain config {json_file}: {e}")

    def _build_keyword_index(self):
        """
        Build keyword -> domain mapping for detection.

        Keywords are indexed lower-cased. When two domains declare the same
        keyword, the domain processed last keeps it.
        """
        for domain_id, config in self.domains.items():
            for keyword in self._lowered(config.detection_keywords):
                self._keyword_index[keyword] = domain_id

    def get_domain(self, domain_id: str) -> Optional["DomainConfig"]:
        """Get domain config by ID, or ``None`` when the ID is not loaded."""
        return self.domains.get(domain_id)

    def list_domains(self) -> List[str]:
        """List all available domain IDs."""
        return list(self.domains.keys())

    def detect_domain(self, text: str, skills: Optional[List[str]] = None) -> Tuple[str, float]:
        """
        Detect most likely domain from text and/or skills.

        Each detection keyword contained in the text adds ``KEYWORD_WEIGHT``
        and each supplied skill equal (case-insensitively) to a core skill
        adds ``SKILL_WEIGHT``; the per-domain score is capped at 1.0.

        Args:
            text: Text content (career goals, descriptions, etc.)
            skills: List of skill keywords

        Returns:
            (domain_id, confidence) tuple. The domain is ``FALLBACK_DOMAIN``
            when there is no input, no domain is loaded, or the best score
            is below ``MIN_CONFIDENCE``.
        """
        if not text and not skills:
            return (self.FALLBACK_DOMAIN, 0.0)
        text_lower = (text or '').lower()
        skills_lower = self._lowered(skills)
        domain_scores = {}
        for domain_id, config in self.domains.items():
            score = 0.0
            # Keyword matching from text (substring containment)
            for keyword in self._lowered(config.detection_keywords):
                if keyword in text_lower:
                    score += self.KEYWORD_WEIGHT
            # Skill matching; a set keeps each membership test O(1)
            core_skills_lower = set(self._lowered(config.core_skills))
            skill_matches = sum(1 for s in skills_lower if s in core_skills_lower)
            score += skill_matches * self.SKILL_WEIGHT
            domain_scores[domain_id] = min(score, 1.0)
        if not domain_scores:
            return (self.FALLBACK_DOMAIN, 0.0)
        # Highest score wins; on a tie the domain loaded first is kept
        best_domain = max(domain_scores, key=domain_scores.get)
        confidence = domain_scores[best_domain]
        # Minimum confidence threshold
        if confidence < self.MIN_CONFIDENCE:
            return (self.FALLBACK_DOMAIN, confidence)
        return (best_domain, confidence)

    def get_aspect_prototypes_for_domain(self, domain_id: str) -> Dict[str, List[str]]:
        """Get all aspect prototypes for a domain (empty dict if unknown)."""
        config = self.domains.get(domain_id)
        if config is not None:
            return config.get_all_aspect_seeds()
        return {}

    def get_merged_prototypes(self, detected_domain: str,
                              base_aspects: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """
        Merge domain-specific prototypes with base aspects.

        Domain-specific seeds are placed in front of the base seeds of the
        same aspect; aspects that only the domain defines are added as-is.

        Args:
            detected_domain: Domain ID from detection
            base_aspects: Base aspect seeds (from TextModuleV2 defaults)

        Returns:
            Merged aspect seeds dictionary. Every list in it is newly
            created, so neither ``base_aspects`` nor the stored domain
            configuration is affected when the caller modifies the result.
        """
        # Copy each base list so the caller's input is never mutated
        merged = {k: list(v) for k, v in (base_aspects or {}).items()}
        domain_config = self.domains.get(detected_domain)
        if domain_config is None:
            return merged
        for aspect, seeds in domain_config.aspect_prototypes.items():
            # Domain seeds go first (higher priority). list(...) makes sure
            # the config's own list is never handed out to the caller.
            merged[aspect] = list(seeds or []) + merged.get(aspect, [])
        return merged

    def analyze_skill_gaps(self, student_skills: List[str],
                           domain_id: str) -> List[Dict]:
        """
        Analyze skill gaps for a student in a given domain.

        A domain skill counts as covered when its lower-cased name contains,
        or is contained in, one of the student's lower-cased skills.

        Args:
            student_skills: List of skills the student has
            domain_id: Target domain

        Returns:
            List of skill gap objects with recommendations, sorted by
            descending demand score. Empty when the domain is unknown.
        """
        config = self.domains.get(domain_id)
        if config is None:
            return []
        # Blank entries are dropped: '' is a substring of every skill name
        # and would otherwise mark every skill as already covered.
        student_skills_lower = self._lowered(student_skills)
        gaps = []
        for skill, gap_info in config.skill_gaps_mapping.items():
            skill_lower = skill.lower()
            # Check if student has this skill
            has_skill = any(skill_lower in s or s in skill_lower
                            for s in student_skills_lower)
            if has_skill:
                continue
            gap_info = gap_info or {}
            demand_score = gap_info.get('demand_score', 0.5)
            gaps.append({
                'skill': skill,
                'demand_score': demand_score,
                'recommended_courses': gap_info.get('courses', []),
                'certifications': gap_info.get('certifications', []),
                'priority': 'high' if demand_score > 0.7 else 'medium'
            })
        # Sort by demand score, most demanded first
        gaps.sort(key=lambda x: x['demand_score'], reverse=True)
        return gaps

    def get_domain_summary(self, domain_id: str) -> Optional[Dict]:
        """
        Get summary of a domain for reporting.

        Args:
            domain_id: Domain to summarize

        Returns:
            Dict with the domain's id, display name, description, number of
            core skills, number of aspects and its benchmarks, or ``None``
            when the domain is unknown.
        """
        config = self.domains.get(domain_id)
        if config is None:
            return None
        return {
            'domain_id': config.domain_id,
            'display_name': config.display_name,
            'description': config.description,
            'core_skills_count': len(config.core_skills),
            'aspects_count': len(config.aspect_prototypes),
            'benchmarks': config.industry_benchmarks
        }
# Singleton instance, created lazily by get_domain_knowledge_base()
_dkb_instance: Optional[DomainKnowledgeBase] = None


def get_domain_knowledge_base(domains_dir: Optional[str] = None) -> DomainKnowledgeBase:
    """
    Get or create singleton DomainKnowledgeBase instance.

    Args:
        domains_dir: Directory passed to DomainKnowledgeBase when the
            instance is first created. On later calls the existing instance
            is returned unchanged; a differing directory is only reported
            with a warning.

    Returns:
        The shared DomainKnowledgeBase instance.
    """
    global _dkb_instance
    if _dkb_instance is None:
        _dkb_instance = DomainKnowledgeBase(domains_dir)
    elif domains_dir is not None and Path(domains_dir) != _dkb_instance.domains_dir:
        # The singleton is already bound to a directory; make the ignored
        # argument visible instead of dropping it silently.
        logger.warning(
            f"DomainKnowledgeBase already initialized from {_dkb_instance.domains_dir}; "
            f"ignoring domains_dir={domains_dir}"
        )
    return _dkb_instance