# conceptvector/analysis/coverage_analyzer.py
# Author: Tawhid Bin Omar
# Commit 892d4dd: cleaned up code comments and docs
"""
Coverage Analyzer Module
Analyzes how well user explanation covers canonical concept graph
"""
from typing import List, Dict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class CoverageAnalyzer:
    """Scores how thoroughly a user's explanation covers a canonical concept graph.

    Matching is keyword-based: a graph node counts as "mentioned" when its
    label appears (case-insensitively) in the explanation, and claim length
    is used as a cheap proxy for depth of coverage. Claim embeddings are
    accepted for interface stability but are not consulted yet.
    """

    def __init__(self):
        # No models or external resources to load; ready immediately.
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the analyzer can accept work."""
        return self._ready

    async def analyze_coverage(
        self,
        user_claims: List[Dict],
        canonical_graph: Dict,
        explanation: str
    ) -> Dict:
        """
        Analyze concept coverage by matching user claims to graph nodes.

        Args:
            user_claims: list of dicts, each with a 'text' key. An
                'embedding' key is optional and currently unused.
            canonical_graph: dict with a 'nodes' list; each node carries
                'id', 'label', and optionally 'type' ('prerequisite' nodes
                are weighted double and flagged as high severity when missing).
            explanation: the user's full explanation text.

        Returns:
            {
                'coverage_score': float (0-100),
                'node_coverage': Dict[node_id, status],
                'missing_concepts': List[Dict],
                'weak_links': List[Dict],
                'name_dropping': List[str]
            }
        """
        # Use .get() so claims without an 'embedding' key don't raise
        # KeyError — embeddings are not used by the keyword matcher yet.
        claim_embeddings = [claim.get('embedding') for claim in user_claims]
        claim_texts = [claim['text'] for claim in user_claims]

        node_coverage = {}
        missing_concepts = []
        weak_links = []

        for node in canonical_graph['nodes']:
            node_id = node['id']
            node_label = node['label']

            # Classify each node as covered / weak / missing.
            coverage_status = self._check_node_coverage(
                node_label=node_label,
                claim_texts=claim_texts,
                claim_embeddings=claim_embeddings,
                explanation=explanation
            )
            node_coverage[node_id] = coverage_status

            if coverage_status['status'] == 'missing':
                # Missing prerequisites are more serious than missing components.
                severity = 'high' if node.get('type') == 'prerequisite' else 'medium'
                missing_concepts.append({
                    'concept': node_label,
                    'severity': severity,
                    'description': f"This is a key {'prerequisite' if node.get('type') == 'prerequisite' else 'component'} for understanding the concept."
                })
            elif coverage_status['status'] == 'weak':
                weak_links.append({
                    'concept': node_label,
                    'user_quote': coverage_status.get('user_quote', ''),
                    'suggestion': 'Explain the mechanism or relationship, not just mention the term.'
                })

        coverage_score = self._calculate_coverage_score(node_coverage, canonical_graph)

        # Concepts that were mentioned but never genuinely explained.
        name_dropping = self._detect_name_dropping(claim_texts, node_coverage)

        return {
            'coverage_score': coverage_score,
            'node_coverage': node_coverage,
            'missing_concepts': missing_concepts,
            'weak_links': weak_links,
            'name_dropping': name_dropping
        }

    def _check_node_coverage(
        self,
        node_label: str,
        claim_texts: List[str],
        claim_embeddings: List[List[float]],
        explanation: str
    ) -> Dict:
        """Check if and how well a concept node is covered.

        Keyword matching only for now; `claim_embeddings` is accepted so the
        call signature is stable once semantic similarity is added, but it is
        currently ignored.

        Returns a dict with 'status' ('covered'/'weak'/'missing'),
        'user_quote' (best matching claim text or None), and
        'coverage_strength' (0.0-1.0).
        """
        node_lower = node_label.lower()
        explanation_lower = explanation.lower()

        # Not mentioned anywhere in the explanation -> missing outright.
        if node_lower not in explanation_lower:
            return {
                'status': 'missing',
                'user_quote': None,
                'coverage_strength': 0.0
            }

        # Find the best matching claim. Heuristic: a longer claim that
        # contains the label suggests deeper coverage (capped at 15 words).
        best_match_idx = None
        best_score = 0.0
        for idx, claim_text in enumerate(claim_texts):
            if node_lower in claim_text.lower():
                coverage_strength = min(1.0, len(claim_text.split()) / 15.0)
                if coverage_strength > best_score:
                    best_score = coverage_strength
                    best_match_idx = idx

        if best_match_idx is not None:
            user_quote = claim_texts[best_match_idx]
            # Thresholds: >0.6 covered, >0.2 weak, else effectively missing.
            if best_score > 0.6:
                status = 'covered'
            elif best_score > 0.2:
                status = 'weak'
            else:
                status = 'missing'
            return {
                'status': status,
                'user_quote': user_quote,
                'coverage_strength': best_score
            }

        # Mentioned in the explanation but absent from every claim:
        # treat as name-dropping (weak, token strength).
        return {
            'status': 'weak',
            'user_quote': None,
            'coverage_strength': 0.1
        }

    def _calculate_coverage_score(self, node_coverage: Dict, canonical_graph: Dict) -> float:
        """Calculate the overall coverage score (0-100), weighted by node type.

        Prerequisite nodes count double; weak coverage earns 40% of a node's
        weight; missing nodes earn nothing.
        """
        if not node_coverage:
            return 0.0

        total_weight = 0.0
        covered_weight = 0.0
        for node in canonical_graph['nodes']:
            node_id = node['id']
            # Prerequisites are more important than ordinary components.
            weight = 2.0 if node.get('type') == 'prerequisite' else 1.0
            total_weight += weight

            coverage = node_coverage.get(node_id, {})
            status = coverage.get('status', 'missing')
            if status == 'covered':
                covered_weight += weight
            elif status == 'weak':
                covered_weight += weight * 0.4

        return (covered_weight / total_weight * 100) if total_weight > 0 else 0.0

    def _detect_name_dropping(self, claim_texts: List[str], node_coverage: Dict) -> List[str]:
        """Return up to 3 user quotes where a concept was mentioned but barely explained.

        A quote qualifies when its node's coverage strength is below 0.3 and a
        user quote exists. (The original duplicated the user_quote check; the
        redundant inner condition has been removed — results are identical.)
        """
        name_dropped = [
            coverage['user_quote']
            for coverage in node_coverage.values()
            if coverage.get('coverage_strength', 0) < 0.3 and coverage.get('user_quote')
        ]
        return name_dropped[:3]  # Limit to top 3