File size: 6,559 Bytes
8176754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892d4dd
 
8176754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Coverage Analyzer Module
Analyzes how well user explanation covers canonical concept graph
"""

from typing import List, Dict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class CoverageAnalyzer:
    """Analyze how well a user's explanation covers a canonical concept graph.

    Matching is currently keyword-based: a concept counts as mentioned when
    its label appears (case-insensitively) as a substring of the explanation,
    and coverage strength is estimated from the word count of the best
    matching claim. Claim embeddings are accepted but not yet used for
    semantic matching.
    """

    def __init__(self):
        # No models or external resources to load; ready immediately.
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the analyzer can accept requests."""
        return self._ready

    async def analyze_coverage(
        self,
        user_claims: List[Dict],
        canonical_graph: Dict,
        explanation: str
    ) -> Dict:
        """
        Analyze concept coverage by matching user claims to graph nodes

        Args:
            user_claims: dicts with at least 'text' and 'embedding' keys.
            canonical_graph: dict with a 'nodes' list; each node carries
                'id', 'label', and optionally 'type' — 'prerequisite'
                nodes are weighted double and flagged with high severity.
            explanation: the user's full explanation text.

        Returns:
            {
                'coverage_score': float (0-100),
                'node_coverage': Dict[node_id, status],
                'missing_concepts': List[Dict],
                'weak_links': List[Dict],
                'name_dropping': List[str]
            }
        """
        # Embeddings are threaded through for the future semantic matcher;
        # only the claim texts participate in the current heuristic.
        claim_embeddings = [claim['embedding'] for claim in user_claims]
        claim_texts = [claim['text'] for claim in user_claims]

        node_coverage = {}
        missing_concepts = []
        weak_links = []

        for node in canonical_graph['nodes']:
            node_id = node['id']
            node_label = node['label']

            coverage_status = self._check_node_coverage(
                node_label=node_label,
                claim_texts=claim_texts,
                claim_embeddings=claim_embeddings,
                explanation=explanation
            )

            node_coverage[node_id] = coverage_status

            if coverage_status['status'] == 'missing':
                # A missing prerequisite is a bigger gap than a missing
                # supporting component.
                severity = 'high' if node.get('type') == 'prerequisite' else 'medium'
                missing_concepts.append({
                    'concept': node_label,
                    'severity': severity,
                    'description': f"This is a key {'prerequisite' if node.get('type') == 'prerequisite' else 'component'} for understanding the concept."
                })
            elif coverage_status['status'] == 'weak':
                weak_links.append({
                    'concept': node_label,
                    'user_quote': coverage_status.get('user_quote', ''),
                    'suggestion': 'Explain the mechanism or relationship, not just mention the term.'
                })

        # Importance-weighted score over all graph nodes.
        coverage_score = self._calculate_coverage_score(node_coverage, canonical_graph)

        # Terms the user mentioned but barely explained.
        name_dropping = self._detect_name_dropping(claim_texts, node_coverage)

        return {
            'coverage_score': coverage_score,
            'node_coverage': node_coverage,
            'missing_concepts': missing_concepts,
            'weak_links': weak_links,
            'name_dropping': name_dropping
        }

    def _check_node_coverage(
        self,
        node_label: str,
        claim_texts: List[str],
        claim_embeddings: List[List[float]],
        explanation: str
    ) -> Dict:
        """Check if and how well a concept node is covered.

        Args:
            node_label: concept label to look for.
            claim_texts: user claim texts to rank as supporting quotes.
            claim_embeddings: currently unused; reserved for semantic
                similarity matching against the node label.
            explanation: full explanation text, used for the mention check.

        Returns:
            {'status': 'covered'|'weak'|'missing',
             'user_quote': str | None,
             'coverage_strength': float in [0, 1]}
        """
        # Started with just keyword matching, works surprisingly well;
        # might add semantic similarity later if needed.
        node_lower = node_label.lower()
        explanation_lower = explanation.lower()

        # Not mentioned anywhere in the explanation -> missing outright.
        # NOTE(review): substring match can false-positive on partial words
        # (e.g. "loss" inside "glossary") — acceptable for the heuristic.
        if node_lower not in explanation_lower:
            return {
                'status': 'missing',
                'user_quote': None,
                'coverage_strength': 0.0
            }

        # Find the best supporting claim. Proxy for depth: longer claims
        # are assumed to explain more (caps at 15 words -> strength 1.0).
        best_match_idx = None
        best_score = 0.0

        for idx, claim_text in enumerate(claim_texts):
            if node_lower in claim_text.lower():
                coverage_strength = min(1.0, len(claim_text.split()) / 15.0)
                if coverage_strength > best_score:
                    best_score = coverage_strength
                    best_match_idx = idx

        if best_match_idx is not None:
            user_quote = claim_texts[best_match_idx]

            # Thresholds: > 0.6 covered, > 0.2 weak, otherwise the mention
            # is too thin to count and is treated as missing.
            if best_score > 0.6:
                status = 'covered'
            elif best_score > 0.2:
                status = 'weak'
            else:
                status = 'missing'

            return {
                'status': status,
                'user_quote': user_quote,
                'coverage_strength': best_score
            }

        # Mentioned in the explanation but backing no claim (name-dropping).
        return {
            'status': 'weak',
            'user_quote': None,
            'coverage_strength': 0.1
        }

    def _calculate_coverage_score(self, node_coverage: Dict, canonical_graph: Dict) -> float:
        """Calculate the overall coverage score (0-100).

        Prerequisite nodes weigh 2.0, others 1.0; 'covered' earns full
        weight, 'weak' earns 40% of it.
        """
        if not node_coverage:
            return 0.0

        total_weight = 0.0
        covered_weight = 0.0

        for node in canonical_graph['nodes']:
            node_id = node['id']

            # Prerequisites are more important
            weight = 2.0 if node.get('type') == 'prerequisite' else 1.0
            total_weight += weight

            coverage = node_coverage.get(node_id, {})
            status = coverage.get('status', 'missing')

            if status == 'covered':
                covered_weight += weight
            elif status == 'weak':
                covered_weight += weight * 0.4

        return (covered_weight / total_weight * 100) if total_weight > 0 else 0.0

    def _detect_name_dropping(self, claim_texts: List[str], node_coverage: Dict) -> List[str]:
        """Detect concepts that are mentioned but not explained.

        Args:
            claim_texts: currently unused; kept for signature stability.
            node_coverage: per-node results from _check_node_coverage.

        Returns:
            Up to 3 user quotes whose coverage strength fell below 0.3.
        """
        name_dropped = []

        for node_id, coverage in node_coverage.items():
            # Fix: the quote presence was tested twice (outer condition and a
            # redundant nested if); a single guard is equivalent.
            if coverage.get('coverage_strength', 0) < 0.3 and coverage.get('user_quote'):
                name_dropped.append(coverage['user_quote'])

        return name_dropped[:3]  # Limit to top 3