Spaces:
Sleeping
Sleeping
| """ | |
| Coverage Analyzer Module | |
| Analyzes how well user explanation covers canonical concept graph | |
| """ | |
| from typing import List, Dict | |
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
class CoverageAnalyzer:
    """Match user explanation claims against a canonical concept graph.

    Scores how thoroughly an explanation covers each node of the graph,
    flagging concepts that are missing, weakly explained, or merely
    name-dropped (mentioned without explanation).
    """

    def __init__(self):
        # No model/resource loading needed yet; analyzer is ready immediately.
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the analyzer can accept requests."""
        return self._ready

    async def analyze_coverage(
        self,
        user_claims: List[Dict],
        canonical_graph: Dict,
        explanation: str
    ) -> Dict:
        """
        Analyze concept coverage by matching user claims to graph nodes.

        Args:
            user_claims: dicts with at least 'text' and 'embedding' keys.
            canonical_graph: dict with a 'nodes' list; each node carries
                'id', 'label', and optionally 'type' — 'prerequisite'
                nodes are weighted double in the score and flagged with
                higher severity when missing.
            explanation: the user's full explanation text.

        Returns:
            {
                'coverage_score': float (0-100),
                'node_coverage': Dict[node_id, status],
                'missing_concepts': List[Dict],
                'weak_links': List[Dict],
                'name_dropping': List[str]
            }
        """
        # Extract embeddings and texts from user claims.
        # NOTE(review): embeddings are currently passed through unused by
        # the keyword-based matcher below — reserved for semantic matching.
        claim_embeddings = [claim['embedding'] for claim in user_claims]
        claim_texts = [claim['text'] for claim in user_claims]

        # Analyze coverage for each node
        node_coverage = {}
        missing_concepts = []
        weak_links = []

        for node in canonical_graph['nodes']:
            node_id = node['id']
            node_label = node['label']

            # Check if and how well the concept is mentioned
            coverage_status = self._check_node_coverage(
                node_label=node_label,
                claim_texts=claim_texts,
                claim_embeddings=claim_embeddings,
                explanation=explanation
            )
            node_coverage[node_id] = coverage_status

            if coverage_status['status'] == 'missing':
                # Missing prerequisites are more serious than missing components.
                severity = 'high' if node.get('type') == 'prerequisite' else 'medium'
                missing_concepts.append({
                    'concept': node_label,
                    'severity': severity,
                    'description': f"This is a key {'prerequisite' if node.get('type') == 'prerequisite' else 'component'} for understanding the concept."
                })
            elif coverage_status['status'] == 'weak':
                weak_links.append({
                    'concept': node_label,
                    'user_quote': coverage_status.get('user_quote', ''),
                    'suggestion': 'Explain the mechanism or relationship, not just mention the term.'
                })

        # Calculate overall coverage score
        coverage_score = self._calculate_coverage_score(node_coverage, canonical_graph)

        # Detect name-dropping (mentioned but not explained)
        name_dropping = self._detect_name_dropping(claim_texts, node_coverage)

        return {
            'coverage_score': coverage_score,
            'node_coverage': node_coverage,
            'missing_concepts': missing_concepts,
            'weak_links': weak_links,
            'name_dropping': name_dropping
        }

    def _check_node_coverage(
        self,
        node_label: str,
        claim_texts: List[str],
        claim_embeddings: List[List[float]],
        explanation: str
    ) -> Dict:
        """Check if and how well a concept node is covered.

        Keyword matching only for now; `claim_embeddings` is accepted for
        interface stability but not yet used (semantic similarity TODO).

        Returns a dict with 'status' ('covered' | 'weak' | 'missing'),
        'user_quote' (best matching claim text or None), and
        'coverage_strength' (0.0-1.0).
        """
        node_lower = node_label.lower()
        explanation_lower = explanation.lower()

        # Not mentioned anywhere in the explanation at all.
        if node_lower not in explanation_lower:
            return {
                'status': 'missing',
                'user_quote': None,
                'coverage_strength': 0.0
            }

        # Find the best matching claim. Heuristic: a longer claim that
        # mentions the term is assumed to explain it better (capped at
        # 15 words for full strength).
        best_match_idx = None
        best_score = 0.0

        for idx, claim_text in enumerate(claim_texts):
            if node_lower in claim_text.lower():
                coverage_strength = min(1.0, len(claim_text.split()) / 15.0)
                if coverage_strength > best_score:
                    best_score = coverage_strength
                    best_match_idx = idx

        if best_match_idx is not None:
            user_quote = claim_texts[best_match_idx]
            # Thresholds chosen empirically: > 0.6 (≥ 10 words) counts as
            # covered, > 0.2 (≥ 4 words) as weak.
            # NOTE(review): a ≤ 3-word claim yields status 'missing' even
            # though a user_quote exists, so the same concept can appear in
            # both missing_concepts and name_dropping — confirm intended.
            if best_score > 0.6:
                status = 'covered'
            elif best_score > 0.2:
                status = 'weak'
            else:
                status = 'missing'
            return {
                'status': status,
                'user_quote': user_quote,
                'coverage_strength': best_score
            }

        # Mentioned in the explanation but not in any claim (name-dropping).
        return {
            'status': 'weak',
            'user_quote': None,
            'coverage_strength': 0.1
        }

    def _calculate_coverage_score(self, node_coverage: Dict, canonical_graph: Dict) -> float:
        """Calculate the overall coverage score (0-100).

        Prerequisite nodes weigh 2.0, others 1.0; 'covered' earns full
        weight, 'weak' earns 40% of it.
        """
        if not node_coverage:
            return 0.0

        total_weight = 0.0
        covered_weight = 0.0

        for node in canonical_graph['nodes']:
            node_id = node['id']
            weight = 2.0 if node.get('type') == 'prerequisite' else 1.0
            total_weight += weight

            coverage = node_coverage.get(node_id, {})
            status = coverage.get('status', 'missing')

            if status == 'covered':
                covered_weight += weight
            elif status == 'weak':
                covered_weight += weight * 0.4

        return (covered_weight / total_weight * 100) if total_weight > 0 else 0.0

    def _detect_name_dropping(self, claim_texts: List[str], node_coverage: Dict) -> List[str]:
        """Detect concepts that are mentioned but not explained.

        A node counts as name-dropped when a matching user quote exists but
        its coverage strength is below 0.3. `claim_texts` is accepted for
        interface stability but not used by the current heuristic.
        Returns at most the first 3 offending quotes, in node order.
        """
        name_dropped = []
        # Fixed: the original checked coverage.get('user_quote') twice and
        # iterated .items() without using the key.
        for coverage in node_coverage.values():
            if coverage.get('coverage_strength', 0) < 0.3 and coverage.get('user_quote'):
                name_dropped.append(coverage['user_quote'])
        return name_dropped[:3]  # Limit to top 3