File size: 6,559 Bytes
8176754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892d4dd
 
8176754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Coverage Analyzer Module
Analyzes how well user explanation covers canonical concept graph
"""

from typing import List, Dict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class CoverageAnalyzer:
    """Analyze how well a user's explanation covers a canonical concept graph.

    Matching is currently keyword-based: a concept counts as mentioned when
    its label appears (case-insensitively) as a substring of the explanation,
    and coverage strength is estimated from the word count of the best
    matching claim. Claim embeddings are accepted but not yet used for
    semantic matching.
    """

    def __init__(self):
        # No models or external resources to load; ready immediately.
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the analyzer can accept requests."""
        return self._ready

    async def analyze_coverage(
        self,
        user_claims: List[Dict],
        canonical_graph: Dict,
        explanation: str
    ) -> Dict:
        """
        Analyze concept coverage by matching user claims to graph nodes

        Args:
            user_claims: dicts with at least 'text' and 'embedding' keys.
            canonical_graph: dict with a 'nodes' list; each node carries
                'id', 'label', and optionally 'type' — 'prerequisite'
                nodes are weighted double and flagged with high severity.
            explanation: the user's full explanation text.

        Returns:
            {
                'coverage_score': float (0-100),
                'node_coverage': Dict[node_id, status],
                'missing_concepts': List[Dict],
                'weak_links': List[Dict],
                'name_dropping': List[str]
            }
        """
        # Embeddings are threaded through for the future semantic matcher;
        # only the claim texts participate in the current heuristic.
        claim_embeddings = [claim['embedding'] for claim in user_claims]
        claim_texts = [claim['text'] for claim in user_claims]

        node_coverage = {}
        missing_concepts = []
        weak_links = []

        for node in canonical_graph['nodes']:
            node_id = node['id']
            node_label = node['label']

            coverage_status = self._check_node_coverage(
                node_label=node_label,
                claim_texts=claim_texts,
                claim_embeddings=claim_embeddings,
                explanation=explanation
            )

            node_coverage[node_id] = coverage_status

            if coverage_status['status'] == 'missing':
                # A missing prerequisite is a bigger gap than a missing
                # supporting component.
                severity = 'high' if node.get('type') == 'prerequisite' else 'medium'
                missing_concepts.append({
                    'concept': node_label,
                    'severity': severity,
                    'description': f"This is a key {'prerequisite' if node.get('type') == 'prerequisite' else 'component'} for understanding the concept."
                })
            elif coverage_status['status'] == 'weak':
                weak_links.append({
                    'concept': node_label,
                    'user_quote': coverage_status.get('user_quote', ''),
                    'suggestion': 'Explain the mechanism or relationship, not just mention the term.'
                })

        # Importance-weighted score over all graph nodes.
        coverage_score = self._calculate_coverage_score(node_coverage, canonical_graph)

        # Terms the user mentioned but barely explained.
        name_dropping = self._detect_name_dropping(claim_texts, node_coverage)

        return {
            'coverage_score': coverage_score,
            'node_coverage': node_coverage,
            'missing_concepts': missing_concepts,
            'weak_links': weak_links,
            'name_dropping': name_dropping
        }

    def _check_node_coverage(
        self,
        node_label: str,
        claim_texts: List[str],
        claim_embeddings: List[List[float]],
        explanation: str
    ) -> Dict:
        """Check if and how well a concept node is covered.

        Args:
            node_label: concept label to look for.
            claim_texts: user claim texts to rank as supporting quotes.
            claim_embeddings: currently unused; reserved for semantic
                similarity matching against the node label.
            explanation: full explanation text, used for the mention check.

        Returns:
            {'status': 'covered'|'weak'|'missing',
             'user_quote': str | None,
             'coverage_strength': float in [0, 1]}
        """
        # Started with just keyword matching, works surprisingly well;
        # might add semantic similarity later if needed.
        node_lower = node_label.lower()
        explanation_lower = explanation.lower()

        # Not mentioned anywhere in the explanation -> missing outright.
        # NOTE(review): substring match can false-positive on partial words
        # (e.g. "loss" inside "glossary") — acceptable for the heuristic.
        if node_lower not in explanation_lower:
            return {
                'status': 'missing',
                'user_quote': None,
                'coverage_strength': 0.0
            }

        # Find the best supporting claim. Proxy for depth: longer claims
        # are assumed to explain more (caps at 15 words -> strength 1.0).
        best_match_idx = None
        best_score = 0.0

        for idx, claim_text in enumerate(claim_texts):
            if node_lower in claim_text.lower():
                coverage_strength = min(1.0, len(claim_text.split()) / 15.0)
                if coverage_strength > best_score:
                    best_score = coverage_strength
                    best_match_idx = idx

        if best_match_idx is not None:
            user_quote = claim_texts[best_match_idx]

            # Thresholds: > 0.6 covered, > 0.2 weak, otherwise the mention
            # is too thin to count and is treated as missing.
            if best_score > 0.6:
                status = 'covered'
            elif best_score > 0.2:
                status = 'weak'
            else:
                status = 'missing'

            return {
                'status': status,
                'user_quote': user_quote,
                'coverage_strength': best_score
            }

        # Mentioned in the explanation but backing no claim (name-dropping).
        return {
            'status': 'weak',
            'user_quote': None,
            'coverage_strength': 0.1
        }

    def _calculate_coverage_score(self, node_coverage: Dict, canonical_graph: Dict) -> float:
        """Calculate the overall coverage score (0-100).

        Prerequisite nodes weigh 2.0, others 1.0; 'covered' earns full
        weight, 'weak' earns 40% of it.
        """
        if not node_coverage:
            return 0.0

        total_weight = 0.0
        covered_weight = 0.0

        for node in canonical_graph['nodes']:
            node_id = node['id']

            # Prerequisites are more important
            weight = 2.0 if node.get('type') == 'prerequisite' else 1.0
            total_weight += weight

            coverage = node_coverage.get(node_id, {})
            status = coverage.get('status', 'missing')

            if status == 'covered':
                covered_weight += weight
            elif status == 'weak':
                covered_weight += weight * 0.4

        return (covered_weight / total_weight * 100) if total_weight > 0 else 0.0

    def _detect_name_dropping(self, claim_texts: List[str], node_coverage: Dict) -> List[str]:
        """Detect concepts that are mentioned but not explained.

        Args:
            claim_texts: currently unused; kept for signature stability.
            node_coverage: per-node results from _check_node_coverage.

        Returns:
            Up to 3 user quotes whose coverage strength fell below 0.3.
        """
        name_dropped = []

        for node_id, coverage in node_coverage.items():
            # Fix: the quote presence was tested twice (outer condition and a
            # redundant nested if); a single guard is equivalent.
            if coverage.get('coverage_strength', 0) < 0.3 and coverage.get('user_quote'):
                name_dropped.append(coverage['user_quote'])

        return name_dropped[:3]  # Limit to top 3