# conceptvector/analysis/coverage_analyzer.py
# Author: Tawhid Bin Omar
# Commit 892d4dd: cleaned up code comments and docs
"""
Coverage Analyzer Module
Analyzes how well user explanation covers canonical concept graph
"""
from typing import List, Dict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class CoverageAnalyzer:
    """Scores how thoroughly a user's explanation covers a canonical concept graph.

    Matching is keyword-based: a graph node counts as "mentioned" when its
    label appears (case-insensitively) in the explanation, and claim length
    is used as a cheap proxy for depth of coverage. Claim embeddings are
    accepted for interface stability but are not consulted yet.
    """

    def __init__(self):
        # No models or external resources to load; ready immediately.
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the analyzer can accept work."""
        return self._ready

    async def analyze_coverage(
        self,
        user_claims: List[Dict],
        canonical_graph: Dict,
        explanation: str
    ) -> Dict:
        """
        Analyze concept coverage by matching user claims to graph nodes.

        Args:
            user_claims: list of dicts, each with a 'text' key. An
                'embedding' key is optional and currently unused.
            canonical_graph: dict with a 'nodes' list; each node carries
                'id', 'label', and optionally 'type' ('prerequisite' nodes
                are weighted double and flagged as high severity when missing).
            explanation: the user's full explanation text.

        Returns:
            {
                'coverage_score': float (0-100),
                'node_coverage': Dict[node_id, status],
                'missing_concepts': List[Dict],
                'weak_links': List[Dict],
                'name_dropping': List[str]
            }
        """
        # Use .get() so claims without an 'embedding' key don't raise
        # KeyError — embeddings are not used by the keyword matcher yet.
        claim_embeddings = [claim.get('embedding') for claim in user_claims]
        claim_texts = [claim['text'] for claim in user_claims]

        node_coverage = {}
        missing_concepts = []
        weak_links = []

        for node in canonical_graph['nodes']:
            node_id = node['id']
            node_label = node['label']

            # Classify each node as covered / weak / missing.
            coverage_status = self._check_node_coverage(
                node_label=node_label,
                claim_texts=claim_texts,
                claim_embeddings=claim_embeddings,
                explanation=explanation
            )
            node_coverage[node_id] = coverage_status

            if coverage_status['status'] == 'missing':
                # Missing prerequisites are more serious than missing components.
                severity = 'high' if node.get('type') == 'prerequisite' else 'medium'
                missing_concepts.append({
                    'concept': node_label,
                    'severity': severity,
                    'description': f"This is a key {'prerequisite' if node.get('type') == 'prerequisite' else 'component'} for understanding the concept."
                })
            elif coverage_status['status'] == 'weak':
                weak_links.append({
                    'concept': node_label,
                    'user_quote': coverage_status.get('user_quote', ''),
                    'suggestion': 'Explain the mechanism or relationship, not just mention the term.'
                })

        coverage_score = self._calculate_coverage_score(node_coverage, canonical_graph)

        # Concepts that were mentioned but never genuinely explained.
        name_dropping = self._detect_name_dropping(claim_texts, node_coverage)

        return {
            'coverage_score': coverage_score,
            'node_coverage': node_coverage,
            'missing_concepts': missing_concepts,
            'weak_links': weak_links,
            'name_dropping': name_dropping
        }

    def _check_node_coverage(
        self,
        node_label: str,
        claim_texts: List[str],
        claim_embeddings: List[List[float]],
        explanation: str
    ) -> Dict:
        """Check if and how well a concept node is covered.

        Keyword matching only for now; `claim_embeddings` is accepted so the
        call signature is stable once semantic similarity is added, but it is
        currently ignored.

        Returns a dict with 'status' ('covered'/'weak'/'missing'),
        'user_quote' (best matching claim text or None), and
        'coverage_strength' (0.0-1.0).
        """
        node_lower = node_label.lower()
        explanation_lower = explanation.lower()

        # Not mentioned anywhere in the explanation -> missing outright.
        if node_lower not in explanation_lower:
            return {
                'status': 'missing',
                'user_quote': None,
                'coverage_strength': 0.0
            }

        # Find the best matching claim. Heuristic: a longer claim that
        # contains the label suggests deeper coverage (capped at 15 words).
        best_match_idx = None
        best_score = 0.0
        for idx, claim_text in enumerate(claim_texts):
            if node_lower in claim_text.lower():
                coverage_strength = min(1.0, len(claim_text.split()) / 15.0)
                if coverage_strength > best_score:
                    best_score = coverage_strength
                    best_match_idx = idx

        if best_match_idx is not None:
            user_quote = claim_texts[best_match_idx]
            # Thresholds: >0.6 covered, >0.2 weak, else effectively missing.
            if best_score > 0.6:
                status = 'covered'
            elif best_score > 0.2:
                status = 'weak'
            else:
                status = 'missing'
            return {
                'status': status,
                'user_quote': user_quote,
                'coverage_strength': best_score
            }

        # Mentioned in the explanation but absent from every claim:
        # treat as name-dropping (weak, token strength).
        return {
            'status': 'weak',
            'user_quote': None,
            'coverage_strength': 0.1
        }

    def _calculate_coverage_score(self, node_coverage: Dict, canonical_graph: Dict) -> float:
        """Calculate the overall coverage score (0-100), weighted by node type.

        Prerequisite nodes count double; weak coverage earns 40% of a node's
        weight; missing nodes earn nothing.
        """
        if not node_coverage:
            return 0.0

        total_weight = 0.0
        covered_weight = 0.0
        for node in canonical_graph['nodes']:
            node_id = node['id']
            # Prerequisites are more important than ordinary components.
            weight = 2.0 if node.get('type') == 'prerequisite' else 1.0
            total_weight += weight

            coverage = node_coverage.get(node_id, {})
            status = coverage.get('status', 'missing')
            if status == 'covered':
                covered_weight += weight
            elif status == 'weak':
                covered_weight += weight * 0.4

        return (covered_weight / total_weight * 100) if total_weight > 0 else 0.0

    def _detect_name_dropping(self, claim_texts: List[str], node_coverage: Dict) -> List[str]:
        """Return up to 3 user quotes where a concept was mentioned but barely explained.

        A quote qualifies when its node's coverage strength is below 0.3 and a
        user quote exists. (The original duplicated the user_quote check; the
        redundant inner condition has been removed — results are identical.)
        """
        name_dropped = [
            coverage['user_quote']
            for coverage in node_coverage.values()
            if coverage.get('coverage_strength', 0) < 0.3 and coverage.get('user_quote')
        ]
        return name_dropped[:3]  # Limit to top 3