Spaces:
Running
Running
"""
Knowledge Universe — Coverage Confidence Score (Blend Mode — John's Optimization)
==================================================================================

JOHN'S OPTIMIZATION:
Added compute_from_embeddings() — accepts pre-computed embeddings from the
reranker so we never call model.encode() twice in the same request.

Before: 2x model.encode() = ~800ms overhead
After:  1x model.encode() + cosine similarity = ~10ms overhead

The API is unchanged. compute() still works for standalone use.
compute_from_embeddings() is the fast path used by request_handler.
"""
| import logging | |
| from typing import List, Dict, Any, Optional | |
| logger = logging.getLogger(__name__) | |
# Thresholds on the 0-1 weighted cosine-similarity score used by
# _build_response(): >= 0.65 labels "high", >= 0.45 "medium", else "low".
_HIGH_CONFIDENCE = 0.65
_MEDIUM_CONFIDENCE = 0.45
| _ACRONYM_EXPANSIONS = { | |
| "rlhf": "reinforcement learning from human feedback", | |
| "llm": "large language model", | |
| "rag": "retrieval augmented generation", | |
| "nlp": "natural language processing", | |
| "gnn": "graph neural network", | |
| "cnn": "convolutional neural network", | |
| "rnn": "recurrent neural network", | |
| "lstm": "long short-term memory", | |
| "bert": "bidirectional encoder representations transformers", | |
| "gpt": "generative pre-trained transformer", | |
| "vae": "variational autoencoder", | |
| "gan": "generative adversarial network", | |
| "moe": "mixture of experts", | |
| "peft": "parameter efficient fine tuning", | |
| "lora": "low rank adaptation", | |
| "dpo": "direct preference optimization", | |
| "sft": "supervised fine tuning", | |
| "mlops": "machine learning operations", | |
| } | |
# Hand-curated refinements for common queries.  Keys are matched as
# substrings of the lower-cased query by _suggest_queries(); the first
# matching key wins, and this table is consulted before
# DOMAIN_QUERY_PATTERNS.
_QUERY_REFINEMENTS = {
    "transformer architecture": [
        "attention mechanism self-attention explained",
        "transformer encoder decoder tutorial",
        "attention is all you need paper explained",
    ],
    "machine learning": [
        "machine learning beginner tutorial python",
        "supervised learning algorithms explained",
        "machine learning fundamentals course",
    ],
    "deep learning": [
        "neural network backpropagation explained",
        "deep learning pytorch tutorial beginner",
        "convolutional neural network image classification",
    ],
    "rlhf": [
        "reinforcement learning from human feedback explained",
        "reward model training language model",
        "RLHF PPO implementation tutorial",
    ],
    "mixture of experts": [
        "sparse mixture of experts architecture explained",
        "MoE gating network routing mechanism",
        "mixture of experts transformer tutorial",
    ],
}
# Domain-aware refinement templates used by _suggest_queries().
# Keys are tuples of trigger keywords: if ANY keyword occurs as a substring
# of the lower-cased query, that entry's templates are filled in via
# "{topic}" (see _extract_topic).  Dict order matters — the caller breaks
# at the first matching entry.
# NOTE(review): "quantum" appears both here and in the materials-science
# entry below; because matching stops at the first hit, the later
# occurrence can never trigger on its own.
DOMAIN_QUERY_PATTERNS = {
    # Physics/quantum — suggest arXiv-specific terms
    ("quantum", "entanglement", "qubit", "hamiltonian", "decoherence"): [
        "{topic} arxiv review",
        "{topic} lecture notes pdf",
        "{topic} error analysis",
    ],
    # ML frameworks — suggest code and docs
    ("langchain", "llamaindex", "langgraph", "haystack", "dspy"): [
        "{topic} github example",
        "{topic} documentation tutorial",
        "{topic} cookbook python",
    ],
    # RAG/retrieval — suggest implementation
    ("rag", "retrieval", "vector", "embedding", "chunking"): [
        "{topic} implementation guide",
        "{topic} evaluation metrics",
        "production {topic} architecture",
    ],
    # Finance/trading
    ("fintech", "trading", "portfolio", "risk", "quant"): [
        "{topic} research paper 2025",
        "{topic} python implementation",
        "{topic} backtesting guide",
    ],
    # Deep learning techniques
    ("transformer", "attention", "bert", "gpt", "llm", "fine-tuning"): [
        "{topic} paper explained",
        "{topic} implementation pytorch",
        "{topic} benchmark comparison",
    ],
    # MLOps/deployment
    ("mlops", "deployment", "inference", "serving", "vllm"): [
        "{topic} production guide",
        "{topic} kubernetes setup",
        "{topic} optimization tutorial",
    ],
    # Materials science / physics
    ("superconductor", "lk-99", "quantum", "material", "semiconductor",
     "perovskite", "graphene", "topological"): [
        "{topic} arxiv preprint 2024",
        "{topic} experimental results review",
        "{topic} physics mechanism explained",
    ],
    # Neuroscience / biology
    ("neural", "neuron", "brain", "cortex", "synapse"): [
        "{topic} research paper",
        "{topic} computational model",
        "{topic} biological mechanism",
    ],
}
class CoverageConfidenceScorer:
    """
    Computes how well returned results match the query intent.

    Fast path: compute_from_embeddings() — reuses embeddings already
    computed by the reranker (zero additional model.encode() calls).
    Slow path: compute() — encodes from scratch (standalone use only).
    """

    def __init__(self):
        # Embedding model, loaded lazily by _get_model() (slow path only).
        self._model = None

    def _get_model(self):
        """Return the shared embedding model, loading it on first use.

        Returns:
            The shared model instance, or None if it could not be obtained
            (the failure is logged, not raised).
        """
        if self._model is None:
            try:
                from src.integrations.shared_model import get_shared_model
                self._model = get_shared_model()
            except Exception as e:
                logger.error(f"Failed to get shared model: {e}")
                return None
        return self._model

    def compute_from_embeddings(
        self,
        query: str,
        sources: List[Dict[str, Any]],
        query_emb,
        doc_embs,
        top_k: int = 5,
    ) -> Dict[str, Any]:
        """
        FAST PATH — John's optimization.

        Uses pre-computed embeddings from the reranker; no additional
        model.encode() calls (cost is just the cosine similarity, ~5ms).

        Args:
            query: Original query string.
            sources: Result dicts (same order as doc_embs).
            query_emb: Tensor from the reranker's model.encode(query).
            doc_embs: Tensor from the reranker's model.encode(texts).
            top_k: How many of the top results to score.

        Returns:
            Response dict (see _build_response); a "none" response when
            sources is empty, "unavailable" when embeddings are missing.
        """
        if not sources:
            return self._no_results_response(query)
        if query_emb is None or doc_embs is None:
            return self._unavailable_response()
        try:
            from sentence_transformers import util
            # doc_embs may contain embeddings for all documents — score only
            # top_k rows, and never more rows than we have sources for.
            # (Fixes a potential IndexError when doc_embs is longer than
            # sources, and a ZeroDivisionError when it is empty.)
            n = min(top_k, len(sources), len(doc_embs))
            if n == 0:
                return self._unavailable_response()
            top_sources = sources[:n]
            sims = util.cos_sim(query_emb, doc_embs[:n])[0]
            # Clamp negatives to 0 so dissimilar results don't push the
            # weighted average below zero.
            sim_scores = [max(0.0, float(s)) for s in sims]
            # Harmonic weights — higher-ranked results count more.
            weights = [1.0 / (i + 1) for i in range(n)]
            total_w = sum(weights)
            confidence = round(
                sum(s * w for s, w in zip(sim_scores, weights)) / total_w, 3
            )
            per_result = [
                {
                    "title": src.get("title", "")[:60],
                    "similarity": round(score, 3),
                }
                for src, score in zip(top_sources, sim_scores)
            ]
            return self._build_response(query, confidence, per_result)
        except Exception as e:
            logger.error(f"Fast confidence scoring failed: {e}")
            return self._unavailable_response()

    def compute(
        self,
        query: str,
        sources: List[Dict[str, Any]],
        top_k: int = 5,
    ) -> Dict[str, Any]:
        """
        SLOW PATH — standalone use only.

        Encodes the query and documents from scratch, then delegates to
        compute_from_embeddings(). Use the fast path in production.
        """
        if not sources:
            return self._no_results_response(query)
        model = self._get_model()
        if model is None:
            return self._unavailable_response()
        try:
            # (Removed an unused `from sentence_transformers import util`
            # that the original imported but never referenced here.)
            query_emb = model.encode(query, convert_to_tensor=True)
            top_sources = sources[:top_k]
            # Title plus a short summary snippet gives the embedding enough
            # context without paying for full-document encoding.
            texts = [
                f"{s.get('title', '')} {(s.get('summary') or '')[:200]}"
                for s in top_sources
            ]
            doc_embs = model.encode(texts, convert_to_tensor=True)
            return self.compute_from_embeddings(
                query, top_sources, query_emb, doc_embs, top_k
            )
        except Exception as e:
            logger.error(f"Confidence scoring failed: {e}")
            return self._unavailable_response()

    def _build_response(
        self,
        query: str,
        confidence: float,
        per_result: List[Dict],
    ) -> Dict[str, Any]:
        """Assemble the response dict for a computed confidence score.

        Labels the score via the module-level thresholds and attaches
        refinement suggestions (2 at high confidence, 3 otherwise).
        """
        # Always generate suggestions — useful at all confidence levels.
        all_suggestions = self._suggest_queries(query)
        if confidence >= _HIGH_CONFIDENCE:
            label = "high"
            warning = False
            message = None
            # Still provide 2 suggestions for refinement at high confidence.
            suggestions = all_suggestions[:2]
        elif confidence >= _MEDIUM_CONFIDENCE:
            label = "medium"
            warning = False
            message = (
                "Results partially match your query. "
                "Consider refining with more specific terms."
            )
            suggestions = all_suggestions[:3]
        else:
            label = "low"
            warning = True
            message = (
                f"Low confidence ({confidence:.2f}) — results may not fully "
                "match your query intent. Try the suggested queries below."
            )
            suggestions = all_suggestions[:3]
        return {
            "confidence": confidence,
            "confidence_label": label,
            "coverage_warning": warning,
            "warning_message": message,
            "suggested_queries": suggestions,
            "top_result_similarities": per_result,
        }

    def _suggest_queries(self, query: str) -> List[str]:
        """
        Domain-aware query refinement instead of generic 'explained/tutorial'.

        Priority: explicit _QUERY_REFINEMENTS, then DOMAIN_QUERY_PATTERNS,
        then acronym expansion; generic fallback only if nothing matched.
        Returns at most 3 unique suggestions, never echoing the query itself.
        """
        suggestions: List[str] = []
        query_lower = query.lower().strip()
        # 1. Explicit query refinements (highest priority).
        for pattern, refinements in _QUERY_REFINEMENTS.items():
            if pattern in query_lower:
                suggestions.extend(refinements)
                break
        # 2. Domain-aware patterns (first matching domain wins).
        if not suggestions:
            for domain_keywords, patterns in DOMAIN_QUERY_PATTERNS.items():
                if any(kw in query_lower for kw in domain_keywords):
                    topic = _extract_topic(query_lower, domain_keywords)
                    for pattern in patterns:
                        filled = pattern.format(topic=topic)
                        if filled not in suggestions:
                            suggestions.append(filled)
                    break
        # 3. Acronym expansion — rebuild from words so only whole-word
        # matches are replaced (str.replace also hit substrings, e.g.
        # "rag" inside "storage").
        words = query_lower.split()
        for word in words:
            if word in _ACRONYM_EXPANSIONS:
                expanded = _ACRONYM_EXPANSIONS[word]
                expanded_query = " ".join(
                    expanded if w == word else w for w in words
                )
                if expanded_query not in suggestions:
                    suggestions.append(expanded_query)
        # 4. Generic fallback only if nothing matched.
        if not suggestions:
            base = query_lower
            suggestions = [
                f"{base} tutorial 2025",
                f"{base} implementation guide",
                f"introduction to {base}",
            ]
        # De-duplicate, drop the original query, cap at 3.
        seen = {query_lower}
        unique = []
        for s in suggestions:
            if s not in seen:
                seen.add(s)
                unique.append(s)
        return unique[:3]

    def _no_results_response(self, query: str) -> Dict[str, Any]:
        """Response for an empty result set: zero confidence plus a warning."""
        return {
            "confidence": 0.0,
            "confidence_label": "none",
            "coverage_warning": True,
            "warning_message": "No results returned for this query.",
            "suggested_queries": self._suggest_queries(query),
            "top_result_similarities": [],
        }

    def _unavailable_response(self) -> Dict[str, Any]:
        """Response when scoring could not run (no model / no embeddings)."""
        return {
            "confidence": None,
            "confidence_label": "unavailable",
            "coverage_warning": False,
            "warning_message": None,
            "suggested_queries": [],
            "top_result_similarities": [],
        }
| def _extract_topic(query: str, domain_keywords: tuple) -> str: | |
| """Extract topic phrase by removing domain keyword from query.""" | |
| for kw in domain_keywords: | |
| if kw in query: | |
| # Return query with the matched keyword stripped of stop words | |
| return query.strip() | |
| return query.strip() |