# NOTE(review): removed stray web-scrape banner ("Spaces: / Running / Running")
# that is not part of the module source.
"""
MEXAR - Source Attribution Module

Links each sentence in the answer to its supporting source chunk.
Provides inline citations for full transparency.
"""
import re
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Tuple, Any

import numpy as np

# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)
@dataclass
class AttributedSentence:
    """A sentence with its source attribution.

    Instances are produced by ``SourceAttributor.attribute()``; the
    ``@dataclass`` decorator is required because the attributor constructs
    them with keyword arguments.
    """

    # The sentence text, taken verbatim from the answer.
    text: str
    # Inline citation marker, e.g. "[1]".
    citation: str
    # ID of the chunk this sentence was attributed to.
    source_chunk_id: int
    # Leading characters of the source chunk's content (preview only).
    source_preview: str
    # Filename of the chunk's source, or "unknown".
    source_file: str
    # Sentence-to-chunk similarity score (0.5 fallback when no embeddings).
    similarity: float
@dataclass
class AttributedAnswer:
    """Complete answer with all attributions.

    The ``@dataclass`` decorator is required because the attributor
    constructs this with keyword arguments.
    """

    # Answer text with "[n]" citation markers inserted after sentences.
    answer_with_citations: str
    # Per-sentence attributions (string forward reference: the
    # AttributedSentence class is declared alongside this one).
    sentences: List["AttributedSentence"]
    # Display-ready records: citation, chunk_id, source, preview, similarity.
    sources: List[Dict]
class SourceAttributor:
    """
    Attributes each sentence in an LLM answer to its source chunk.

    This enables:
    1. Inline citations [1], [2], etc.
    2. Verification of claims against source data
    3. Transparency about where information came from
    """

    def __init__(self, embedding_model=None):
        """
        Initialize attributor.

        Args:
            embedding_model: FastEmbed model for sentence embedding.
                Optional; without it, attribution falls back to the first
                retrieved chunk with a fixed 0.5 similarity.
        """
        self.embedding_model = embedding_model

    def attribute(
        self,
        answer: str,
        chunks: List[Any],
        chunk_embeddings: List[np.ndarray] = None
    ) -> "AttributedAnswer":
        """
        Attribute each sentence in answer to source chunks.

        Args:
            answer: LLM generated answer.
            chunks: Retrieved DocumentChunk objects (or dicts with
                'content' / 'id' / 'source' keys).
            chunk_embeddings: Pre-computed embeddings (optional; computed
                on the fly when an embedding model is configured).

        Returns:
            AttributedAnswer with inline citations and a sources list.
        """
        if not answer or not chunks:
            # Nothing to attribute: return the answer unchanged.
            return AttributedAnswer(
                answer_with_citations=answer,
                sentences=[],
                sources=[]
            )

        # Split answer into sentences.
        sentences = self._split_sentences(answer)

        # Compute chunk embeddings lazily if not provided.
        if chunk_embeddings is None and self.embedding_model:
            contents = [self._get_content(c) for c in chunks]
            chunk_embeddings = list(self.embedding_model.embed(contents))

        # chunk_id -> citation number, numbered in order of first use.
        sources_used = {}
        attributed_sentences = []

        for sentence in sentences:
            # Skip very short or non-substantive sentences.
            if len(sentence.split()) < 4:
                continue

            # Find best matching chunk for this sentence.
            best_chunk, similarity = self._find_best_source(
                sentence, chunks, chunk_embeddings
            )

            # Assign a stable citation number per chunk.
            # NOTE(review): chunks lacking an 'id' all map to 0 via
            # _get_id and therefore share a single citation — confirm
            # upstream chunks always carry unique IDs.
            chunk_id = self._get_id(best_chunk)
            if chunk_id not in sources_used:
                sources_used[chunk_id] = len(sources_used) + 1
            citation_num = sources_used[chunk_id]

            attributed_sentences.append(AttributedSentence(
                text=sentence,
                citation=f"[{citation_num}]",
                source_chunk_id=chunk_id,
                source_preview=self._get_content(best_chunk)[:150],
                source_file=self._get_source(best_chunk),
                similarity=similarity
            ))

        # Build answer with inline citations.
        answer_with_citations = self._build_cited_answer(answer, attributed_sentences)

        # Build sources list for display, ordered by citation number.
        sources = []
        for chunk_id, num in sorted(sources_used.items(), key=lambda x: x[1]):
            # Find one attributed sentence for this chunk to pull
            # source/preview/similarity from.
            attr = next((a for a in attributed_sentences if a.source_chunk_id == chunk_id), None)
            if attr:
                sources.append({
                    "citation": f"[{num}]",
                    "chunk_id": chunk_id,
                    "source": attr.source_file,
                    "preview": attr.source_preview,
                    "similarity": round(attr.similarity, 3)
                })

        return AttributedAnswer(
            answer_with_citations=answer_with_citations,
            sentences=attributed_sentences,
            sources=sources
        )

    def _split_sentences(self, text: str) -> List[str]:
        """Split text into sentences.

        Splits on sentence-ending punctuation (., !, ?) followed by
        whitespace; empty fragments are dropped.
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _find_best_source(
        self,
        sentence: str,
        chunks: List[Any],
        chunk_embeddings: List[np.ndarray]
    ) -> Tuple[Any, float]:
        """Find the chunk most similar to the sentence.

        Returns:
            (chunk, similarity). Falls back to (first chunk, 0.5) when no
            embedding model/embeddings are available or embedding fails,
            and (None, 0.0) when there are no chunks at all.
        """
        if not chunks:
            return None, 0.0

        # Default to first chunk if we cannot compute similarities.
        if not self.embedding_model or not chunk_embeddings:
            return chunks[0], 0.5

        try:
            # Embed the sentence (model yields a generator; take item 0).
            sentence_emb = list(self.embedding_model.embed([sentence]))[0]

            # Linear scan for the highest-cosine chunk.
            best_chunk = chunks[0]
            best_sim = 0.0
            for chunk, emb in zip(chunks, chunk_embeddings):
                sim = self._cosine_similarity(sentence_emb, emb)
                if sim > best_sim:
                    best_sim = sim
                    best_chunk = chunk
            return best_chunk, best_sim
        except Exception as e:
            # Best-effort: attribution degrades to the first chunk rather
            # than failing the whole answer.
            logger.warning(f"Embedding failed in attribution: {e}")
            return chunks[0], 0.5

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Calculate cosine similarity between two vectors (0.0 on error)."""
        try:
            dot = np.dot(a, b)
            norm_a = np.linalg.norm(a)
            norm_b = np.linalg.norm(b)
            # Guard against division by zero for zero-length vectors.
            if norm_a == 0 or norm_b == 0:
                return 0.0
            return float(dot / (norm_a * norm_b))
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # still propagate. Mismatched shapes/dtypes simply yield 0.0.
            return 0.0

    def _build_cited_answer(
        self,
        answer: str,
        attributed: List["AttributedSentence"]
    ) -> str:
        """Insert citations after their sentences in the answer text."""
        result = answer
        # Process in reverse order so earlier replacements don't shift
        # the positions of later sentences.
        for attr in reversed(attributed):
            # Append the citation after the first occurrence of the sentence.
            if attr.text in result:
                result = result.replace(
                    attr.text,
                    f"{attr.text} {attr.citation}",
                    1  # Only replace first occurrence
                )
        return result

    def _get_content(self, chunk) -> str:
        """Extract content from a chunk object, dict, or raw value."""
        if hasattr(chunk, 'content'):
            return chunk.content
        elif isinstance(chunk, dict):
            return chunk.get('content', '')
        return str(chunk)

    def _get_id(self, chunk) -> int:
        """Extract ID from a chunk object or dict (0 when absent)."""
        if hasattr(chunk, 'id'):
            return chunk.id
        elif isinstance(chunk, dict):
            return chunk.get('id', 0)
        return 0

    def _get_source(self, chunk) -> str:
        """Extract source filename from a chunk ("unknown" when absent)."""
        if hasattr(chunk, 'source'):
            return chunk.source or "unknown"
        elif isinstance(chunk, dict):
            return chunk.get('source', 'unknown')
        return "unknown"
def create_source_attributor(embedding_model=None) -> SourceAttributor:
    """Build and return a :class:`SourceAttributor`.

    Convenience factory mirroring the constructor signature.
    """
    attributor = SourceAttributor(embedding_model=embedding_model)
    return attributor