# File size: 7,781 Bytes
# b0b150b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""
MEXAR - Source Attribution Module
Links each sentence in the answer to its supporting source chunk.
Provides inline citations for full transparency.
"""
import re
import logging
from typing import List, Dict, Tuple, Any
from dataclasses import dataclass, field
import numpy as np

logger = logging.getLogger(__name__)


@dataclass
class AttributedSentence:
    """One answer sentence paired with the source chunk it was matched to."""
    # The sentence exactly as it was extracted from the answer.
    text: str
    # Inline citation marker for the sentence, e.g. "[1]".
    citation: str
    # Identifier of the chunk the sentence was attributed to.
    source_chunk_id: int
    # Leading excerpt of the attributed chunk's content.
    source_preview: str
    # Source label of the attributed chunk.
    source_file: str
    # Similarity score between the sentence and the attributed chunk.
    similarity: float


@dataclass
class AttributedAnswer:
    """Result of attribution: the cited answer plus per-sentence and per-source details."""
    # Answer text with inline citation markers inserted.
    answer_with_citations: str
    # One entry per answer sentence that received an attribution.
    sentences: List[AttributedSentence]
    # One dict per cited source, intended for display.
    sources: List[Dict]


class SourceAttributor:
    """
    Attributes each sentence in an LLM answer to its source chunk.

    This enables:
    1. Inline citations [1], [2], etc.
    2. Verification of claims against source data
    3. Transparency about where information came from

    Chunks may be objects exposing ``content`` / ``id`` / ``source``
    attributes, dicts with those keys, or arbitrary values (which are
    stringified for content and get id 0 and source "unknown").

    NOTE: citation numbers are keyed by chunk id, so chunks that carry no
    id all resolve to id 0 and therefore share a single citation.
    """

    # Sentence boundary: whitespace preceded by '.', '!' or '?'.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    # Sentences with fewer words than this are not attributed.
    _MIN_SENTENCE_WORDS = 4

    # Similarity reported when no embedding comparison could be made.
    _FALLBACK_SIMILARITY = 0.5

    # Number of leading characters of chunk content kept as a preview.
    _PREVIEW_CHARS = 150

    def __init__(self, embedding_model=None):
        """
        Initialize attributor.

        Args:
            embedding_model: Object exposing ``embed(texts)`` that returns an
                iterable with one vector per input text (the original
                documentation names a FastEmbed model). When omitted, every
                sentence is attributed to the first chunk.
        """
        self.embedding_model = embedding_model

    def attribute(
        self,
        answer: str,
        chunks: List[Any],
        chunk_embeddings: List[np.ndarray] = None
    ) -> "AttributedAnswer":
        """
        Attribute each sentence in answer to source chunks.

        Args:
            answer: LLM generated answer
            chunks: Retrieved chunk objects (see class docstring for the
                accepted shapes)
            chunk_embeddings: Pre-computed embeddings aligned with ``chunks``
                (optional). A list or a 2-D array is accepted. If it is
                shorter than ``chunks`` the extra chunks are never compared.

        Returns:
            AttributedAnswer with citations. When ``answer`` or ``chunks`` is
            empty the answer is returned unchanged with no sentences/sources.
        """
        if not answer or not chunks:
            return AttributedAnswer(
                answer_with_citations=answer,
                sentences=[],
                sources=[]
            )

        sentences = self._split_sentences(answer)

        if chunk_embeddings is None:
            if self.embedding_model:
                contents = [self._get_content(c) for c in chunks]
                chunk_embeddings = list(self.embedding_model.embed(contents))
        else:
            # Materialize once so the embeddings can be re-iterated for every
            # sentence (a one-shot iterator would be exhausted after the first).
            chunk_embeddings = list(chunk_embeddings)

        sources_used = {}     # chunk_id -> citation number (insertion ordered)
        first_by_chunk = {}   # chunk_id -> first sentence attributed to it
        attributed_sentences = []

        for sentence in sentences:
            # Skip very short or non-substantive sentences
            if len(sentence.split()) < self._MIN_SENTENCE_WORDS:
                continue

            best_chunk, similarity = self._find_best_source(
                sentence, chunks, chunk_embeddings
            )

            # Reuse the citation number of an already-cited chunk, otherwise
            # hand out the next one.
            chunk_id = self._get_id(best_chunk)
            if chunk_id not in sources_used:
                sources_used[chunk_id] = len(sources_used) + 1
            citation_num = sources_used[chunk_id]

            attributed = AttributedSentence(
                text=sentence,
                citation=f"[{citation_num}]",
                source_chunk_id=chunk_id,
                source_preview=self._get_content(best_chunk)[:self._PREVIEW_CHARS],
                source_file=self._get_source(best_chunk),
                similarity=similarity
            )
            attributed_sentences.append(attributed)
            first_by_chunk.setdefault(chunk_id, attributed)

        answer_with_citations = self._build_cited_answer(answer, attributed_sentences)

        # Citation numbers are assigned in increasing order as chunks are first
        # seen, so iterating the dict already yields sources in citation order.
        # Preview/similarity come from the first sentence that cited the chunk.
        sources = [
            {
                "citation": f"[{num}]",
                "chunk_id": chunk_id,
                "source": first_by_chunk[chunk_id].source_file,
                "preview": first_by_chunk[chunk_id].source_preview,
                "similarity": round(first_by_chunk[chunk_id].similarity, 3)
            }
            for chunk_id, num in sources_used.items()
        ]

        return AttributedAnswer(
            answer_with_citations=answer_with_citations,
            sentences=attributed_sentences,
            sources=sources
        )

    def _split_sentences(self, text: str) -> List[str]:
        """Split text on whitespace that follows '.', '!' or '?'; drop empty pieces."""
        pieces = (piece.strip() for piece in self._SENTENCE_BOUNDARY.split(text))
        return [piece for piece in pieces if piece]

    def _find_best_source(
        self,
        sentence: str,
        chunks: List[Any],
        chunk_embeddings: List[np.ndarray]
    ) -> Tuple[Any, float]:
        """
        Find the chunk most similar to the sentence.

        Args:
            sentence: Sentence to attribute
            chunks: Candidate chunks
            chunk_embeddings: Embeddings aligned with ``chunks`` (list or
                array), or None

        Returns:
            ``(chunk, similarity)``. ``(None, 0.0)`` when there are no chunks;
            ``(chunks[0], 0.5)`` when no model/embeddings are available or
            embedding the sentence fails.
        """
        if not chunks:
            return None, 0.0

        # Explicit None/len checks: the truth value of a multi-element numpy
        # array is ambiguous and raises ValueError.
        if (
            not self.embedding_model
            or chunk_embeddings is None
            or len(chunk_embeddings) == 0
        ):
            return chunks[0], self._FALLBACK_SIMILARITY

        try:
            sentence_emb = next(iter(self.embedding_model.embed([sentence])))

            # Start from the first chunk so a result is returned even when no
            # similarity is positive.
            best_chunk = chunks[0]
            best_sim = 0.0

            for chunk, emb in zip(chunks, chunk_embeddings):
                sim = self._cosine_similarity(sentence_emb, emb)
                if sim > best_sim:
                    best_sim = sim
                    best_chunk = chunk

            return best_chunk, best_sim

        except Exception as e:
            # Attribution is best-effort: fall back instead of failing the answer.
            logger.warning(f"Embedding failed in attribution: {e}")
            return chunks[0], self._FALLBACK_SIMILARITY

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """
        Calculate cosine similarity between two vectors.

        Returns 0.0 when either vector has zero norm or the computation fails
        (for example on mismatched shapes).
        """
        try:
            norm_a = np.linalg.norm(a)
            norm_b = np.linalg.norm(b)
            if norm_a == 0 or norm_b == 0:
                return 0.0
            return float(np.dot(a, b) / (norm_a * norm_b))
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # KeyboardInterrupt/SystemExit still propagate.
            return 0.0

    def _build_cited_answer(
        self,
        answer: str,
        attributed: List["AttributedSentence"]
    ) -> str:
        """
        Insert each sentence's citation right after that sentence in the answer.

        Sentences are located with a forward-moving cursor so a sentence that
        occurs more than once gets its citation on its own occurrence instead
        of all citations piling up on the first one. Sentences not present in
        the answer are ignored.

        Args:
            answer: Original answer text
            attributed: Attributed sentences, normally in answer order

        Returns:
            Answer text with ``" [n]"`` appended after each located sentence.
        """
        insertions = []  # (offset just past the sentence, citation)
        cursor = 0

        for attr in attributed:
            if not attr.text:
                continue
            start = answer.find(attr.text, cursor)
            if start != -1:
                cursor = start + len(attr.text)
            else:
                # Out-of-order input: fall back to searching the whole answer.
                start = answer.find(attr.text)
                if start == -1:
                    continue
            insertions.append((start + len(attr.text), attr.citation))

        # Stable sort keeps the input order for citations at the same offset.
        insertions.sort(key=lambda item: item[0])

        pieces = []
        previous = 0
        for offset, citation in insertions:
            pieces.append(answer[previous:offset])
            pieces.append(f" {citation}")
            previous = offset
        pieces.append(answer[previous:])

        return "".join(pieces)

    def _get_content(self, chunk) -> str:
        """Extract content from chunk object ('' when the content is None)."""
        if hasattr(chunk, 'content'):
            content = chunk.content
        elif isinstance(chunk, dict):
            content = chunk.get('content', '')
        else:
            return str(chunk)
        # Callers slice the result, so never hand back None.
        return "" if content is None else content

    def _get_id(self, chunk) -> int:
        """Extract ID from chunk object (0 when the chunk carries none)."""
        if hasattr(chunk, 'id'):
            return chunk.id
        if isinstance(chunk, dict):
            return chunk.get('id', 0)
        return 0

    def _get_source(self, chunk) -> str:
        """Extract source from chunk object ("unknown" when missing or empty)."""
        if hasattr(chunk, 'source'):
            return chunk.source or "unknown"
        if isinstance(chunk, dict):
            # Same fallback as the attribute path, including source=None.
            return chunk.get('source') or "unknown"
        return "unknown"


def create_source_attributor(embedding_model=None) -> SourceAttributor:
    """Build and return a SourceAttributor that uses the given embedding model."""
    attributor = SourceAttributor(embedding_model)
    return attributor