"""
Compose a grounded answer from retrieved chunks with verbatim quotes + citations.
This module defines a *deterministic, reproducible* pipeline that never invents facts.
"""
from typing import List, Dict
import re
STOP_WORDS = {
"the", "and", "a", "an", "of", "to", "in", "for", "on", "at", "with",
"about", "by", "from", "is", "it", "this", "that", "these", "those",
"be", "was", "were", "am", "are", "as", "or", "if", "but", "what",
"which", "who", "whom", "when", "where", "why", "how", "does", "do",
"did", "can", "could", "would", "should", "may", "might", "so"
}
def _tokenize(text: str) -> List[str]:
"""Lowercase tokenization with stop-word removal."""
tokens = re.findall(r"\b\w+\b", text.lower())
return [tok for tok in tokens if tok not in STOP_WORDS]
def segment_sentences(text: str) -> List[str]:
    """
    Split *text* into sentences on punctuation boundaries (. ! ?).

    Each returned sentence keeps its terminal punctuation.  Segments
    shorter than 20 characters are treated as low-signal and dropped.
    Trailing text after the final punctuation mark (or text with no
    punctuation at all) is kept when long enough.  Falls back to the
    whole text when nothing qualifies, so callers always get at least
    one segment.
    """
    # The capture group keeps the punctuation runs in the result, so the
    # list alternates text/punctuation and always ends with a text part.
    parts = re.split(r'([.!?]+)', text)
    result = []
    for i in range(0, len(parts) - 1, 2):
        # Re-attach the punctuation run to the sentence that precedes it.
        sentence = (parts[i] + parts[i + 1]).strip()
        if len(sentence) >= 20:  # drop fragments too short to be evidence
            result.append(sentence)
    # Bug fix: the original pair loop silently dropped any trailing text
    # that had no closing punctuation; keep it when it is long enough.
    tail = parts[-1].strip()
    if len(tail) >= 20:
        result.append(tail)
    return result if result else [text]  # never return an empty list
def score_sentence(query: str, sentence: str, sent_vec=None) -> float:  # sent_vec reserved for future embedding-based scoring
    """
    Rate how strongly *sentence* supports *query*, on a 0-1 scale.

    Blends token coverage and precision with small heuristic bonuses
    (exact-phrase hit, proper-noun hits, comfortable length) and
    multiplicative penalties (name-only matches, missing focus terms).
    Purely lexical so results stay deterministic and offline.
    """
    q_tokens = set(_tokenize(query))
    s_tokens = set(_tokenize(sentence))
    if not q_tokens or not s_tokens:
        return 0.0

    shared = q_tokens & s_tokens
    coverage = len(shared) / len(q_tokens)   # fraction of the query answered
    precision = len(shared) / len(s_tokens)  # how on-topic the sentence is

    sentence_lc = sentence.lower()

    # Bonus when the sentence literally contains the normalized question.
    cleaned_query = re.sub(r"[^a-z0-9\s]", " ", query.lower()).strip()
    phrase_bonus = 0.2 if cleaned_query and cleaned_query in sentence_lc else 0.0

    # Bonus for capitalized query words (character names, places, ...).
    propers = {word.lower() for word in re.findall(r"\b[A-Z][a-z]+\b", query)}
    hits = sum(1 for name in propers if name in sentence_lc)
    proper_bonus = min(0.2, hits * 0.05) if hits else 0.0

    # Penalize sentences whose only overlap with the query is a name.
    topical = q_tokens - propers
    if shared and topical and not (shared & topical):
        name_only_penalty = 0.4
    else:
        name_only_penalty = 1.0

    # Longer query words (>= 4 chars) carry the topic; missing all of them
    # is penalized but not fatal.
    focus_pool = topical if topical else q_tokens
    focus_terms = {tok for tok in focus_pool if len(tok) >= 4}
    focus_penalty = 0.3 if focus_terms and not (focus_terms & s_tokens) else 1.0

    # Mild preference for roughly tweet-sized evidence.
    length_bonus = 0.1 if 60 <= len(sentence) <= 280 else 0.0

    raw = coverage * 0.6 + precision * 0.3 + phrase_bonus + proper_bonus + length_bonus
    return max(0.0, min(1.0, raw * name_only_penalty * focus_penalty))
def select_quotes(query: str, retrieved: List[Dict], n: int = 3) -> List[Dict]:
    """
    Pick up to *n* well-scoring, mutually diverse quotes from *retrieved*.

    Each chunk is segmented into sentences, every sentence is scored
    against the query, weak candidates are filtered out, and near-
    duplicate sentences from the same chunk are skipped.  Two fallback
    layers guarantee a non-empty result whenever any chunk has text.
    """
    min_score = 0.05  # permissive floor so borderline evidence survives

    candidates = []
    for chunk in retrieved:
        chunk_text = chunk.get('text', '')
        if not chunk_text:
            continue
        for sent in segment_sentences(chunk_text):
            sent_score = score_sentence(query, sent)
            if sent_score >= min_score:
                candidates.append({
                    'text': sent.strip(),
                    'score': sent_score,
                    'chunk_id': chunk.get('chunk_id', ''),
                    'cite': chunk.get('meta', {}),
                })

    # Best evidence first (stable sort keeps retrieval order for ties).
    candidates.sort(key=lambda c: c['score'], reverse=True)

    def _near_duplicate(first: str, second: str) -> bool:
        """Jaccard token similarity above 0.8 counts as a duplicate."""
        tokens_a, tokens_b = set(_tokenize(first)), set(_tokenize(second))
        if not tokens_a or not tokens_b:
            return False
        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b) > 0.8

    chosen: List[Dict] = []
    for candidate in candidates:
        if len(chosen) >= n:
            break
        duplicate = any(
            candidate['chunk_id'] == kept['chunk_id']
            and _near_duplicate(candidate['text'], kept['text'])
            for kept in chosen
        )
        if not duplicate:
            chosen.append(candidate)

    # Fallback 1: scoring filtered everything -- take the best regardless.
    if not chosen and candidates:
        chosen = candidates[:n]

    # Fallback 2: no scored sentences at all -- lead sentence of each chunk.
    if not chosen and retrieved:
        for chunk in retrieved[:n]:
            chunk_text = chunk.get('text', '')
            if chunk_text:
                sentences = segment_sentences(chunk_text)
                if sentences:
                    chosen.append({
                        'text': sentences[0].strip(),
                        'score': 0.1,  # nominal score: included without evidence of fit
                        'chunk_id': chunk.get('chunk_id', ''),
                        'cite': chunk.get('meta', {}),
                    })
    return chosen[:n]
def synthesize_answer(query: str, quotes: List[Dict]) -> str:
    """
    Compose a short, deterministic answer that cites the selected quotes.

    Builds an intro that restates the question, one numbered bullet per
    quote (whitespace-collapsed, trimmed to ~200 chars, with a
    book/paragraph location when the citation metadata is complete),
    and a fixed closing line.  Returns a fallback message when *quotes*
    is empty.
    """
    if not quotes:
        return "I couldn't find relevant information to answer this question."
    query_clean = query.strip().rstrip("?")
    # Bug fix: the characters wrapping the restated question were mojibake
    # ("β") in the original; use proper typographic quotes.
    intro = f"Here's what the text says about \u201c{query_clean}\u201d:"
    bullet_lines = []
    for i, quote in enumerate(quotes, 1):
        # Collapse internal whitespace and cap the excerpt length.
        text = " ".join(quote['text'].split())
        if len(text) > 200:
            text = text[:200].rstrip() + "..."
        cite = quote.get('cite') or {}
        location = ""
        if cite:
            book = cite.get('book')
            para_start = cite.get('para_idx_start')
            para_end = cite.get('para_idx_end')
            # Only render a location when the citation is complete.
            if book and para_start is not None and para_end is not None:
                location = f" ({book.title()}, paragraphs {para_start}-{para_end})"
        bullet_lines.append(f"[{i}] {text}{location}")
    body = "\n".join(bullet_lines)
    closing = "Together these cited passages directly answer the question."
    return f"{intro}\n\n{body}\n\n{closing}"
def render_citations(quotes: List[Dict]) -> List[str]:
    """
    Render a citations block for the UI, one line per quote.

    Format: "[n] snippet \u2014 Book, paragraphs start-end".  Snippets are
    truncated to ~200 characters with an ellipsis; missing metadata
    falls back to 'unknown' / '?'.
    """
    citations = []
    for i, quote in enumerate(quotes, 1):
        text = quote['text']
        # Keep snippets readable: cap at ~200 chars with ellipses.
        if len(text) > 200:
            text = text[:200] + "..."
        # Bug fix: guard against cite being None (consistent with
        # synthesize_answer) and against a None 'book' value, which
        # would crash on .title().
        cite = quote.get('cite') or {}
        book = cite.get('book') or 'unknown'
        para_start = cite.get('para_idx_start', '?')
        para_end = cite.get('para_idx_end', '?')
        # Bug fix: the separator was mojibake ("β"); use an em dash.
        citation = f"[{i}] {text} \u2014 {book.title()}, paragraphs {para_start}-{para_end}"
        citations.append(citation)
    return citations
def compose_answer(query: str, retrieved: List[Dict], max_quotes: int = 3) -> Dict:
    """
    Top-level composition entrypoint used by the app layer.

    Returns a structured payload for the UI with 'answer' (synthesized
    text), 'quotes' (selected evidence dicts), and 'references'
    (rendered citation strings).  Empty retrieval yields a fixed
    apology payload.
    """
    if not retrieved:
        return {
            'answer': "I couldn't find any relevant information to answer this question.",
            'quotes': [],
            'references': []
        }
    quotes = select_quotes(query, retrieved, n=max_quotes)
    return {
        'answer': synthesize_answer(query, quotes),
        'quotes': quotes,
        'references': render_citations(quotes)
    }