Spaces:
Running
Running
feat: implement semantic TL;DR citations and live image support
Browse files
src/core/domain/schemas.py
CHANGED
|
@@ -13,10 +13,14 @@ class SourceDocument(BaseModel):
|
|
| 13 |
content: str
|
| 14 |
metadata: dict
|
| 15 |
score: float
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
class ChatResponse(BaseModel):
|
| 18 |
answer: str
|
| 19 |
sources: List[SourceDocument]
|
|
|
|
| 20 |
session_id: str = "anonymous"
|
| 21 |
|
| 22 |
class FeedbackRequest(BaseModel):
|
|
|
|
| 13 |
content: str
|
| 14 |
metadata: dict
|
| 15 |
score: float
|
| 16 |
+
snippet: Optional[str] = None
|
| 17 |
+
image_url: Optional[str] = None
|
| 18 |
+
citation_index: Optional[int] = None
|
| 19 |
|
| 20 |
class ChatResponse(BaseModel):
|
| 21 |
answer: str
|
| 22 |
sources: List[SourceDocument]
|
| 23 |
+
follow_up_questions: List[str] = []
|
| 24 |
session_id: str = "anonymous"
|
| 25 |
|
| 26 |
class FeedbackRequest(BaseModel):
|
src/core/use_cases/rag_chat_use_case.py
CHANGED
|
@@ -131,10 +131,37 @@ Document:
|
|
| 131 |
|
| 132 |
context_text += formatted + "\n\n"
|
| 133 |
total_tokens += tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
filtered_sources.append(doc)
|
| 135 |
|
| 136 |
return context_text, filtered_sources
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
|
| 139 |
"""
|
| 140 |
Single LLM call: query understanding + multilingual translation.
|
|
@@ -954,6 +981,9 @@ Answer:"""
|
|
| 954 |
# ── Attach citation index to each source for frontend rendering ───────
|
| 955 |
for idx, doc in enumerate(final_sources, 1):
|
| 956 |
doc["citation_index"] = idx
|
|
|
|
|
|
|
|
|
|
| 957 |
|
| 958 |
result = {
|
| 959 |
"answer": answer,
|
|
|
|
| 131 |
|
| 132 |
context_text += formatted + "\n\n"
|
| 133 |
total_tokens += tokens
|
| 134 |
+
|
| 135 |
+
# ── ENHANCEMENT: Extract Semantic TL;DR Highlight ──
|
| 136 |
+
doc["highlight"] = self._extract_highlight_sentence(query, content)
|
| 137 |
+
doc["image_url"] = doc.get("image_url") or metadata.get("image_url") or metadata.get("url_to_image")
|
| 138 |
+
|
| 139 |
filtered_sources.append(doc)
|
| 140 |
|
| 141 |
return context_text, filtered_sources
|
| 142 |
|
| 143 |
+
def _extract_highlight_sentence(self, query: str, content: str) -> str:
|
| 144 |
+
"""
|
| 145 |
+
Extracts the single most relevant sentence from the content for hoverable citations.
|
| 146 |
+
Uses a simple sentence splitter and keyword overlap for speed.
|
| 147 |
+
"""
|
| 148 |
+
import re
|
| 149 |
+
sentences = re.split(r'(?<=[.!?])\s+', content)
|
| 150 |
+
if not sentences: return content[:150] + "..."
|
| 151 |
+
|
| 152 |
+
query_terms = set(query.lower().split())
|
| 153 |
+
best_sentence = sentences[0]
|
| 154 |
+
max_overlap = -1
|
| 155 |
+
|
| 156 |
+
for s in sentences:
|
| 157 |
+
if len(s) < 20: continue
|
| 158 |
+
overlap = len(set(s.lower().split()) & query_terms)
|
| 159 |
+
if overlap > max_overlap:
|
| 160 |
+
max_overlap = overlap
|
| 161 |
+
best_sentence = s
|
| 162 |
+
|
| 163 |
+
return best_sentence.strip()[:250]
|
| 164 |
+
|
| 165 |
def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
|
| 166 |
"""
|
| 167 |
Single LLM call: query understanding + multilingual translation.
|
|
|
|
| 981 |
# ── Attach citation index to each source for frontend rendering ───────
|
| 982 |
for idx, doc in enumerate(final_sources, 1):
|
| 983 |
doc["citation_index"] = idx
|
| 984 |
+
# Ensure snippet exists for the TL;DR hover
|
| 985 |
+
if not doc.get("snippet"):
|
| 986 |
+
doc["snippet"] = doc.get("highlight", doc.get("content", "")[:200])
|
| 987 |
|
| 988 |
result = {
|
| 989 |
"answer": answer,
|
src/infrastructure/adapters/duckduckgo_adapter.py
CHANGED
|
@@ -339,6 +339,8 @@ class DuckDuckGoAdapter:
|
|
| 339 |
# Calculate freshness score (live results are freshest)
|
| 340 |
freshness_score = self._calculate_freshness(published_at)
|
| 341 |
|
|
|
|
|
|
|
| 342 |
return {
|
| 343 |
"title": title,
|
| 344 |
"url": url,
|
|
@@ -346,6 +348,7 @@ class DuckDuckGoAdapter:
|
|
| 346 |
"snippet": snippet,
|
| 347 |
"source": source or self._extract_domain(url),
|
| 348 |
"published_at": published_at,
|
|
|
|
| 349 |
"source_type": "live",
|
| 350 |
"is_live": True,
|
| 351 |
"freshness_score": freshness_score,
|
|
@@ -355,6 +358,7 @@ class DuckDuckGoAdapter:
|
|
| 355 |
"url": url,
|
| 356 |
"source": source,
|
| 357 |
"published_at": published_at,
|
|
|
|
| 358 |
"search_engine": "duckduckgo"
|
| 359 |
}
|
| 360 |
}
|
|
|
|
| 339 |
# Calculate freshness score (live results are freshest)
|
| 340 |
freshness_score = self._calculate_freshness(published_at)
|
| 341 |
|
| 342 |
+
image_url = raw_result.get("image") or raw_result.get("thumbnail")
|
| 343 |
+
|
| 344 |
return {
|
| 345 |
"title": title,
|
| 346 |
"url": url,
|
|
|
|
| 348 |
"snippet": snippet,
|
| 349 |
"source": source or self._extract_domain(url),
|
| 350 |
"published_at": published_at,
|
| 351 |
+
"image_url": image_url,
|
| 352 |
"source_type": "live",
|
| 353 |
"is_live": True,
|
| 354 |
"freshness_score": freshness_score,
|
|
|
|
| 358 |
"url": url,
|
| 359 |
"source": source,
|
| 360 |
"published_at": published_at,
|
| 361 |
+
"image_url": image_url,
|
| 362 |
"search_engine": "duckduckgo"
|
| 363 |
}
|
| 364 |
}
|