Peterase committed on
Commit
d8f8038
·
1 Parent(s): 4ce2e2e

feat: implement semantic TL;DR citations and live image support

Browse files
src/core/domain/schemas.py CHANGED
@@ -13,10 +13,14 @@ class SourceDocument(BaseModel):
13
  content: str
14
  metadata: dict
15
  score: float
 
 
 
16
 
17
  class ChatResponse(BaseModel):
18
  answer: str
19
  sources: List[SourceDocument]
 
20
  session_id: str = "anonymous"
21
 
22
  class FeedbackRequest(BaseModel):
 
13
  content: str
14
  metadata: dict
15
  score: float
16
+ snippet: Optional[str] = None
17
+ image_url: Optional[str] = None
18
+ citation_index: Optional[int] = None
19
 
20
  class ChatResponse(BaseModel):
21
  answer: str
22
  sources: List[SourceDocument]
23
+ follow_up_questions: List[str] = []
24
  session_id: str = "anonymous"
25
 
26
  class FeedbackRequest(BaseModel):
src/core/use_cases/rag_chat_use_case.py CHANGED
@@ -131,10 +131,37 @@ Document:
131
 
132
  context_text += formatted + "\n\n"
133
  total_tokens += tokens
 
 
 
 
 
134
  filtered_sources.append(doc)
135
 
136
  return context_text, filtered_sources
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
139
  """
140
  Single LLM call: query understanding + multilingual translation.
@@ -954,6 +981,9 @@ Answer:"""
954
  # ── Attach citation index to each source for frontend rendering ───────
955
  for idx, doc in enumerate(final_sources, 1):
956
  doc["citation_index"] = idx
 
 
 
957
 
958
  result = {
959
  "answer": answer,
 
131
 
132
  context_text += formatted + "\n\n"
133
  total_tokens += tokens
134
+
135
+ # ── ENHANCEMENT: Extract Semantic TL;DR Highlight ──
136
+ doc["highlight"] = self._extract_highlight_sentence(query, content)
137
+ doc["image_url"] = doc.get("image_url") or metadata.get("image_url") or metadata.get("url_to_image")
138
+
139
  filtered_sources.append(doc)
140
 
141
  return context_text, filtered_sources
142
 
143
+ def _extract_highlight_sentence(self, query: str, content: str) -> str:
144
+ """
145
+ Extracts the single most relevant sentence from the content for hoverable citations.
146
+ Uses a simple sentence splitter and keyword overlap for speed.
147
+ """
148
+ import re
149
+ sentences = re.split(r'(?<=[.!?])\s+', content)
150
+ if not sentences: return content[:150] + "..."
151
+
152
+ query_terms = set(query.lower().split())
153
+ best_sentence = sentences[0]
154
+ max_overlap = -1
155
+
156
+ for s in sentences:
157
+ if len(s) < 20: continue
158
+ overlap = len(set(s.lower().split()) & query_terms)
159
+ if overlap > max_overlap:
160
+ max_overlap = overlap
161
+ best_sentence = s
162
+
163
+ return best_sentence.strip()[:250]
164
+
165
  def _extract_intents_and_translate(self, query: str) -> Dict[str, Any]:
166
  """
167
  Single LLM call: query understanding + multilingual translation.
 
981
  # ── Attach citation index to each source for frontend rendering ───────
982
  for idx, doc in enumerate(final_sources, 1):
983
  doc["citation_index"] = idx
984
+ # Ensure snippet exists for the TL;DR hover
985
+ if not doc.get("snippet"):
986
+ doc["snippet"] = doc.get("highlight", doc.get("content", "")[:200])
987
 
988
  result = {
989
  "answer": answer,
src/infrastructure/adapters/duckduckgo_adapter.py CHANGED
@@ -339,6 +339,8 @@ class DuckDuckGoAdapter:
339
  # Calculate freshness score (live results are freshest)
340
  freshness_score = self._calculate_freshness(published_at)
341
 
 
 
342
  return {
343
  "title": title,
344
  "url": url,
@@ -346,6 +348,7 @@ class DuckDuckGoAdapter:
346
  "snippet": snippet,
347
  "source": source or self._extract_domain(url),
348
  "published_at": published_at,
 
349
  "source_type": "live",
350
  "is_live": True,
351
  "freshness_score": freshness_score,
@@ -355,6 +358,7 @@ class DuckDuckGoAdapter:
355
  "url": url,
356
  "source": source,
357
  "published_at": published_at,
 
358
  "search_engine": "duckduckgo"
359
  }
360
  }
 
339
  # Calculate freshness score (live results are freshest)
340
  freshness_score = self._calculate_freshness(published_at)
341
 
342
+ image_url = raw_result.get("image") or raw_result.get("thumbnail")
343
+
344
  return {
345
  "title": title,
346
  "url": url,
 
348
  "snippet": snippet,
349
  "source": source or self._extract_domain(url),
350
  "published_at": published_at,
351
+ "image_url": image_url,
352
  "source_type": "live",
353
  "is_live": True,
354
  "freshness_score": freshness_score,
 
358
  "url": url,
359
  "source": source,
360
  "published_at": published_at,
361
+ "image_url": image_url,
362
  "search_engine": "duckduckgo"
363
  }
364
  }