chinmayjha committed on
Commit
d8c683d
·
unverified ·
1 Parent(s): d7fa497

Fix Sources section parsing in CustomGradioUI

Browse files

- Added _parse_sources_from_text() method to parse new Sources format
- Updated parse_agent_response() to split answer from Sources section
- Sources section now properly displays separately from answer
- Extracts Doc titles, dates, summaries, and key findings from formatted text

src/second_brain_online/application/ui/custom_gradio_ui.py CHANGED
@@ -207,6 +207,79 @@ class CustomGradioUI:
207
  error_msg = f"<div style='color: #dc3545; padding: 12px; border: 1px solid #f5c6cb; border-radius: 4px; background-color: #f8d7da;'>Error: {str(e)}</div>"
208
  return error_msg, "", "", str(e), self.load_conversations()
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  def parse_agent_response(self, result: Any, agent_logs: List = None) -> Tuple[str, List[Dict], List[str]]:
211
  """Parse the agent response to extract answer, sources, and tools used."""
212
  answer = ""
@@ -236,6 +309,24 @@ class CustomGradioUI:
236
  # Pattern 3: Use the entire result as answer if no specific pattern matches
237
  answer = result_str
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # If we have agent logs, extract tools and sources from them
240
  if agent_logs:
241
  for step in agent_logs:
 
207
  error_msg = f"<div style='color: #dc3545; padding: 12px; border: 1px solid #f5c6cb; border-radius: 4px; background-color: #f8d7da;'>Error: {str(e)}</div>"
208
  return error_msg, "", "", str(e), self.load_conversations()
209
 
210
+ def _parse_sources_from_text(self, sources_text: str) -> List[Dict]:
211
+ """Parse sources from the formatted text output.
212
+
213
+ Expected format:
214
+ Doc 1: Title (Date)
215
+ Source: Type | Document ID: ID | URL | User ID
216
+
217
+ Summary: ...
218
+
219
+ Key Findings:
220
+ - [Type/Impact] Finding
221
+ """
222
+ sources = []
223
+
224
+ # Split by "Doc X:" pattern
225
+ doc_pattern = r'Doc\s+(\d+):\s*([^\n]+)'
226
+ doc_matches = re.finditer(doc_pattern, sources_text)
227
+
228
+ for match in doc_matches:
229
+ doc_num = match.group(1)
230
+ title_line = match.group(2).strip()
231
+
232
+ # Find the next Doc or end of string
233
+ start_pos = match.end()
234
+ next_match = re.search(r'Doc\s+\d+:', sources_text[start_pos:])
235
+ if next_match:
236
+ end_pos = start_pos + next_match.start()
237
+ doc_content = sources_text[start_pos:end_pos]
238
+ else:
239
+ doc_content = sources_text[start_pos:]
240
+
241
+ # Extract title and date from title line
242
+ title_date_match = re.match(r'(.+?)\s*\(([^)]+)\)', title_line)
243
+ if title_date_match:
244
+ title = title_date_match.group(1).strip()
245
+ date = title_date_match.group(2).strip()
246
+ else:
247
+ title = title_line
248
+ date = ""
249
+
250
+ # Extract document ID
251
+ doc_id = ""
252
+ id_match = re.search(r'Document ID:\s*([a-zA-Z0-9]+)', doc_content)
253
+ if id_match:
254
+ doc_id = id_match.group(1)
255
+
256
+ # Extract summary
257
+ summary = ""
258
+ summary_match = re.search(r'Summary:\s*([^\n]+)', doc_content)
259
+ if summary_match:
260
+ summary = summary_match.group(1).strip()
261
+
262
+ # Extract key findings
263
+ key_findings = []
264
+ findings_section = re.search(r'Key Findings:\s*(.+?)(?=\n\nDoc\s+\d+:|$)', doc_content, re.DOTALL)
265
+ if findings_section:
266
+ findings_text = findings_section.group(1)
267
+ # Extract each finding line
268
+ finding_lines = re.findall(r'-\s*\[([^\]]+)\]\s*([^\n]+)', findings_text)
269
+ for finding_type, finding_text in finding_lines:
270
+ key_findings.append(f"[{finding_type}] {finding_text.strip()}")
271
+
272
+ sources.append({
273
+ "id": doc_id,
274
+ "title": title,
275
+ "date": date,
276
+ "summary": summary,
277
+ "key_findings": key_findings,
278
+ "quotes": [] # Not using quotes in new format
279
+ })
280
+
281
+ return sources
282
+
283
  def parse_agent_response(self, result: Any, agent_logs: List = None) -> Tuple[str, List[Dict], List[str]]:
284
  """Parse the agent response to extract answer, sources, and tools used."""
285
  answer = ""
 
309
  # Pattern 3: Use the entire result as answer if no specific pattern matches
310
  answer = result_str
311
 
312
+ # NEW: Split answer and sources section
313
+ # Look for the Sources section marker (📚 Sources:)
314
+ sources_split = re.split(r'📚\s*Sources:?', answer, maxsplit=1, flags=re.IGNORECASE)
315
+
316
+ if len(sources_split) == 2:
317
+ # We found a Sources section
318
+ answer_only = sources_split[0].strip()
319
+ sources_text = sources_split[1].strip()
320
+
321
+ # Parse sources from the text
322
+ sources = self._parse_sources_from_text(sources_text)
323
+
324
+ # Update answer to only include the answer part
325
+ answer = answer_only
326
+ else:
327
+ # No sources section found, answer remains as-is
328
+ pass
329
+
330
  # If we have agent logs, extract tools and sources from them
331
  if agent_logs:
332
  for step in agent_logs: