Ryan committed on
Commit
c759ad8
·
1 Parent(s): 83175f3

- better formatting and debug

Browse files
Files changed (4) hide show
  1. app.py +22 -2
  2. citations.py +46 -50
  3. query.py +21 -4
  4. test_url.py +18 -0
app.py CHANGED
@@ -19,15 +19,35 @@ def chat_interface(question: str, show_context: bool = False):
19
  for i, citation in enumerate(result["citations"], 1):
20
  # Use matched_text (actual source text) instead of AI's quote
21
  display_text = citation.get('matched_text', citation['quote'])
 
 
 
 
22
  citations_text += f"**[{i}]** {citation['title']}\n\n"
23
  citations_text += f"> \"{display_text}\"\n\n"
24
  citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
25
 
26
  # Add validation warnings if any
27
  if result.get("validation_errors"):
28
- citations_text += "\n⚠️ **Validation Warnings:**\n"
29
  for error in result["validation_errors"]:
30
- citations_text += f"- {error}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # Add stats
33
  if result["citations"]:
 
19
  for i, citation in enumerate(result["citations"], 1):
20
  # Use matched_text (actual source text) instead of AI's quote
21
  display_text = citation.get('matched_text', citation['quote'])
22
+ # Replace markdown bullets with bullet character for display in quote block
23
+ display_text = display_text.replace('\n- ', '\n• ')
24
+ if display_text.startswith('- '):
25
+ display_text = '\n• ' + display_text[2:]
26
  citations_text += f"**[{i}]** {citation['title']}\n\n"
27
  citations_text += f"> \"{display_text}\"\n\n"
28
  citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
29
 
30
  # Add validation warnings if any
31
  if result.get("validation_errors"):
32
+ citations_text += "\n---\n\n### ⚠️ Validation Warnings\n\n"
33
  for error in result["validation_errors"]:
34
+ fuzzy_score = error.get('fuzzy_match_score', 0)
35
+ citations_text += f"**[{error['citation_id']}]** {error['reason']}\n\n"
36
+
37
+ # Format claimed quote (stored as 'quote' in validation result)
38
+ claimed_quote = error.get('quote', '')
39
+ claimed_quote = claimed_quote.replace('\n- ', '\n• ')
40
+ if claimed_quote.startswith('- '):
41
+ claimed_quote = '\n• ' + claimed_quote[2:]
42
+ citations_text += f"**AI's claimed quote:**\n> \"{claimed_quote}\"\n\n"
43
+
44
+ # Format matched text from source
45
+ if error.get('matched_text'):
46
+ matched_text = error['matched_text']
47
+ matched_text = matched_text.replace('\n- ', '\n• ')
48
+ if matched_text.startswith('- '):
49
+ matched_text = '\n• ' + matched_text[2:]
50
+ citations_text += f"**Closest match in actual source** ({fuzzy_score:.1f}% fuzzy match):\n> \"{matched_text}\"\n\n"
51
 
52
  # Add stats
53
  if result["citations"]:
citations.py CHANGED
@@ -10,7 +10,7 @@ from rapidfuzz import fuzz
10
  from fuzzysearch import find_near_matches
11
 
12
 
13
- FUZZY_THRESHOLD = 95
14
 
15
  def find_best_match_substring(quote: str, source_text: str) -> str:
16
  """Find the actual matching substring in source_text.
@@ -56,23 +56,27 @@ def create_highlighted_url(base_url: str, quote_text: str) -> str:
56
  Returns:
57
  URL with text fragment
58
  """
 
 
 
 
 
 
 
59
  # Extract a meaningful snippet (first ~80 chars work better for text fragments)
60
  # Cut at word boundaries to avoid breaking words mid-way
61
  max_length = 80
62
- if len(quote_text) > max_length:
63
  # Find the last space before the cutoff
64
- text_fragment = quote_text[:max_length]
65
  last_space = text_fragment.rfind(' ')
66
  if last_space > 0: # If we found a space, cut there
67
  text_fragment = text_fragment[:last_space]
68
  else:
69
- text_fragment = quote_text
70
 
71
  text_fragment = text_fragment.strip()
72
 
73
- # Encode everything for maximum compatibility
74
- # quote() with safe='' still preserves unreserved chars (- . _ ~)
75
- # So we manually encode those too
76
  encoded_text = quote(text_fragment, safe='')
77
  # Manually encode the unreserved chars that quote() preserves
78
  encoded_text = encoded_text.replace('-', '%2D')
@@ -123,7 +127,7 @@ def build_citation_entry(citation: Dict[str, Any], validation_result: Dict[str,
123
  Returns:
124
  Complete citation entry with URL and metadata
125
  """
126
- matched_text = validation_result.get("matched_text", citation.get("quote", ""))
127
  highlighted_url = create_highlighted_url(
128
  validation_result["url"],
129
  matched_text
@@ -135,10 +139,9 @@ def build_citation_entry(citation: Dict[str, Any], validation_result: Dict[str,
135
  "matched_text": matched_text, # Actual text from source
136
  "title": validation_result["title"],
137
  "url": highlighted_url,
138
- "similarity_score": validation_result["similarity_score"]
 
139
  }
140
- if validation_result.get("remapped"):
141
- citation_entry["remapped_from"] = validation_result["original_source_id"]
142
  return citation_entry
143
 
144
  def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any]) -> Dict[str, Any]:
@@ -165,18 +168,32 @@ def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any])
165
  citation_entry = build_citation_entry(citation, validation_result)
166
  validated_citations.append(citation_entry)
167
  else:
168
- validation_errors.append({
169
- "citation_id": citation_id,
170
- "reason": validation_result['reason'],
171
- "claimed_quote": quote,
172
- "source_text": validation_result.get('source_text')
173
- })
174
 
175
  return {
176
  "validated_citations": validated_citations,
177
  "validation_errors": validation_errors
178
  }
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> Dict[str, Any]:
181
  """Validate that a quote exists in the specified source chunk.
182
 
@@ -197,51 +214,30 @@ def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> D
197
  "source_text": None
198
  }
199
 
200
-
201
-
202
  # Step 1: Check claimed source first (fast path)
203
  source_text = source_chunks[source_id - 1].payload['text']
204
  claimed_score = fuzz.partial_ratio(quote, source_text)
205
 
206
  if claimed_score >= FUZZY_THRESHOLD:
207
- # Find the actual matching substring in the source
208
- matched_substring = find_best_match_substring(quote, source_chunks[source_id - 1].payload['text'])
209
- return {
210
- "valid": True,
211
- "quote": quote,
212
- "matched_text": matched_substring, # The actual matching text from 80k Hours
213
- "source_id": source_id,
214
- "title": source_chunks[source_id - 1].payload['title'],
215
- "url": source_chunks[source_id - 1].payload['url'],
216
- "similarity_score": claimed_score
217
- }
218
 
 
219
  for idx, chunk in enumerate(source_chunks, 1):
220
  if idx == source_id:
221
  continue # Already checked
222
  score = fuzz.partial_ratio(quote, chunk.payload['text'])
223
  if score >= FUZZY_THRESHOLD:
224
- # Find the actual matching substring in the source
225
- matched_substring = find_best_match_substring(quote, chunk.payload['text'])
226
- return {
227
- "valid": True,
228
- "quote": quote,
229
- "matched_text": matched_substring, # The actual matching text from 80k Hours
230
- "source_id": idx,
231
- "title": chunk.payload['title'],
232
- "url": chunk.payload['url'],
233
- "similarity_score": score,
234
- "remapped": True,
235
- "original_source_id": source_id
236
- }
237
 
238
- # Validation failed - report best score from claimed source
 
239
  return {
240
  "valid": False,
241
  "quote": quote,
242
  "source_id": source_id,
243
- "reason": f"Quote not found in any source (claimed source: {claimed_score:.1f}% similarity)",
244
- "source_text": source_chunks[source_id - 1].payload['text']
 
245
  }
246
 
247
 
@@ -263,12 +259,12 @@ def format_citations_display(citations: List[Dict[str, Any]]) -> str:
263
  citation_parts = []
264
  for cit in sorted_citations:
265
  marker = f"[{cit['citation_id']}]"
266
- score = cit.get('similarity_score', 100)
267
 
268
- if cit.get('remapped_from'):
269
- note = f" ({score:.1f}% match, remapped: source {cit['remapped_from']} → {cit['source_id']})"
270
  else:
271
- note = f" ({score:.1f}% match)"
272
 
273
  citation_parts.append(
274
  f"{marker} {cit['title']}{note}\n"
 
10
  from fuzzysearch import find_near_matches
11
 
12
 
13
+ FUZZY_THRESHOLD = 90
14
 
15
  def find_best_match_substring(quote: str, source_text: str) -> str:
16
  """Find the actual matching substring in source_text.
 
56
  Returns:
57
  URL with text fragment
58
  """
59
+ # Take only the first line/paragraph (text fragments can't match across elements)
60
+ first_line = quote_text.split('\n')[0].strip()
61
+
62
+ # Remove bullet point markers (they're formatting, not content)
63
+ if first_line.startswith('- '):
64
+ first_line = first_line[2:].strip()
65
+
66
  # Extract a meaningful snippet (first ~80 chars work better for text fragments)
67
  # Cut at word boundaries to avoid breaking words mid-way
68
  max_length = 80
69
+ if len(first_line) > max_length:
70
  # Find the last space before the cutoff
71
+ text_fragment = first_line[:max_length]
72
  last_space = text_fragment.rfind(' ')
73
  if last_space > 0: # If we found a space, cut there
74
  text_fragment = text_fragment[:last_space]
75
  else:
76
+ text_fragment = first_line
77
 
78
  text_fragment = text_fragment.strip()
79
 
 
 
 
80
  encoded_text = quote(text_fragment, safe='')
81
  # Manually encode the unreserved chars that quote() preserves
82
  encoded_text = encoded_text.replace('-', '%2D')
 
127
  Returns:
128
  Complete citation entry with URL and metadata
129
  """
130
+ matched_text = validation_result["matched_text"]
131
  highlighted_url = create_highlighted_url(
132
  validation_result["url"],
133
  matched_text
 
139
  "matched_text": matched_text, # Actual text from source
140
  "title": validation_result["title"],
141
  "url": highlighted_url,
142
+ "fuzzy_match_score": validation_result["fuzzy_match_score"],
143
+ "remapped": validation_result.get("remapped", False)
144
  }
 
 
145
  return citation_entry
146
 
147
  def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any]) -> Dict[str, Any]:
 
168
  citation_entry = build_citation_entry(citation, validation_result)
169
  validated_citations.append(citation_entry)
170
  else:
171
+ # Add citation_id to validation result for tracking
172
+ validation_result["citation_id"] = citation_id
173
+ validation_errors.append(validation_result)
 
 
 
174
 
175
  return {
176
  "validated_citations": validated_citations,
177
  "validation_errors": validation_errors
178
  }
179
 
180
+ def _build_valid_result(quote: str, chunk: Any, chunk_id: int, score: float,
181
+ remapped: bool = False) -> Dict[str, Any]:
182
+ """Build a valid citation result dict."""
183
+ matched_substring = find_best_match_substring(quote, chunk.payload['text'])
184
+ result = {
185
+ "valid": True,
186
+ "quote": quote,
187
+ "matched_text": matched_substring,
188
+ "source_id": chunk_id,
189
+ "title": chunk.payload['title'],
190
+ "url": chunk.payload['url'],
191
+ "fuzzy_match_score": score
192
+ }
193
+ if remapped:
194
+ result["remapped"] = True
195
+ return result
196
+
197
  def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> Dict[str, Any]:
198
  """Validate that a quote exists in the specified source chunk.
199
 
 
214
  "source_text": None
215
  }
216
 
 
 
217
  # Step 1: Check claimed source first (fast path)
218
  source_text = source_chunks[source_id - 1].payload['text']
219
  claimed_score = fuzz.partial_ratio(quote, source_text)
220
 
221
  if claimed_score >= FUZZY_THRESHOLD:
222
+ return _build_valid_result(quote, source_chunks[source_id - 1], source_id, claimed_score)
 
 
 
 
 
 
 
 
 
 
223
 
224
+ # Step 2: Search all other sources for remapping
225
  for idx, chunk in enumerate(source_chunks, 1):
226
  if idx == source_id:
227
  continue # Already checked
228
  score = fuzz.partial_ratio(quote, chunk.payload['text'])
229
  if score >= FUZZY_THRESHOLD:
230
+ return _build_valid_result(quote, chunk, idx, score, remapped=True)
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
+ # Validation failed - find closest match for debugging
233
+ matched_text = find_best_match_substring(quote, source_text)
234
  return {
235
  "valid": False,
236
  "quote": quote,
237
  "source_id": source_id,
238
+ "reason": f"Quote not found in any source (claimed source: {claimed_score:.1f}% fuzzy match)",
239
+ "matched_text": matched_text,
240
+ "fuzzy_match_score": claimed_score
241
  }
242
 
243
 
 
259
  citation_parts = []
260
  for cit in sorted_citations:
261
  marker = f"[{cit['citation_id']}]"
262
+ score = cit.get('fuzzy_match_score', 100)
263
 
264
+ if cit.get('remapped'):
265
+ note = f" ({score:.1f}% fuzzy match, remapped)"
266
  else:
267
+ note = f" ({score:.1f}% fuzzy match)"
268
 
269
  citation_parts.append(
270
  f"{marker} {cit['title']}{note}\n"
query.py CHANGED
@@ -17,6 +17,8 @@ SCORE_THRESHOLD = 0.4
17
 
18
  def retrieve_context(question):
19
  """Retrieve relevant chunks from Qdrant."""
 
 
20
  client = QdrantClient(
21
  url=os.getenv("QDRANT_URL"),
22
  api_key=os.getenv("QDRANT_API_KEY"),
@@ -32,6 +34,9 @@ def retrieve_context(question):
32
  score_threshold=SCORE_THRESHOLD,
33
  )
34
 
 
 
 
35
  return results.points
36
 
37
  def format_context(results):
@@ -78,13 +83,17 @@ def generate_answer_with_citations(
78
  STEP 2: Provide citations
79
  - For each [N] in your answer, provide a citation with:
80
  * citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
81
- * source_id: Which source it came from (see [Source N] in context below)
82
  * quote: Copy the EXACT sentences from that source, word-for-word
83
 
 
 
 
 
84
  CRITICAL RULES:
85
  1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
86
  2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
87
- 3. Match source_id to where you found the quote ([Source 1] → source_id: 1)
88
  4. Each quote must be complete sentences from the source
89
 
90
  OUTPUT FORMAT (valid JSON):
@@ -128,7 +137,7 @@ def generate_answer_with_citations(
128
  parsed = parse_llm_response(response.choices[0].message.content)
129
  if "validation_errors" in parsed:
130
  return {
131
- "answer": parsed["answer"],
132
  "citations": [],
133
  "validation_errors": parsed["validation_errors"],
134
  "total_citations": 0,
@@ -139,7 +148,10 @@ def generate_answer_with_citations(
139
  citations = parsed.get("citations", [])
140
 
141
  # Validate citations
 
142
  result = process_citations(citations, results)
 
 
143
 
144
  return {
145
  "answer": answer,
@@ -166,7 +178,7 @@ def save_validation_results(question: str, result: Dict[str, Any], results: List
166
  "title": hit.payload['title'],
167
  "url": hit.payload['url'],
168
  "chunk_id": hit.payload.get('chunk_id'),
169
- "similarity_score": hit.score,
170
  "text": hit.payload['text']
171
  }
172
  for i, hit in enumerate(results, 1)
@@ -212,6 +224,8 @@ def display_results(question: str, result: Dict[str, Any], context: str = None):
212
 
213
  def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
214
  """Main RAG function: retrieve context and generate answer with validated citations."""
 
 
215
  results = retrieve_context(question)
216
  if not results:
217
  print("No relevant sources found above the score threshold.")
@@ -232,6 +246,9 @@ def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
232
  openai_api_key=os.getenv("OPENAI_API_KEY")
233
  )
234
 
 
 
 
235
  # Display results
236
  display_results(question, result, context if show_context else None)
237
 
 
17
 
18
  def retrieve_context(question):
19
  """Retrieve relevant chunks from Qdrant."""
20
+ start = time.time()
21
+
22
  client = QdrantClient(
23
  url=os.getenv("QDRANT_URL"),
24
  api_key=os.getenv("QDRANT_API_KEY"),
 
34
  score_threshold=SCORE_THRESHOLD,
35
  )
36
 
37
+ elapsed = (time.time() - start) * 1000
38
+ print(f"[TIMING] Retrieval: {elapsed:.0f}ms")
39
+
40
  return results.points
41
 
42
  def format_context(results):
 
83
  STEP 2: Provide citations
84
  - For each [N] in your answer, provide a citation with:
85
  * citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
86
+ * source_id: Which source it came from (match the [Source N] label exactly)
87
  * quote: Copy the EXACT sentences from that source, word-for-word
88
 
89
+ EXAMPLE - If you found text in [Source 3]:
90
+ - Your answer: "Career capital helps you succeed [1]."
91
+ - Your citation: {"citation_id": 1, "source_id": 3, "quote": "Career capital includes..."}
92
+
93
  CRITICAL RULES:
94
  1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
95
  2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
96
+ 3. source_id MUST match the source number: [Source 1] → source_id: 1, [Source 5] → source_id: 5
97
  4. Each quote must be complete sentences from the source
98
 
99
  OUTPUT FORMAT (valid JSON):
 
137
  parsed = parse_llm_response(response.choices[0].message.content)
138
  if "validation_errors" in parsed:
139
  return {
140
+ "answer": parsed["answer"], # raw llm response
141
  "citations": [],
142
  "validation_errors": parsed["validation_errors"],
143
  "total_citations": 0,
 
148
  citations = parsed.get("citations", [])
149
 
150
  # Validate citations
151
+ validation_start = time.time()
152
  result = process_citations(citations, results)
153
+ validation_time = (time.time() - validation_start) * 1000
154
+ print(f"[TIMING] Validation: {validation_time:.0f}ms")
155
 
156
  return {
157
  "answer": answer,
 
178
  "title": hit.payload['title'],
179
  "url": hit.payload['url'],
180
  "chunk_id": hit.payload.get('chunk_id'),
181
+ "cosine_similarity": hit.score, # Vector similarity from Qdrant
182
  "text": hit.payload['text']
183
  }
184
  for i, hit in enumerate(results, 1)
 
224
 
225
  def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
226
  """Main RAG function: retrieve context and generate answer with validated citations."""
227
+ total_start = time.time()
228
+
229
  results = retrieve_context(question)
230
  if not results:
231
  print("No relevant sources found above the score threshold.")
 
246
  openai_api_key=os.getenv("OPENAI_API_KEY")
247
  )
248
 
249
+ total_time = (time.time() - total_start) * 1000
250
+ print(f"[TIMING] Total: {total_time:.0f}ms")
251
+
252
  # Display results
253
  display_results(question, result, context if show_context else None)
254
 
test_url.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Simple test for create_highlighted_url function."""
3
+
4
+ from urllib.parse import urlparse
5
+ from citations import create_highlighted_url
6
+
7
+ def extract_base_url(full_url: str) -> str:
8
+ """Extract base URL without fragments."""
9
+ parsed = urlparse(full_url)
10
+ return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
11
+
12
+ # Paste your URL and text here
13
+ full_url = "https://80000hours.org/articles/future-generations/"
14
+ quote_text = '''- Risks from'''
15
+
16
+ base_url = extract_base_url(full_url)
17
+ result = create_highlighted_url(base_url, quote_text)
18
+ print(result)