Ryan committed on
Commit
ee6b298
·
1 Parent(s): 12f4fc7

update UI

Browse files
Files changed (3) hide show
  1. app.py +63 -100
  2. citations.py +99 -32
  3. query.py +2 -2
app.py CHANGED
@@ -2,121 +2,84 @@ import gradio as gr
2
  import os
3
  from query import ask
4
 
5
- # Import query module (this starts loading the embedding model in background)
6
  print("🚀 Starting 80,000 Hours RAG system...")
7
  from query import is_model_ready
8
  print("✅ App ready! Model loading in background...")
9
 
10
- def chat_interface(question: str, show_context: bool = False):
11
- """Process question and return formatted response."""
12
- if not question.strip():
13
- return "Please enter a question.", ""
14
 
15
- result = ask(question, show_context=show_context)
 
 
 
 
 
 
 
 
 
16
 
17
- # Format main response
18
- answer = result["answer"]
19
 
20
- # Format citations
21
- citations_text = ""
22
- if result["citations"]:
23
- citations_text += "\n\n---\n\n### 📚 Citations\n\n"
24
- for i, citation in enumerate(result["citations"], 1):
25
- # Use matched_text (actual source text) instead of AI's quote
26
- display_text = citation.get('matched_text', citation['quote'])
27
- # Replace markdown bullets with bullet character for display in quote block
28
- display_text = display_text.replace('\n- ', '\n• ')
29
- if display_text.startswith('- '):
30
- display_text = '\n• ' + display_text[2:]
31
- citations_text += f"**[{i}]** {citation['title']}\n\n"
32
- citations_text += f"> \"{display_text}\"\n\n"
33
- citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
34
 
35
- # Add validation warnings if any
36
- if result.get("validation_errors"):
37
- citations_text += "\n---\n\n### ⚠️ Validation Warnings\n\n"
38
- for error in result["validation_errors"]:
39
- fuzzy_score = error.get('fuzzy_match_score', 0)
40
- citations_text += f"**[{error['citation_id']}]** {error['reason']}\n\n"
41
-
42
- # Format claimed quote (stored as 'quote' in validation result)
43
- claimed_quote = error.get('quote', '')
44
- claimed_quote = claimed_quote.replace('\n- ', '\n• ')
45
- if claimed_quote.startswith('- '):
46
- claimed_quote = '\n• ' + claimed_quote[2:]
47
- citations_text += f"**AI's claimed quote:**\n> \"{claimed_quote}\"\n\n"
48
-
49
- # Format matched text from source
50
- if error.get('matched_text'):
51
- matched_text = error['matched_text']
52
- matched_text = matched_text.replace('\n- ', '\n• ')
53
- if matched_text.startswith('- '):
54
- matched_text = '\n• ' + matched_text[2:]
55
- citations_text += f"**Closest match in actual source** ({fuzzy_score:.1f}% match):\n> \"{matched_text}\"\n\n"
56
 
57
- # Add stats
58
  if result["citations"]:
59
- valid_count = len([c for c in result["citations"] if c.get("validated", True)])
60
- total_count = len(result["citations"])
61
- citations_text += f"\n✓ {valid_count}/{total_count} citations validated"
 
62
 
63
- return answer, citations_text
64
 
65
 
66
  # --- Build Gradio UI ---
67
- with gr.Blocks(title="80,000 Hours Q&A", theme=gr.themes.Soft()) as demo:
68
- gr.Markdown(
69
- """
70
- # 🎯 80,000 Hours Career Advice Q&A
71
- Ask questions about career planning and get answers backed by citations from 80,000 Hours articles.
72
- """
73
- )
74
-
75
- with gr.Row():
76
- with gr.Column():
77
- question_input = gr.Textbox(
78
- label="Your Question",
79
- placeholder="e.g., Should I plan my entire career?",
80
- lines=2
81
- )
82
- show_context_checkbox = gr.Checkbox(
83
- label="Show retrieved context (for debugging)",
84
- value=False
85
- )
86
- submit_btn = gr.Button("Ask", variant="primary")
87
-
88
- with gr.Row():
89
- with gr.Column():
90
- answer_output = gr.Textbox(
91
- label="Answer",
92
- lines=10,
93
- show_copy_button=True
94
- )
95
-
96
- with gr.Column():
97
- citations_output = gr.Markdown(label="Citations & Sources")
98
-
99
- # Event handlers
100
- submit_btn.click(
101
- fn=chat_interface,
102
- inputs=[question_input, show_context_checkbox],
103
- outputs=[answer_output, citations_output]
104
- )
105
-
106
- question_input.submit(
107
  fn=chat_interface,
108
- inputs=[question_input, show_context_checkbox],
109
- outputs=[answer_output, citations_output]
110
- )
111
-
112
- gr.Examples(
113
- examples = [
114
- "What skills will be most in demand in the next 5–10 years?",
115
- "What careers will be most affected by AI?",
116
- "How can I work on the world's most pressing problems?",
117
- "How do I figure out what I want to do with my life?",
118
- ],
119
- inputs=question_input
 
 
 
 
 
 
 
120
  )
121
 
122
  # --- Launch Logic ---
 
2
  import os
3
  from query import ask
4
 
 
5
  print("🚀 Starting 80,000 Hours RAG system...")
6
  from query import is_model_ready
7
  print("✅ App ready! Model loading in background...")
8
 
9
+ def chat_interface(message: str, history):
10
+ """Process question and return formatted response for chatbot.
 
 
11
 
12
+ Args:
13
+ message: User's question (string or dict with 'content' key)
14
+ history: Chat history (list of message dicts with 'role' and 'content')
15
+
16
+ Returns:
17
+ Formatted response with answer and citations
18
+ """
19
+ # Handle both string and dict message formats
20
+ if isinstance(message, dict):
21
+ message = message.get('text', message.get('content', ''))
22
 
23
+ if not message or not message.strip():
24
+ return ""
25
 
26
+ result = ask(message, show_context=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ # Format response: answer first, then divider, then citations
29
+ response = result["answer"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # Add citations after divider
32
  if result["citations"]:
33
+ response += "\n\n---\n\n**Citations:**\n\n"
34
+ for i, citation in enumerate(result["citations"], 1):
35
+ # Replace bullet points in citation text with newline + bullet icon
36
+ response += f"**[{i}]** [{citation['title']}]({citation['url']})\n\n"
37
 
38
+ return response
39
 
40
 
41
  # --- Build Gradio UI ---
42
+ with gr.Blocks(title="80,000 Hours Q&A", theme=gr.themes.Soft(), css="""
43
+ footer {display: none !important;}
44
+ .examples button {
45
+ background: linear-gradient(to bottom, #ffffff, #f8f9fa) !important;
46
+ border: 2px solid #dee2e6 !important;
47
+ border-radius: 8px !important;
48
+ padding: 12px 16px !important;
49
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05) !important;
50
+ transition: all 0.2s ease !important;
51
+ }
52
+ .examples button:hover {
53
+ border-color: #adb5bd !important;
54
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1) !important;
55
+ transform: translateY(-1px) !important;
56
+ }
57
+ """) as demo:
58
+ # Title section
59
+ gr.Markdown("# 80,000 Hours Q&A")
60
+ gr.Markdown("*Ask questions about career planning and get answers backed by citations from 80,000 Hours articles.*")
61
+
62
+ gr.ChatInterface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  fn=chat_interface,
64
+ type="messages",
65
+ chatbot=gr.Chatbot(
66
+ height=400,
67
+ show_copy_button=True,
68
+ render_markdown=True,
69
+ layout="bubble",
70
+ type="messages"
71
+ ),
72
+ textbox=gr.MultimodalTextbox(
73
+ placeholder="Ask about career planning...",
74
+ show_label=False,
75
+ submit_btn=True,
76
+ sources=[]
77
+ ),
78
+ examples=[
79
+ "What skills will be most in demand in the next 5–10 years?",
80
+ "How can I work on the world's most pressing problems?",
81
+ "How do I figure out what I want to do with my life?",
82
+ ]
83
  )
84
 
85
  # --- Launch Logic ---
citations.py CHANGED
@@ -144,33 +144,93 @@ def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any])
144
  "validation_errors": validation_errors
145
  }
146
 
147
- def _expand_to_word_boundaries(text: str, start: int, end: int) -> Tuple[int, int]:
148
- """Expand alignment boundaries to include complete words.
 
 
 
 
149
 
150
- Handles hyphenated words (e.g., "long-term"), contractions (e.g., "don't"),
151
- and possessives (e.g., "company's").
 
 
152
 
153
  Args:
154
- text: The full source text
155
- start: Start position from alignment
156
- end: End position from alignment
157
 
158
  Returns:
159
- Tuple of (expanded_start, expanded_end)
160
  """
161
- def is_word_char(char: str) -> bool:
162
- """Check if character is part of a word (alphanumeric, hyphen, or apostrophe)."""
163
- return char.isalnum() or char in ("-", "'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # Expand start backward to beginning of word
166
- while start > 0 and is_word_char(text[start - 1]):
167
- start -= 1
 
168
 
169
- # Expand end forward to end of word
170
- while end < len(text) and is_word_char(text[end]):
171
- end += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
- return start, end
174
 
175
  def _build_valid_result(quote: str, chunk: Any, chunk_id: int, score: float,
176
  matched_text: str, remapped: bool = False) -> Dict[str, Any]:
@@ -208,35 +268,42 @@ def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> D
208
  "source_text": None
209
  }
210
 
 
 
 
 
211
  # Step 1: Check the AI's cited source first (fast path)
212
  source_text = source_chunks[source_id - 1].payload['text']
213
- primary_alignment = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=FUZZY_THRESHOLD)
214
 
215
- if primary_alignment:
216
- # Expand to word boundaries to avoid cutting off partial words
217
- start, end = _expand_to_word_boundaries(source_text, primary_alignment.dest_start, primary_alignment.dest_end)
 
 
218
  matched_text = source_text[start:end].strip()
219
- return _build_valid_result(quote, source_chunks[source_id - 1], source_id, primary_alignment.score, matched_text)
220
 
221
  # Step 2: Search other sources for remapping (AI cited wrong source)
222
  for idx, chunk in enumerate(source_chunks, 1):
223
  if idx == source_id:
224
  continue # Already checked
225
- other_alignment = fuzz.partial_ratio_alignment(quote, chunk.payload['text'], score_cutoff=FUZZY_THRESHOLD)
226
- if other_alignment:
227
- # Expand to word boundaries to avoid cutting off partial words
228
- start, end = _expand_to_word_boundaries(chunk.payload['text'], other_alignment.dest_start, other_alignment.dest_end)
 
229
  matched_text = chunk.payload['text'][start:end].strip()
230
- return _build_valid_result(quote, chunk, idx, other_alignment.score, matched_text, remapped=True)
231
 
232
  # Validation failed - find closest match for debugging
233
  matched_text = ""
234
  actual_score = 0
235
  try:
236
- debug_alignment = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=70)
237
- if debug_alignment:
238
- matched_text = source_text[debug_alignment.dest_start:debug_alignment.dest_end].strip()
239
- actual_score = debug_alignment.score
 
240
  except:
241
  pass
242
 
 
144
  "validation_errors": validation_errors
145
  }
146
 
147
+ def _is_word_char(char: str) -> bool:
148
+ """Check if character is part of a word (alphanumeric, comma, hyphen, apostrophe)."""
149
+ return char.isalnum() or char in (',', '-', "'", "'")
150
+
151
+ def _find_best_match_position(quote: str, source_text: str, alignment_hint=None) -> Tuple[int, int, float]:
152
+ """Find the best matching position for a quote in source text using sliding window.
153
 
154
+ This method is better than partial_ratio_alignment because it:
155
+ 1. Uses word boundaries naturally
156
+ 2. Finds the best matching substring at the token level
157
+ 3. Returns positions that align with actual text segments
158
 
159
  Args:
160
+ quote: The text to find
161
+ source_text: The text to search in
162
+ alignment_hint: Optional alignment result from partial_ratio_alignment to focus search
163
 
164
  Returns:
165
+ Tuple of (start_pos, end_pos, score). Returns (-1, -1, 0) if no good match.
166
  """
167
+ import re
168
+
169
+ # Normalize whitespace for matching
170
+ quote_normalized = ' '.join(quote.split())
171
+
172
+ # Split source into words with their positions
173
+ # This regex splits on whitespace while preserving positions
174
+ word_pattern = re.compile(r'\S+')
175
+ source_words = []
176
+ for match in word_pattern.finditer(source_text):
177
+ source_words.append({
178
+ 'word': match.group(),
179
+ 'start': match.start(),
180
+ 'end': match.end()
181
+ })
182
+
183
+ quote_words = quote_normalized.split()
184
+
185
+ if not quote_words or not source_words:
186
+ return -1, -1, 0
187
+
188
+ # Determine search range based on alignment hint
189
+ if alignment_hint:
190
+ # Find which word index contains the alignment position
191
+ center_word_idx = 0
192
+ for idx, word_info in enumerate(source_words):
193
+ if word_info['start'] <= alignment_hint.dest_start < word_info['end']:
194
+ center_word_idx = idx
195
+ break
196
+
197
+ # Search within +/- 5 words of the hint position
198
+ search_start_idx = max(0, center_word_idx - 5)
199
+ search_end_idx = min(len(source_words), center_word_idx + len(quote_words) + 5)
200
+ else:
201
+ # No hint found, search entire text (fallback)
202
+ search_start_idx = 0
203
+ search_end_idx = len(source_words)
204
+
205
+ best_score = 0
206
+ best_start = -1
207
+ best_end = -1
208
 
209
+ # Try different window sizes around the quote length
210
+ # Quote should never be longer than source, so only check smaller windows
211
+ min_window = max(1, len(quote_words) - 3)
212
+ max_window = min(search_end_idx - search_start_idx, len(quote_words))
213
 
214
+ for window_size in range(min_window, max_window + 1):
215
+ for i in range(search_start_idx, min(search_end_idx - window_size + 1, len(source_words) - window_size + 1)):
216
+ # Get window of words
217
+ window_words = [source_words[j]['word'] for j in range(i, i + window_size)]
218
+ window_text = ' '.join(window_words)
219
+
220
+ # Calculate similarity score
221
+ score = fuzz.ratio(quote_normalized, window_text)
222
+
223
+ if score > best_score:
224
+ best_score = score
225
+ # Use the start of the first word and end of the last word
226
+ best_start = source_words[i]['start']
227
+ best_end = source_words[i + window_size - 1]['end']
228
+
229
+ # Strip trailing punctuation from the end position
230
+ while best_end > best_start and source_text[best_end - 1] in '.,;:!?)':
231
+ best_end -= 1
232
 
233
+ return best_start, best_end, best_score
234
 
235
  def _build_valid_result(quote: str, chunk: Any, chunk_id: int, score: float,
236
  matched_text: str, remapped: bool = False) -> Dict[str, Any]:
 
268
  "source_text": None
269
  }
270
 
271
+ # If quote contains ellipsis, only match the part before it
272
+ if '...' in quote:
273
+ quote = quote.split('...')[0].strip()
274
+
275
  # Step 1: Check the AI's cited source first (fast path)
276
  source_text = source_chunks[source_id - 1].payload['text']
 
277
 
278
+ # Get alignment hint from partial_ratio_alignment
279
+ alignment_hint = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=70)
280
+ start, end, score = _find_best_match_position(quote, source_text, alignment_hint)
281
+
282
+ if score >= FUZZY_THRESHOLD and start != -1:
283
  matched_text = source_text[start:end].strip()
284
+ return _build_valid_result(quote, source_chunks[source_id - 1], source_id, score, matched_text)
285
 
286
  # Step 2: Search other sources for remapping (AI cited wrong source)
287
  for idx, chunk in enumerate(source_chunks, 1):
288
  if idx == source_id:
289
  continue # Already checked
290
+
291
+ # Get alignment hint for this chunk
292
+ alignment_hint = fuzz.partial_ratio_alignment(quote, chunk.payload['text'], score_cutoff=70)
293
+ start, end, score = _find_best_match_position(quote, chunk.payload['text'], alignment_hint)
294
+ if score >= FUZZY_THRESHOLD and start != -1:
295
  matched_text = chunk.payload['text'][start:end].strip()
296
+ return _build_valid_result(quote, chunk, idx, score, matched_text, remapped=True)
297
 
298
  # Validation failed - find closest match for debugging
299
  matched_text = ""
300
  actual_score = 0
301
  try:
302
+ debug_hint = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=60)
303
+ debug_start, debug_end, debug_score = _find_best_match_position(quote, source_text, debug_hint)
304
+ if debug_score >= 70 and debug_start != -1:
305
+ matched_text = source_text[debug_start:debug_end].strip()
306
+ actual_score = debug_score
307
  except:
308
  pass
309
 
query.py CHANGED
@@ -137,7 +137,7 @@ def generate_answer_with_citations(
137
 
138
  CRITICAL RULES:
139
  1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
140
- 2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
141
  3. source_id MUST match the source number: [Source 1] → source_id: 1, [Source 5] → source_id: 5
142
  4. Each quote must be complete sentences from the source
143
 
@@ -303,7 +303,7 @@ def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
303
  print(f"[TIMING] Total: {total_time:.0f}ms")
304
 
305
  # Display results
306
- display_results(question, result, context if show_context else None)
307
 
308
  # Save debug output
309
  save_validation_results(question, result, results, 0)
 
137
 
138
  CRITICAL RULES:
139
  1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
140
+ 2. Copy quotes EXACTLY - No changes, NO ellipses, No paraphrasing
141
  3. source_id MUST match the source number: [Source 1] → source_id: 1, [Source 5] → source_id: 5
142
  4. Each quote must be complete sentences from the source
143
 
 
303
  print(f"[TIMING] Total: {total_time:.0f}ms")
304
 
305
  # Display results
306
+ # display_results(question, result, context if show_context else None)
307
 
308
  # Save debug output
309
  save_validation_results(question, result, results, 0)