Spaces:

Automaton9
/

80000_Hours_AI_Assistant

Sleeping

App Files Community

Ryan committed on Oct 14, 2025

Commit

ee6b298

1 Parent(s): 12f4fc7

update UI

Browse files

Files changed (3) hide show

app.py +63 -100
citations.py +99 -32
query.py +2 -2

app.py CHANGED Viewed

@@ -2,121 +2,84 @@ import gradio as gr
 import os
 from query import ask
-# Import query module (this starts loading the embedding model in background)
 print("🚀 Starting 80,000 Hours RAG system...")
 from query import is_model_ready
 print("✅ App ready! Model loading in background...")
-def chat_interface(question: str, show_context: bool = False):
-    """Process question and return formatted response."""
-    if not question.strip():
-        return "Please enter a question.", ""
-    result = ask(question, show_context=show_context)
-    # Format main response
-    answer = result["answer"]
-    # Format citations
-    citations_text = ""
-    if result["citations"]:
-        citations_text += "\n\n---\n\n### 📚 Citations\n\n"
-        for i, citation in enumerate(result["citations"], 1):
-            # Use matched_text (actual source text) instead of AI's quote
-            display_text = citation.get('matched_text', citation['quote'])
-            # Replace markdown bullets with bullet character for display in quote block
-            display_text = display_text.replace('\n- ', '\n• ')
-            if display_text.startswith('- '):
-                display_text = '\n• ' + display_text[2:]
-            citations_text += f"**[{i}]** {citation['title']}\n\n"
-            citations_text += f"> \"{display_text}\"\n\n"
-            citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
-    # Add validation warnings if any
-    if result.get("validation_errors"):
-        citations_text += "\n---\n\n### ⚠️ Validation Warnings\n\n"
-        for error in result["validation_errors"]:
-            fuzzy_score = error.get('fuzzy_match_score', 0)
-            citations_text += f"**[{error['citation_id']}]** {error['reason']}\n\n"
-            # Format claimed quote (stored as 'quote' in validation result)
-            claimed_quote = error.get('quote', '')
-            claimed_quote = claimed_quote.replace('\n- ', '\n• ')
-            if claimed_quote.startswith('- '):
-                claimed_quote = '\n• ' + claimed_quote[2:]
-            citations_text += f"**AI's claimed quote:**\n> \"{claimed_quote}\"\n\n"
-            # Format matched text from source
-            if error.get('matched_text'):
-                matched_text = error['matched_text']
-                matched_text = matched_text.replace('\n- ', '\n• ')
-                if matched_text.startswith('- '):
-                    matched_text = '\n• ' + matched_text[2:]
-                citations_text += f"**Closest match in actual source** ({fuzzy_score:.1f}% match):\n> \"{matched_text}\"\n\n"
-    # Add stats
     if result["citations"]:
-        valid_count = len([c for c in result["citations"] if c.get("validated", True)])
-        total_count = len(result["citations"])
-        citations_text += f"\n✓ {valid_count}/{total_count} citations validated"
-    return answer, citations_text
 # --- Build Gradio UI ---
-with gr.Blocks(title="80,000 Hours Q&A", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(
-        """
-        # 🎯 80,000 Hours Career Advice Q&A
-        Ask questions about career planning and get answers backed by citations from 80,000 Hours articles.
-        """
-    )
-    with gr.Row():
-        with gr.Column():
-            question_input = gr.Textbox(
-                label="Your Question",
-                placeholder="e.g., Should I plan my entire career?",
-                lines=2
-            )
-            show_context_checkbox = gr.Checkbox(
-                label="Show retrieved context (for debugging)",
-                value=False
-            )
-            submit_btn = gr.Button("Ask", variant="primary")
-    with gr.Row():
-        with gr.Column():
-            answer_output = gr.Textbox(
-                label="Answer",
-                lines=10,
-                show_copy_button=True
-            )
-        with gr.Column():
-            citations_output = gr.Markdown(label="Citations & Sources")
-    # Event handlers
-    submit_btn.click(
-        fn=chat_interface,
-        inputs=[question_input, show_context_checkbox],
-        outputs=[answer_output, citations_output]
-    )
-    question_input.submit(
         fn=chat_interface,
-        inputs=[question_input, show_context_checkbox],
-        outputs=[answer_output, citations_output]
-    )
-    gr.Examples(
-            examples = [
-                "What skills will be most in demand in the next 5–10 years?",
-                "What careers will be most affected by AI?",
-                "How can I work on the world's most pressing problems?",
-                "How do I figure out what I want to do with my life?",
-            ],
-        inputs=question_input
     )
 # --- Launch Logic ---

 import os
 from query import ask
 print("🚀 Starting 80,000 Hours RAG system...")
 from query import is_model_ready
 print("✅ App ready! Model loading in background...")
+def chat_interface(message: str, history):
+    """Process question and return formatted response for chatbot.
+    Args:
+        message: User's question (string or dict with 'content' key)
+        history: Chat history (list of message dicts with 'role' and 'content')
+    Returns:
+        Formatted response with answer and citations
+    """
+    # Handle both string and dict message formats
+    if isinstance(message, dict):
+        message = message.get('text', message.get('content', ''))
+    if not message or not message.strip():
+        return ""
+    result = ask(message, show_context=False)
+    # Format response: answer first, then divider, then citations
+    response = result["answer"]
+    # Add citations after divider
     if result["citations"]:
+        response += "\n\n---\n\n**Citations:**\n\n"
+        for i, citation in enumerate(result["citations"], 1):
+            # Replace bullet points in citation text with newline + bullet icon
+            response += f"**[{i}]** [{citation['title']}]({citation['url']})\n\n"
+    return response
 # --- Build Gradio UI ---
+with gr.Blocks(title="80,000 Hours Q&A", theme=gr.themes.Soft(), css="""
+    footer {display: none !important;}
+    .examples button {
+        background: linear-gradient(to bottom, #ffffff, #f8f9fa) !important;
+        border: 2px solid #dee2e6 !important;
+        border-radius: 8px !important;
+        padding: 12px 16px !important;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.05) !important;
+        transition: all 0.2s ease !important;
+    }
+    .examples button:hover {
+        border-color: #adb5bd !important;
+        box-shadow: 0 4px 8px rgba(0,0,0,0.1) !important;
+        transform: translateY(-1px) !important;
+    }
+""") as demo:
+    # Title section
+    gr.Markdown("# 80,000 Hours Q&A")
+    gr.Markdown("*Ask questions about career planning and get answers backed by citations from 80,000 Hours articles.*")
+    gr.ChatInterface(
         fn=chat_interface,
+        type="messages",
+        chatbot=gr.Chatbot(
+            height=400,
+            show_copy_button=True,
+            render_markdown=True,
+            layout="bubble",
+            type="messages"
+        ),
+        textbox=gr.MultimodalTextbox(
+            placeholder="Ask about career planning...",
+            show_label=False,
+            submit_btn=True,
+            sources=[]
+        ),
+        examples=[
+            "What skills will be most in demand in the next 5–10 years?",
+            "How can I work on the world's most pressing problems?",
+            "How do I figure out what I want to do with my life?",
+        ]
     )
 # --- Launch Logic ---

citations.py CHANGED Viewed

@@ -144,33 +144,93 @@ def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any])
         "validation_errors": validation_errors
     }
-def _expand_to_word_boundaries(text: str, start: int, end: int) -> Tuple[int, int]:
-    """Expand alignment boundaries to include complete words.
-    Handles hyphenated words (e.g., "long-term"), contractions (e.g., "don't"),
-    and possessives (e.g., "company's").
     Args:
-        text: The full source text
-        start: Start position from alignment
-        end: End position from alignment
     Returns:
-        Tuple of (expanded_start, expanded_end)
     """
-    def is_word_char(char: str) -> bool:
-        """Check if character is part of a word (alphanumeric, hyphen, or apostrophe)."""
-        return char.isalnum() or char in ("-", "'")
-    # Expand start backward to beginning of word
-    while start > 0 and is_word_char(text[start - 1]):
-        start -= 1
-    # Expand end forward to end of word
-    while end < len(text) and is_word_char(text[end]):
-        end += 1
-    return start, end
 def _build_valid_result(quote: str, chunk: Any, chunk_id: int, score: float,
                         matched_text: str, remapped: bool = False) -> Dict[str, Any]:
@@ -208,35 +268,42 @@ def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> D
             "source_text": None
         }
     # Step 1: Check the AI's cited source first (fast path)
     source_text = source_chunks[source_id - 1].payload['text']
-    primary_alignment = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=FUZZY_THRESHOLD)
-    if primary_alignment:
-        # Expand to word boundaries to avoid cutting off partial words
-        start, end = _expand_to_word_boundaries(source_text, primary_alignment.dest_start, primary_alignment.dest_end)
         matched_text = source_text[start:end].strip()
-        return _build_valid_result(quote, source_chunks[source_id - 1], source_id, primary_alignment.score, matched_text)
     # Step 2: Search other sources for remapping (AI cited wrong source)
     for idx, chunk in enumerate(source_chunks, 1):
         if idx == source_id:
             continue  # Already checked
-        other_alignment = fuzz.partial_ratio_alignment(quote, chunk.payload['text'], score_cutoff=FUZZY_THRESHOLD)
-        if other_alignment:
-            # Expand to word boundaries to avoid cutting off partial words
-            start, end = _expand_to_word_boundaries(chunk.payload['text'], other_alignment.dest_start, other_alignment.dest_end)
             matched_text = chunk.payload['text'][start:end].strip()
-            return _build_valid_result(quote, chunk, idx, other_alignment.score, matched_text, remapped=True)
     # Validation failed - find closest match for debugging
     matched_text = ""
     actual_score = 0
     try:
-        debug_alignment = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=70)
-        if debug_alignment:
-            matched_text = source_text[debug_alignment.dest_start:debug_alignment.dest_end].strip()
-            actual_score = debug_alignment.score
     except:
         pass

         "validation_errors": validation_errors
     }
+def _is_word_char(char: str) -> bool:
+    """Check if character is part of a word (alphanumeric, comma, hyphen, apostrophe)."""
+    return char.isalnum() or char in (',', '-', "'", "'")
+def _find_best_match_position(quote: str, source_text: str, alignment_hint=None) -> Tuple[int, int, float]:
+    """Find the best matching position for a quote in source text using sliding window.
+    This method is better than partial_ratio_alignment because it:
+    1. Uses word boundaries naturally
+    2. Finds the best matching substring at the token level
+    3. Returns positions that align with actual text segments
     Args:
+        quote: The text to find
+        source_text: The text to search in
+        alignment_hint: Optional alignment result from partial_ratio_alignment to focus search
     Returns:
+        Tuple of (start_pos, end_pos, score). Returns (-1, -1, 0) if no good match.
     """
+    import re
+    # Normalize whitespace for matching
+    quote_normalized = ' '.join(quote.split())
+    # Split source into words with their positions
+    # This regex splits on whitespace while preserving positions
+    word_pattern = re.compile(r'\S+')
+    source_words = []
+    for match in word_pattern.finditer(source_text):
+        source_words.append({
+            'word': match.group(),
+            'start': match.start(),
+            'end': match.end()
+        })
+    quote_words = quote_normalized.split()
+    if not quote_words or not source_words:
+        return -1, -1, 0
+    # Determine search range based on alignment hint
+    if alignment_hint:
+        # Find which word index contains the alignment position
+        center_word_idx = 0
+        for idx, word_info in enumerate(source_words):
+            if word_info['start'] <= alignment_hint.dest_start < word_info['end']:
+                center_word_idx = idx
+                break
+        # Search within +/- 5 words of the hint position
+        search_start_idx = max(0, center_word_idx - 5)
+        search_end_idx = min(len(source_words), center_word_idx + len(quote_words) + 5)
+    else:
+        # No hint found, search entire text (fallback)
+        search_start_idx = 0
+        search_end_idx = len(source_words)
+    best_score = 0
+    best_start = -1
+    best_end = -1
+    # Try different window sizes around the quote length
+    # Quote should never be longer than source, so only check smaller windows
+    min_window = max(1, len(quote_words) - 3)
+    max_window = min(search_end_idx - search_start_idx, len(quote_words))
+    for window_size in range(min_window, max_window + 1):
+        for i in range(search_start_idx, min(search_end_idx - window_size + 1, len(source_words) - window_size + 1)):
+            # Get window of words
+            window_words = [source_words[j]['word'] for j in range(i, i + window_size)]
+            window_text = ' '.join(window_words)
+            # Calculate similarity score
+            score = fuzz.ratio(quote_normalized, window_text)
+            if score > best_score:
+                best_score = score
+                # Use the start of the first word and end of the last word
+                best_start = source_words[i]['start']
+                best_end = source_words[i + window_size - 1]['end']
+                # Strip trailing punctuation from the end position
+                while best_end > best_start and source_text[best_end - 1] in '.,;:!?)':
+                    best_end -= 1
+    return best_start, best_end, best_score
 def _build_valid_result(quote: str, chunk: Any, chunk_id: int, score: float,
                         matched_text: str, remapped: bool = False) -> Dict[str, Any]:
             "source_text": None
         }
+    # If quote contains ellipsis, only match the part before it
+    if '...' in quote:
+        quote = quote.split('...')[0].strip()
     # Step 1: Check the AI's cited source first (fast path)
     source_text = source_chunks[source_id - 1].payload['text']
+    # Get alignment hint from partial_ratio_alignment
+    alignment_hint = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=70)
+    start, end, score = _find_best_match_position(quote, source_text, alignment_hint)
+    if score >= FUZZY_THRESHOLD and start != -1:
         matched_text = source_text[start:end].strip()
+        return _build_valid_result(quote, source_chunks[source_id - 1], source_id, score, matched_text)
     # Step 2: Search other sources for remapping (AI cited wrong source)
     for idx, chunk in enumerate(source_chunks, 1):
         if idx == source_id:
             continue  # Already checked
+        # Get alignment hint for this chunk
+        alignment_hint = fuzz.partial_ratio_alignment(quote, chunk.payload['text'], score_cutoff=70)
+        start, end, score = _find_best_match_position(quote, chunk.payload['text'], alignment_hint)
+        if score >= FUZZY_THRESHOLD and start != -1:
             matched_text = chunk.payload['text'][start:end].strip()
+            return _build_valid_result(quote, chunk, idx, score, matched_text, remapped=True)
     # Validation failed - find closest match for debugging
     matched_text = ""
     actual_score = 0
     try:
+        debug_hint = fuzz.partial_ratio_alignment(quote, source_text, score_cutoff=60)
+        debug_start, debug_end, debug_score = _find_best_match_position(quote, source_text, debug_hint)
+        if debug_score >= 70 and debug_start != -1:
+            matched_text = source_text[debug_start:debug_end].strip()
+            actual_score = debug_score
     except:
         pass

query.py CHANGED Viewed

@@ -137,7 +137,7 @@ def generate_answer_with_citations(
         CRITICAL RULES:
         1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
-        2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
         3. source_id MUST match the source number: [Source 1] → source_id: 1, [Source 5] → source_id: 5
         4. Each quote must be complete sentences from the source
@@ -303,7 +303,7 @@ def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
     print(f"[TIMING] Total: {total_time:.0f}ms")
     # Display results
-    display_results(question, result, context if show_context else None)
     # Save debug output
     save_validation_results(question, result, results, 0)

         CRITICAL RULES:
         1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
+        2. Copy quotes EXACTLY - No changes, NO ellipses, No paraphrasing
         3. source_id MUST match the source number: [Source 1] → source_id: 1, [Source 5] → source_id: 5
         4. Each quote must be complete sentences from the source
     print(f"[TIMING] Total: {total_time:.0f}ms")
     # Display results
+    # display_results(question, result, context if show_context else None)
     # Save debug output
     save_validation_results(question, result, results, 0)