Spaces:

Automaton9
/

80000_Hours_AI_Assistant

Sleeping

App Files Files Community

Ryan committed on Oct 11, 2025

Commit

c759ad8

1 Parent(s): 83175f3

- better formatting and debug

Browse files

Files changed (4) hide show

app.py +22 -2
citations.py +46 -50
query.py +21 -4
test_url.py +18 -0

app.py CHANGED Viewed

@@ -19,15 +19,35 @@ def chat_interface(question: str, show_context: bool = False):
         for i, citation in enumerate(result["citations"], 1):
             # Use matched_text (actual source text) instead of AI's quote
             display_text = citation.get('matched_text', citation['quote'])
             citations_text += f"**[{i}]** {citation['title']}\n\n"
             citations_text += f"> \"{display_text}\"\n\n"
             citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
     # Add validation warnings if any
     if result.get("validation_errors"):
-        citations_text += "\n⚠️ **Validation Warnings:**\n"
         for error in result["validation_errors"]:
-            citations_text += f"- {error}\n"
     # Add stats
     if result["citations"]:

         for i, citation in enumerate(result["citations"], 1):
             # Use matched_text (actual source text) instead of AI's quote
             display_text = citation.get('matched_text', citation['quote'])
+            # Replace markdown bullets with bullet character for display in quote block
+            display_text = display_text.replace('\n- ', '\n• ')
+            if display_text.startswith('- '):
+                display_text = '\n• ' + display_text[2:]
             citations_text += f"**[{i}]** {citation['title']}\n\n"
             citations_text += f"> \"{display_text}\"\n\n"
             citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
     # Add validation warnings if any
     if result.get("validation_errors"):
+        citations_text += "\n---\n\n### ⚠️ Validation Warnings\n\n"
         for error in result["validation_errors"]:
+            fuzzy_score = error.get('fuzzy_match_score', 0)
+            citations_text += f"**[{error['citation_id']}]** {error['reason']}\n\n"
+            # Format claimed quote (stored as 'quote' in validation result)
+            claimed_quote = error.get('quote', '')
+            claimed_quote = claimed_quote.replace('\n- ', '\n• ')
+            if claimed_quote.startswith('- '):
+                claimed_quote = '\n• ' + claimed_quote[2:]
+            citations_text += f"**AI's claimed quote:**\n> \"{claimed_quote}\"\n\n"
+            # Format matched text from source
+            if error.get('matched_text'):
+                matched_text = error['matched_text']
+                matched_text = matched_text.replace('\n- ', '\n• ')
+                if matched_text.startswith('- '):
+                    matched_text = '\n• ' + matched_text[2:]
+                citations_text += f"**Closest match in actual source** ({fuzzy_score:.1f}% fuzzy match):\n> \"{matched_text}\"\n\n"
     # Add stats
     if result["citations"]:

citations.py CHANGED Viewed

@@ -10,7 +10,7 @@ from rapidfuzz import fuzz
 from fuzzysearch import find_near_matches
-FUZZY_THRESHOLD = 95
 def find_best_match_substring(quote: str, source_text: str) -> str:
     """Find the actual matching substring in source_text.
@@ -56,23 +56,27 @@ def create_highlighted_url(base_url: str, quote_text: str) -> str:
     Returns:
         URL with text fragment
     """
     # Extract a meaningful snippet (first ~80 chars work better for text fragments)
     # Cut at word boundaries to avoid breaking words mid-way
     max_length = 80
-    if len(quote_text) > max_length:
         # Find the last space before the cutoff
-        text_fragment = quote_text[:max_length]
         last_space = text_fragment.rfind(' ')
         if last_space > 0:  # If we found a space, cut there
             text_fragment = text_fragment[:last_space]
     else:
-        text_fragment = quote_text
     text_fragment = text_fragment.strip()
-    # Encode everything for maximum compatibility
-    # quote() with safe='' still preserves unreserved chars (- . _ ~)
-    # So we manually encode those too
     encoded_text = quote(text_fragment, safe='')
     # Manually encode the unreserved chars that quote() preserves
     encoded_text = encoded_text.replace('-', '%2D')
@@ -123,7 +127,7 @@ def build_citation_entry(citation: Dict[str, Any], validation_result: Dict[str,
     Returns:
         Complete citation entry with URL and metadata
     """
-    matched_text = validation_result.get("matched_text", citation.get("quote", ""))
     highlighted_url = create_highlighted_url(
         validation_result["url"],
         matched_text
@@ -135,10 +139,9 @@ def build_citation_entry(citation: Dict[str, Any], validation_result: Dict[str,
         "matched_text": matched_text,  # Actual text from source
         "title": validation_result["title"],
         "url": highlighted_url,
-        "similarity_score": validation_result["similarity_score"]
     }
-    if validation_result.get("remapped"):
-        citation_entry["remapped_from"] = validation_result["original_source_id"]
     return citation_entry
 def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any]) -> Dict[str, Any]:
@@ -165,18 +168,32 @@ def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any])
             citation_entry = build_citation_entry(citation, validation_result)
             validated_citations.append(citation_entry)
         else:
-            validation_errors.append({
-                "citation_id": citation_id,
-                "reason": validation_result['reason'],
-                "claimed_quote": quote,
-                "source_text": validation_result.get('source_text')
-            })
     return {
         "validated_citations": validated_citations,
         "validation_errors": validation_errors
     }
 def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> Dict[str, Any]:
     """Validate that a quote exists in the specified source chunk.
@@ -197,51 +214,30 @@ def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> D
             "source_text": None
         }
     # Step 1: Check claimed source first (fast path)
     source_text = source_chunks[source_id - 1].payload['text']
     claimed_score = fuzz.partial_ratio(quote, source_text)
     if claimed_score >= FUZZY_THRESHOLD:
-        # Find the actual matching substring in the source
-        matched_substring = find_best_match_substring(quote, source_chunks[source_id - 1].payload['text'])
-        return {
-            "valid": True,
-            "quote": quote,
-            "matched_text": matched_substring,  # The actual matching text from 80k Hours
-            "source_id": source_id,
-            "title": source_chunks[source_id - 1].payload['title'],
-            "url": source_chunks[source_id - 1].payload['url'],
-            "similarity_score": claimed_score
-        }
     for idx, chunk in enumerate(source_chunks, 1):
         if idx == source_id:
             continue  # Already checked
         score = fuzz.partial_ratio(quote, chunk.payload['text'])
         if score >= FUZZY_THRESHOLD:
-            # Find the actual matching substring in the source
-            matched_substring = find_best_match_substring(quote, chunk.payload['text'])
-            return {
-                "valid": True,
-                "quote": quote,
-                "matched_text": matched_substring,  # The actual matching text from 80k Hours
-                "source_id": idx,
-                "title": chunk.payload['title'],
-                "url": chunk.payload['url'],
-                "similarity_score": score,
-                "remapped": True,
-                "original_source_id": source_id
-            }
-    # Validation failed - report best score from claimed source
     return {
         "valid": False,
         "quote": quote,
         "source_id": source_id,
-        "reason": f"Quote not found in any source (claimed source: {claimed_score:.1f}% similarity)",
-        "source_text": source_chunks[source_id - 1].payload['text']
     }
@@ -263,12 +259,12 @@ def format_citations_display(citations: List[Dict[str, Any]]) -> str:
     citation_parts = []
     for cit in sorted_citations:
         marker = f"[{cit['citation_id']}]"
-        score = cit.get('similarity_score', 100)
-        if cit.get('remapped_from'):
-            note = f" ({score:.1f}% match, remapped: source {cit['remapped_from']} → {cit['source_id']})"
         else:
-            note = f" ({score:.1f}% match)"
         citation_parts.append(
             f"{marker} {cit['title']}{note}\n"

 from fuzzysearch import find_near_matches
+FUZZY_THRESHOLD = 90
 def find_best_match_substring(quote: str, source_text: str) -> str:
     """Find the actual matching substring in source_text.
     Returns:
         URL with text fragment
     """
+    # Take only the first line/paragraph (text fragments can't match across elements)
+    first_line = quote_text.split('\n')[0].strip()
+    # Remove bullet point markers (they're formatting, not content)
+    if first_line.startswith('- '):
+        first_line = first_line[2:].strip()
     # Extract a meaningful snippet (first ~80 chars work better for text fragments)
     # Cut at word boundaries to avoid breaking words mid-way
     max_length = 80
+    if len(first_line) > max_length:
         # Find the last space before the cutoff
+        text_fragment = first_line[:max_length]
         last_space = text_fragment.rfind(' ')
         if last_space > 0:  # If we found a space, cut there
             text_fragment = text_fragment[:last_space]
     else:
+        text_fragment = first_line
     text_fragment = text_fragment.strip()
     encoded_text = quote(text_fragment, safe='')
     # Manually encode the unreserved chars that quote() preserves
     encoded_text = encoded_text.replace('-', '%2D')
     Returns:
         Complete citation entry with URL and metadata
     """
+    matched_text = validation_result["matched_text"]
     highlighted_url = create_highlighted_url(
         validation_result["url"],
         matched_text
         "matched_text": matched_text,  # Actual text from source
         "title": validation_result["title"],
         "url": highlighted_url,
+        "fuzzy_match_score": validation_result["fuzzy_match_score"],
+        "remapped": validation_result.get("remapped", False)
     }
     return citation_entry
 def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any]) -> Dict[str, Any]:
             citation_entry = build_citation_entry(citation, validation_result)
             validated_citations.append(citation_entry)
         else:
+            # Add citation_id to validation result for tracking
+            validation_result["citation_id"] = citation_id
+            validation_errors.append(validation_result)
     return {
         "validated_citations": validated_citations,
         "validation_errors": validation_errors
     }
+def _build_valid_result(quote: str, chunk: Any, chunk_id: int, score: float,
+                        remapped: bool = False) -> Dict[str, Any]:
+    """Build a valid citation result dict."""
+    matched_substring = find_best_match_substring(quote, chunk.payload['text'])
+    result = {
+        "valid": True,
+        "quote": quote,
+        "matched_text": matched_substring,
+        "source_id": chunk_id,
+        "title": chunk.payload['title'],
+        "url": chunk.payload['url'],
+        "fuzzy_match_score": score
+    }
+    if remapped:
+        result["remapped"] = True
+    return result
 def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> Dict[str, Any]:
     """Validate that a quote exists in the specified source chunk.
             "source_text": None
         }
     # Step 1: Check claimed source first (fast path)
     source_text = source_chunks[source_id - 1].payload['text']
     claimed_score = fuzz.partial_ratio(quote, source_text)
     if claimed_score >= FUZZY_THRESHOLD:
+        return _build_valid_result(quote, source_chunks[source_id - 1], source_id, claimed_score)
+    # Step 2: Search all other sources for remapping
     for idx, chunk in enumerate(source_chunks, 1):
         if idx == source_id:
             continue  # Already checked
         score = fuzz.partial_ratio(quote, chunk.payload['text'])
         if score >= FUZZY_THRESHOLD:
+            return _build_valid_result(quote, chunk, idx, score, remapped=True)
+    # Validation failed - find closest match for debugging
+    matched_text = find_best_match_substring(quote, source_text)
     return {
         "valid": False,
         "quote": quote,
         "source_id": source_id,
+        "reason": f"Quote not found in any source (claimed source: {claimed_score:.1f}% fuzzy match)",
+        "matched_text": matched_text,
+        "fuzzy_match_score": claimed_score
     }
     citation_parts = []
     for cit in sorted_citations:
         marker = f"[{cit['citation_id']}]"
+        score = cit.get('fuzzy_match_score', 100)
+        if cit.get('remapped'):
+            note = f" ({score:.1f}% fuzzy match, remapped)"
         else:
+            note = f" ({score:.1f}% fuzzy match)"
         citation_parts.append(
             f"{marker} {cit['title']}{note}\n"

query.py CHANGED Viewed

@@ -17,6 +17,8 @@ SCORE_THRESHOLD = 0.4
 def retrieve_context(question):
     """Retrieve relevant chunks from Qdrant."""
     client = QdrantClient(
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY"),
@@ -32,6 +34,9 @@ def retrieve_context(question):
         score_threshold=SCORE_THRESHOLD,
     )
     return results.points
 def format_context(results):
@@ -78,13 +83,17 @@ def generate_answer_with_citations(
         STEP 2: Provide citations
         - For each [N] in your answer, provide a citation with:
         * citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
-        * source_id: Which source it came from (see [Source N] in context below)
         * quote: Copy the EXACT sentences from that source, word-for-word
         CRITICAL RULES:
         1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
         2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
-        3. Match source_id to where you found the quote ([Source 1] → source_id: 1)
         4. Each quote must be complete sentences from the source
         OUTPUT FORMAT (valid JSON):
@@ -128,7 +137,7 @@ def generate_answer_with_citations(
     parsed = parse_llm_response(response.choices[0].message.content)
     if "validation_errors" in parsed:
         return {
-            "answer": parsed["answer"],
             "citations": [],
             "validation_errors": parsed["validation_errors"],
             "total_citations": 0,
@@ -139,7 +148,10 @@ def generate_answer_with_citations(
     citations = parsed.get("citations", [])
     # Validate citations
     result = process_citations(citations, results)
     return {
         "answer": answer,
@@ -166,7 +178,7 @@ def save_validation_results(question: str, result: Dict[str, Any], results: List
                 "title": hit.payload['title'],
                 "url": hit.payload['url'],
                 "chunk_id": hit.payload.get('chunk_id'),
-                "similarity_score": hit.score,
                 "text": hit.payload['text']
             }
             for i, hit in enumerate(results, 1)
@@ -212,6 +224,8 @@ def display_results(question: str, result: Dict[str, Any], context: str = None):
 def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
     """Main RAG function: retrieve context and generate answer with validated citations."""
     results = retrieve_context(question)
     if not results:
         print("No relevant sources found above the score threshold.")
@@ -232,6 +246,9 @@ def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
         openai_api_key=os.getenv("OPENAI_API_KEY")
     )
     # Display results
     display_results(question, result, context if show_context else None)

 def retrieve_context(question):
     """Retrieve relevant chunks from Qdrant."""
+    start = time.time()
     client = QdrantClient(
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY"),
         score_threshold=SCORE_THRESHOLD,
     )
+    elapsed = (time.time() - start) * 1000
+    print(f"[TIMING] Retrieval: {elapsed:.0f}ms")
     return results.points
 def format_context(results):
         STEP 2: Provide citations
         - For each [N] in your answer, provide a citation with:
         * citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
+        * source_id: Which source it came from (match the [Source N] label exactly)
         * quote: Copy the EXACT sentences from that source, word-for-word
+        EXAMPLE - If you found text in [Source 3]:
+        - Your answer: "Career capital helps you succeed [1]."
+        - Your citation: {"citation_id": 1, "source_id": 3, "quote": "Career capital includes..."}
         CRITICAL RULES:
         1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
         2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
+        3. source_id MUST match the source number: [Source 1] → source_id: 1, [Source 5] → source_id: 5
         4. Each quote must be complete sentences from the source
         OUTPUT FORMAT (valid JSON):
     parsed = parse_llm_response(response.choices[0].message.content)
     if "validation_errors" in parsed:
         return {
+            "answer": parsed["answer"], # raw llm response
             "citations": [],
             "validation_errors": parsed["validation_errors"],
             "total_citations": 0,
     citations = parsed.get("citations", [])
     # Validate citations
+    validation_start = time.time()
     result = process_citations(citations, results)
+    validation_time = (time.time() - validation_start) * 1000
+    print(f"[TIMING] Validation: {validation_time:.0f}ms")
     return {
         "answer": answer,
                 "title": hit.payload['title'],
                 "url": hit.payload['url'],
                 "chunk_id": hit.payload.get('chunk_id'),
+                "cosine_similarity": hit.score,  # Vector similarity from Qdrant
                 "text": hit.payload['text']
             }
             for i, hit in enumerate(results, 1)
 def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
     """Main RAG function: retrieve context and generate answer with validated citations."""
+    total_start = time.time()
     results = retrieve_context(question)
     if not results:
         print("No relevant sources found above the score threshold.")
         openai_api_key=os.getenv("OPENAI_API_KEY")
     )
+    total_time = (time.time() - total_start) * 1000
+    print(f"[TIMING] Total: {total_time:.0f}ms")
     # Display results
     display_results(question, result, context if show_context else None)

test_url.py ADDED Viewed

	@@ -0,0 +1,18 @@

+#!/usr/bin/env python3
+"""Simple test for create_highlighted_url function."""
+from urllib.parse import urlparse
+from citations import create_highlighted_url
+def extract_base_url(full_url: str) -> str:
+    """Extract base URL without fragments."""
+    parsed = urlparse(full_url)
+    return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
+# Paste your URL and text here
+full_url = "https://80000hours.org/articles/future-generations/"
+quote_text = '''- Risks from'''
+base_url = extract_base_url(full_url)
+result = create_highlighted_url(base_url, quote_text)
+print(result)