Spaces:
Sleeping
Sleeping
Ryan committed on
Commit ·
c759ad8
1
Parent(s): 83175f3
- better formatting and debug
Browse files
- app.py +22 -2
- citations.py +46 -50
- query.py +21 -4
- test_url.py +18 -0
app.py
CHANGED
|
@@ -19,15 +19,35 @@ def chat_interface(question: str, show_context: bool = False):
|
|
| 19 |
for i, citation in enumerate(result["citations"], 1):
|
| 20 |
# Use matched_text (actual source text) instead of AI's quote
|
| 21 |
display_text = citation.get('matched_text', citation['quote'])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
citations_text += f"**[{i}]** {citation['title']}\n\n"
|
| 23 |
citations_text += f"> \"{display_text}\"\n\n"
|
| 24 |
citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
|
| 25 |
|
| 26 |
# Add validation warnings if any
|
| 27 |
if result.get("validation_errors"):
|
| 28 |
-
citations_text += "\n⚠️
|
| 29 |
for error in result["validation_errors"]:
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Add stats
|
| 33 |
if result["citations"]:
|
|
|
|
| 19 |
for i, citation in enumerate(result["citations"], 1):
|
| 20 |
# Use matched_text (actual source text) instead of AI's quote
|
| 21 |
display_text = citation.get('matched_text', citation['quote'])
|
| 22 |
+
# Replace markdown bullets with bullet character for display in quote block
|
| 23 |
+
display_text = display_text.replace('\n- ', '\n• ')
|
| 24 |
+
if display_text.startswith('- '):
|
| 25 |
+
display_text = '\n• ' + display_text[2:]
|
| 26 |
citations_text += f"**[{i}]** {citation['title']}\n\n"
|
| 27 |
citations_text += f"> \"{display_text}\"\n\n"
|
| 28 |
citations_text += f"🔗 [View highlighted quote on 80,000 Hours →]({citation['url']})\n\n"
|
| 29 |
|
| 30 |
# Add validation warnings if any
|
| 31 |
if result.get("validation_errors"):
|
| 32 |
+
citations_text += "\n---\n\n### ⚠️ Validation Warnings\n\n"
|
| 33 |
for error in result["validation_errors"]:
|
| 34 |
+
fuzzy_score = error.get('fuzzy_match_score', 0)
|
| 35 |
+
citations_text += f"**[{error['citation_id']}]** {error['reason']}\n\n"
|
| 36 |
+
|
| 37 |
+
# Format claimed quote (stored as 'quote' in validation result)
|
| 38 |
+
claimed_quote = error.get('quote', '')
|
| 39 |
+
claimed_quote = claimed_quote.replace('\n- ', '\n• ')
|
| 40 |
+
if claimed_quote.startswith('- '):
|
| 41 |
+
claimed_quote = '\n• ' + claimed_quote[2:]
|
| 42 |
+
citations_text += f"**AI's claimed quote:**\n> \"{claimed_quote}\"\n\n"
|
| 43 |
+
|
| 44 |
+
# Format matched text from source
|
| 45 |
+
if error.get('matched_text'):
|
| 46 |
+
matched_text = error['matched_text']
|
| 47 |
+
matched_text = matched_text.replace('\n- ', '\n• ')
|
| 48 |
+
if matched_text.startswith('- '):
|
| 49 |
+
matched_text = '\n• ' + matched_text[2:]
|
| 50 |
+
citations_text += f"**Closest match in actual source** ({fuzzy_score:.1f}% fuzzy match):\n> \"{matched_text}\"\n\n"
|
| 51 |
|
| 52 |
# Add stats
|
| 53 |
if result["citations"]:
|
citations.py
CHANGED
|
@@ -10,7 +10,7 @@ from rapidfuzz import fuzz
|
|
| 10 |
from fuzzysearch import find_near_matches
|
| 11 |
|
| 12 |
|
| 13 |
-
FUZZY_THRESHOLD =
|
| 14 |
|
| 15 |
def find_best_match_substring(quote: str, source_text: str) -> str:
|
| 16 |
"""Find the actual matching substring in source_text.
|
|
@@ -56,23 +56,27 @@ def create_highlighted_url(base_url: str, quote_text: str) -> str:
|
|
| 56 |
Returns:
|
| 57 |
URL with text fragment
|
| 58 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
# Extract a meaningful snippet (first ~80 chars work better for text fragments)
|
| 60 |
# Cut at word boundaries to avoid breaking words mid-way
|
| 61 |
max_length = 80
|
| 62 |
-
if len(
|
| 63 |
# Find the last space before the cutoff
|
| 64 |
-
text_fragment =
|
| 65 |
last_space = text_fragment.rfind(' ')
|
| 66 |
if last_space > 0: # If we found a space, cut there
|
| 67 |
text_fragment = text_fragment[:last_space]
|
| 68 |
else:
|
| 69 |
-
text_fragment =
|
| 70 |
|
| 71 |
text_fragment = text_fragment.strip()
|
| 72 |
|
| 73 |
-
# Encode everything for maximum compatibility
|
| 74 |
-
# quote() with safe='' still preserves unreserved chars (- . _ ~)
|
| 75 |
-
# So we manually encode those too
|
| 76 |
encoded_text = quote(text_fragment, safe='')
|
| 77 |
# Manually encode the unreserved chars that quote() preserves
|
| 78 |
encoded_text = encoded_text.replace('-', '%2D')
|
|
@@ -123,7 +127,7 @@ def build_citation_entry(citation: Dict[str, Any], validation_result: Dict[str,
|
|
| 123 |
Returns:
|
| 124 |
Complete citation entry with URL and metadata
|
| 125 |
"""
|
| 126 |
-
matched_text = validation_result
|
| 127 |
highlighted_url = create_highlighted_url(
|
| 128 |
validation_result["url"],
|
| 129 |
matched_text
|
|
@@ -135,10 +139,9 @@ def build_citation_entry(citation: Dict[str, Any], validation_result: Dict[str,
|
|
| 135 |
"matched_text": matched_text, # Actual text from source
|
| 136 |
"title": validation_result["title"],
|
| 137 |
"url": highlighted_url,
|
| 138 |
-
"
|
|
|
|
| 139 |
}
|
| 140 |
-
if validation_result.get("remapped"):
|
| 141 |
-
citation_entry["remapped_from"] = validation_result["original_source_id"]
|
| 142 |
return citation_entry
|
| 143 |
|
| 144 |
def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any]) -> Dict[str, Any]:
|
|
@@ -165,18 +168,32 @@ def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any])
|
|
| 165 |
citation_entry = build_citation_entry(citation, validation_result)
|
| 166 |
validated_citations.append(citation_entry)
|
| 167 |
else:
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
"claimed_quote": quote,
|
| 172 |
-
"source_text": validation_result.get('source_text')
|
| 173 |
-
})
|
| 174 |
|
| 175 |
return {
|
| 176 |
"validated_citations": validated_citations,
|
| 177 |
"validation_errors": validation_errors
|
| 178 |
}
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> Dict[str, Any]:
|
| 181 |
"""Validate that a quote exists in the specified source chunk.
|
| 182 |
|
|
@@ -197,51 +214,30 @@ def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> D
|
|
| 197 |
"source_text": None
|
| 198 |
}
|
| 199 |
|
| 200 |
-
|
| 201 |
-
|
| 202 |
# Step 1: Check claimed source first (fast path)
|
| 203 |
source_text = source_chunks[source_id - 1].payload['text']
|
| 204 |
claimed_score = fuzz.partial_ratio(quote, source_text)
|
| 205 |
|
| 206 |
if claimed_score >= FUZZY_THRESHOLD:
|
| 207 |
-
|
| 208 |
-
matched_substring = find_best_match_substring(quote, source_chunks[source_id - 1].payload['text'])
|
| 209 |
-
return {
|
| 210 |
-
"valid": True,
|
| 211 |
-
"quote": quote,
|
| 212 |
-
"matched_text": matched_substring, # The actual matching text from 80k Hours
|
| 213 |
-
"source_id": source_id,
|
| 214 |
-
"title": source_chunks[source_id - 1].payload['title'],
|
| 215 |
-
"url": source_chunks[source_id - 1].payload['url'],
|
| 216 |
-
"similarity_score": claimed_score
|
| 217 |
-
}
|
| 218 |
|
|
|
|
| 219 |
for idx, chunk in enumerate(source_chunks, 1):
|
| 220 |
if idx == source_id:
|
| 221 |
continue # Already checked
|
| 222 |
score = fuzz.partial_ratio(quote, chunk.payload['text'])
|
| 223 |
if score >= FUZZY_THRESHOLD:
|
| 224 |
-
|
| 225 |
-
matched_substring = find_best_match_substring(quote, chunk.payload['text'])
|
| 226 |
-
return {
|
| 227 |
-
"valid": True,
|
| 228 |
-
"quote": quote,
|
| 229 |
-
"matched_text": matched_substring, # The actual matching text from 80k Hours
|
| 230 |
-
"source_id": idx,
|
| 231 |
-
"title": chunk.payload['title'],
|
| 232 |
-
"url": chunk.payload['url'],
|
| 233 |
-
"similarity_score": score,
|
| 234 |
-
"remapped": True,
|
| 235 |
-
"original_source_id": source_id
|
| 236 |
-
}
|
| 237 |
|
| 238 |
-
# Validation failed -
|
|
|
|
| 239 |
return {
|
| 240 |
"valid": False,
|
| 241 |
"quote": quote,
|
| 242 |
"source_id": source_id,
|
| 243 |
-
"reason": f"Quote not found in any source (claimed source: {claimed_score:.1f}%
|
| 244 |
-
"
|
|
|
|
| 245 |
}
|
| 246 |
|
| 247 |
|
|
@@ -263,12 +259,12 @@ def format_citations_display(citations: List[Dict[str, Any]]) -> str:
|
|
| 263 |
citation_parts = []
|
| 264 |
for cit in sorted_citations:
|
| 265 |
marker = f"[{cit['citation_id']}]"
|
| 266 |
-
score = cit.get('
|
| 267 |
|
| 268 |
-
if cit.get('
|
| 269 |
-
note = f" ({score:.1f}% match, remapped
|
| 270 |
else:
|
| 271 |
-
note = f" ({score:.1f}% match)"
|
| 272 |
|
| 273 |
citation_parts.append(
|
| 274 |
f"{marker} {cit['title']}{note}\n"
|
|
|
|
| 10 |
from fuzzysearch import find_near_matches
|
| 11 |
|
| 12 |
|
| 13 |
+
FUZZY_THRESHOLD = 90
|
| 14 |
|
| 15 |
def find_best_match_substring(quote: str, source_text: str) -> str:
|
| 16 |
"""Find the actual matching substring in source_text.
|
|
|
|
| 56 |
Returns:
|
| 57 |
URL with text fragment
|
| 58 |
"""
|
| 59 |
+
# Take only the first line/paragraph (text fragments can't match across elements)
|
| 60 |
+
first_line = quote_text.split('\n')[0].strip()
|
| 61 |
+
|
| 62 |
+
# Remove bullet point markers (they're formatting, not content)
|
| 63 |
+
if first_line.startswith('- '):
|
| 64 |
+
first_line = first_line[2:].strip()
|
| 65 |
+
|
| 66 |
# Extract a meaningful snippet (first ~80 chars work better for text fragments)
|
| 67 |
# Cut at word boundaries to avoid breaking words mid-way
|
| 68 |
max_length = 80
|
| 69 |
+
if len(first_line) > max_length:
|
| 70 |
# Find the last space before the cutoff
|
| 71 |
+
text_fragment = first_line[:max_length]
|
| 72 |
last_space = text_fragment.rfind(' ')
|
| 73 |
if last_space > 0: # If we found a space, cut there
|
| 74 |
text_fragment = text_fragment[:last_space]
|
| 75 |
else:
|
| 76 |
+
text_fragment = first_line
|
| 77 |
|
| 78 |
text_fragment = text_fragment.strip()
|
| 79 |
|
|
|
|
|
|
|
|
|
|
| 80 |
encoded_text = quote(text_fragment, safe='')
|
| 81 |
# Manually encode the unreserved chars that quote() preserves
|
| 82 |
encoded_text = encoded_text.replace('-', '%2D')
|
|
|
|
| 127 |
Returns:
|
| 128 |
Complete citation entry with URL and metadata
|
| 129 |
"""
|
| 130 |
+
matched_text = validation_result["matched_text"]
|
| 131 |
highlighted_url = create_highlighted_url(
|
| 132 |
validation_result["url"],
|
| 133 |
matched_text
|
|
|
|
| 139 |
"matched_text": matched_text, # Actual text from source
|
| 140 |
"title": validation_result["title"],
|
| 141 |
"url": highlighted_url,
|
| 142 |
+
"fuzzy_match_score": validation_result["fuzzy_match_score"],
|
| 143 |
+
"remapped": validation_result.get("remapped", False)
|
| 144 |
}
|
|
|
|
|
|
|
| 145 |
return citation_entry
|
| 146 |
|
| 147 |
def process_citations(citations: List[Dict[str, Any]], source_chunks: List[Any]) -> Dict[str, Any]:
|
|
|
|
| 168 |
citation_entry = build_citation_entry(citation, validation_result)
|
| 169 |
validated_citations.append(citation_entry)
|
| 170 |
else:
|
| 171 |
+
# Add citation_id to validation result for tracking
|
| 172 |
+
validation_result["citation_id"] = citation_id
|
| 173 |
+
validation_errors.append(validation_result)
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
return {
|
| 176 |
"validated_citations": validated_citations,
|
| 177 |
"validation_errors": validation_errors
|
| 178 |
}
|
| 179 |
|
| 180 |
+
def _build_valid_result(quote: str, chunk: Any, chunk_id: int, score: float,
|
| 181 |
+
remapped: bool = False) -> Dict[str, Any]:
|
| 182 |
+
"""Build a valid citation result dict."""
|
| 183 |
+
matched_substring = find_best_match_substring(quote, chunk.payload['text'])
|
| 184 |
+
result = {
|
| 185 |
+
"valid": True,
|
| 186 |
+
"quote": quote,
|
| 187 |
+
"matched_text": matched_substring,
|
| 188 |
+
"source_id": chunk_id,
|
| 189 |
+
"title": chunk.payload['title'],
|
| 190 |
+
"url": chunk.payload['url'],
|
| 191 |
+
"fuzzy_match_score": score
|
| 192 |
+
}
|
| 193 |
+
if remapped:
|
| 194 |
+
result["remapped"] = True
|
| 195 |
+
return result
|
| 196 |
+
|
| 197 |
def validate_citation(quote: str, source_chunks: List[Any], source_id: int) -> Dict[str, Any]:
|
| 198 |
"""Validate that a quote exists in the specified source chunk.
|
| 199 |
|
|
|
|
| 214 |
"source_text": None
|
| 215 |
}
|
| 216 |
|
|
|
|
|
|
|
| 217 |
# Step 1: Check claimed source first (fast path)
|
| 218 |
source_text = source_chunks[source_id - 1].payload['text']
|
| 219 |
claimed_score = fuzz.partial_ratio(quote, source_text)
|
| 220 |
|
| 221 |
if claimed_score >= FUZZY_THRESHOLD:
|
| 222 |
+
return _build_valid_result(quote, source_chunks[source_id - 1], source_id, claimed_score)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
+
# Step 2: Search all other sources for remapping
|
| 225 |
for idx, chunk in enumerate(source_chunks, 1):
|
| 226 |
if idx == source_id:
|
| 227 |
continue # Already checked
|
| 228 |
score = fuzz.partial_ratio(quote, chunk.payload['text'])
|
| 229 |
if score >= FUZZY_THRESHOLD:
|
| 230 |
+
return _build_valid_result(quote, chunk, idx, score, remapped=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
+
# Validation failed - find closest match for debugging
|
| 233 |
+
matched_text = find_best_match_substring(quote, source_text)
|
| 234 |
return {
|
| 235 |
"valid": False,
|
| 236 |
"quote": quote,
|
| 237 |
"source_id": source_id,
|
| 238 |
+
"reason": f"Quote not found in any source (claimed source: {claimed_score:.1f}% fuzzy match)",
|
| 239 |
+
"matched_text": matched_text,
|
| 240 |
+
"fuzzy_match_score": claimed_score
|
| 241 |
}
|
| 242 |
|
| 243 |
|
|
|
|
| 259 |
citation_parts = []
|
| 260 |
for cit in sorted_citations:
|
| 261 |
marker = f"[{cit['citation_id']}]"
|
| 262 |
+
score = cit.get('fuzzy_match_score', 100)
|
| 263 |
|
| 264 |
+
if cit.get('remapped'):
|
| 265 |
+
note = f" ({score:.1f}% fuzzy match, remapped)"
|
| 266 |
else:
|
| 267 |
+
note = f" ({score:.1f}% fuzzy match)"
|
| 268 |
|
| 269 |
citation_parts.append(
|
| 270 |
f"{marker} {cit['title']}{note}\n"
|
query.py
CHANGED
|
@@ -17,6 +17,8 @@ SCORE_THRESHOLD = 0.4
|
|
| 17 |
|
| 18 |
def retrieve_context(question):
|
| 19 |
"""Retrieve relevant chunks from Qdrant."""
|
|
|
|
|
|
|
| 20 |
client = QdrantClient(
|
| 21 |
url=os.getenv("QDRANT_URL"),
|
| 22 |
api_key=os.getenv("QDRANT_API_KEY"),
|
|
@@ -32,6 +34,9 @@ def retrieve_context(question):
|
|
| 32 |
score_threshold=SCORE_THRESHOLD,
|
| 33 |
)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
| 35 |
return results.points
|
| 36 |
|
| 37 |
def format_context(results):
|
|
@@ -78,13 +83,17 @@ def generate_answer_with_citations(
|
|
| 78 |
STEP 2: Provide citations
|
| 79 |
- For each [N] in your answer, provide a citation with:
|
| 80 |
* citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
|
| 81 |
-
* source_id: Which source it came from (
|
| 82 |
* quote: Copy the EXACT sentences from that source, word-for-word
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
CRITICAL RULES:
|
| 85 |
1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
|
| 86 |
2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
|
| 87 |
-
3.
|
| 88 |
4. Each quote must be complete sentences from the source
|
| 89 |
|
| 90 |
OUTPUT FORMAT (valid JSON):
|
|
@@ -128,7 +137,7 @@ def generate_answer_with_citations(
|
|
| 128 |
parsed = parse_llm_response(response.choices[0].message.content)
|
| 129 |
if "validation_errors" in parsed:
|
| 130 |
return {
|
| 131 |
-
"answer": parsed["answer"],
|
| 132 |
"citations": [],
|
| 133 |
"validation_errors": parsed["validation_errors"],
|
| 134 |
"total_citations": 0,
|
|
@@ -139,7 +148,10 @@ def generate_answer_with_citations(
|
|
| 139 |
citations = parsed.get("citations", [])
|
| 140 |
|
| 141 |
# Validate citations
|
|
|
|
| 142 |
result = process_citations(citations, results)
|
|
|
|
|
|
|
| 143 |
|
| 144 |
return {
|
| 145 |
"answer": answer,
|
|
@@ -166,7 +178,7 @@ def save_validation_results(question: str, result: Dict[str, Any], results: List
|
|
| 166 |
"title": hit.payload['title'],
|
| 167 |
"url": hit.payload['url'],
|
| 168 |
"chunk_id": hit.payload.get('chunk_id'),
|
| 169 |
-
"
|
| 170 |
"text": hit.payload['text']
|
| 171 |
}
|
| 172 |
for i, hit in enumerate(results, 1)
|
|
@@ -212,6 +224,8 @@ def display_results(question: str, result: Dict[str, Any], context: str = None):
|
|
| 212 |
|
| 213 |
def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
|
| 214 |
"""Main RAG function: retrieve context and generate answer with validated citations."""
|
|
|
|
|
|
|
| 215 |
results = retrieve_context(question)
|
| 216 |
if not results:
|
| 217 |
print("No relevant sources found above the score threshold.")
|
|
@@ -232,6 +246,9 @@ def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
|
|
| 232 |
openai_api_key=os.getenv("OPENAI_API_KEY")
|
| 233 |
)
|
| 234 |
|
|
|
|
|
|
|
|
|
|
| 235 |
# Display results
|
| 236 |
display_results(question, result, context if show_context else None)
|
| 237 |
|
|
|
|
| 17 |
|
| 18 |
def retrieve_context(question):
|
| 19 |
"""Retrieve relevant chunks from Qdrant."""
|
| 20 |
+
start = time.time()
|
| 21 |
+
|
| 22 |
client = QdrantClient(
|
| 23 |
url=os.getenv("QDRANT_URL"),
|
| 24 |
api_key=os.getenv("QDRANT_API_KEY"),
|
|
|
|
| 34 |
score_threshold=SCORE_THRESHOLD,
|
| 35 |
)
|
| 36 |
|
| 37 |
+
elapsed = (time.time() - start) * 1000
|
| 38 |
+
print(f"[TIMING] Retrieval: {elapsed:.0f}ms")
|
| 39 |
+
|
| 40 |
return results.points
|
| 41 |
|
| 42 |
def format_context(results):
|
|
|
|
| 83 |
STEP 2: Provide citations
|
| 84 |
- For each [N] in your answer, provide a citation with:
|
| 85 |
* citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
|
| 86 |
+
* source_id: Which source it came from (match the [Source N] label exactly)
|
| 87 |
* quote: Copy the EXACT sentences from that source, word-for-word
|
| 88 |
|
| 89 |
+
EXAMPLE - If you found text in [Source 3]:
|
| 90 |
+
- Your answer: "Career capital helps you succeed [1]."
|
| 91 |
+
- Your citation: {"citation_id": 1, "source_id": 3, "quote": "Career capital includes..."}
|
| 92 |
+
|
| 93 |
CRITICAL RULES:
|
| 94 |
1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
|
| 95 |
2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
|
| 96 |
+
3. source_id MUST match the source number: [Source 1] → source_id: 1, [Source 5] → source_id: 5
|
| 97 |
4. Each quote must be complete sentences from the source
|
| 98 |
|
| 99 |
OUTPUT FORMAT (valid JSON):
|
|
|
|
| 137 |
parsed = parse_llm_response(response.choices[0].message.content)
|
| 138 |
if "validation_errors" in parsed:
|
| 139 |
return {
|
| 140 |
+
"answer": parsed["answer"], # raw llm response
|
| 141 |
"citations": [],
|
| 142 |
"validation_errors": parsed["validation_errors"],
|
| 143 |
"total_citations": 0,
|
|
|
|
| 148 |
citations = parsed.get("citations", [])
|
| 149 |
|
| 150 |
# Validate citations
|
| 151 |
+
validation_start = time.time()
|
| 152 |
result = process_citations(citations, results)
|
| 153 |
+
validation_time = (time.time() - validation_start) * 1000
|
| 154 |
+
print(f"[TIMING] Validation: {validation_time:.0f}ms")
|
| 155 |
|
| 156 |
return {
|
| 157 |
"answer": answer,
|
|
|
|
| 178 |
"title": hit.payload['title'],
|
| 179 |
"url": hit.payload['url'],
|
| 180 |
"chunk_id": hit.payload.get('chunk_id'),
|
| 181 |
+
"cosine_similarity": hit.score, # Vector similarity from Qdrant
|
| 182 |
"text": hit.payload['text']
|
| 183 |
}
|
| 184 |
for i, hit in enumerate(results, 1)
|
|
|
|
| 224 |
|
| 225 |
def ask(question: str, show_context: bool = False) -> Dict[str, Any]:
|
| 226 |
"""Main RAG function: retrieve context and generate answer with validated citations."""
|
| 227 |
+
total_start = time.time()
|
| 228 |
+
|
| 229 |
results = retrieve_context(question)
|
| 230 |
if not results:
|
| 231 |
print("No relevant sources found above the score threshold.")
|
|
|
|
| 246 |
openai_api_key=os.getenv("OPENAI_API_KEY")
|
| 247 |
)
|
| 248 |
|
| 249 |
+
total_time = (time.time() - total_start) * 1000
|
| 250 |
+
print(f"[TIMING] Total: {total_time:.0f}ms")
|
| 251 |
+
|
| 252 |
# Display results
|
| 253 |
display_results(question, result, context if show_context else None)
|
| 254 |
|
test_url.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Simple test for create_highlighted_url function."""
|
| 3 |
+
|
| 4 |
+
from urllib.parse import urlparse
|
| 5 |
+
from citations import create_highlighted_url
|
| 6 |
+
|
| 7 |
+
def extract_base_url(full_url: str) -> str:
|
| 8 |
+
"""Extract base URL without fragments."""
|
| 9 |
+
parsed = urlparse(full_url)
|
| 10 |
+
return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
|
| 11 |
+
|
| 12 |
+
# Paste your URL and text here
|
| 13 |
+
full_url = "https://80000hours.org/articles/future-generations/"
|
| 14 |
+
quote_text = '''- Risks from'''
|
| 15 |
+
|
| 16 |
+
base_url = extract_base_url(full_url)
|
| 17 |
+
result = create_highlighted_url(base_url, quote_text)
|
| 18 |
+
print(result)
|