Spaces:
Sleeping
Sleeping
Upload src\retrieval.py with huggingface_hub
Browse files- src//retrieval.py +60 -0
src//retrieval.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Retrieval system for context-aware question answering
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
from src.models import ContextUnit, RetrievalResult, QueryRequest
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ContextBuilder:
    """Builds context windows for LLM queries."""

    # Placeholder intent marking a unit whose intent has not been inferred yet;
    # such intents are left out of the rendered context.
    _PENDING_INTENT = "[Pending intent inference]"
    # Maximum number of characters of a cell's source included per unit.
    _MAX_SOURCE_CHARS = 500
    # Rough characters-per-token ratio used to turn max_tokens into a
    # character budget.
    _CHARS_PER_TOKEN = 4

    @staticmethod
    def build_context_for_llm(units: List[ContextUnit], query: str, max_tokens: int = 3000) -> str:
        """Build the context string handed to the LLM.

        Each unit is rendered as a small section: a header with the cell id
        and type, the intent (skipped when empty or still the pending
        placeholder), the first 500 characters of the cell source, and the
        dependencies (skipped when empty). Every section ends with a blank
        line and sections are joined with a newline.

        Args:
            units: Context units to render, in the order they should appear.
            query: The user query. Currently unused; kept so the signature
                stays stable for callers.
            max_tokens: Approximate token budget. The text is cut at
                ``max_tokens * 4`` characters and ``"..."`` is appended when
                it is longer than that. Non-positive values are treated as a
                budget of zero.

        Returns:
            The rendered (and possibly truncated) context string; an empty
            string when ``units`` is empty.
        """
        sections = []
        for unit in units:
            cell = unit.cell
            lines = [f"Cell {cell.cell_id} ({cell.cell_type}):"]
            if unit.intent and unit.intent != ContextBuilder._PENDING_INTENT:
                lines.append(f"Intent: {unit.intent}")
            lines.append(f"Content: {cell.source[:ContextBuilder._MAX_SOURCE_CHARS]}")
            if unit.dependencies:
                # str() keeps the join from failing on non-string ids.
                joined_deps = ", ".join(str(dep) for dep in unit.dependencies)
                lines.append(f"Dependencies: {joined_deps}")
            # Each line is newline-terminated and the section is closed by an
            # extra blank line.
            sections.append("\n".join(lines) + "\n\n")

        context = "\n".join(sections)

        # Clamp at zero: a negative budget would otherwise become a negative
        # slice bound and drop characters from the end instead of truncating.
        char_budget = max(max_tokens, 0) * ContextBuilder._CHARS_PER_TOKEN
        if len(context) > char_budget:
            context = context[:char_budget] + "..."

        return context
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class RetrievalEngine:
    """Main retrieval engine."""

    def __init__(self, context_thread, indexer):
        """Store the collaborators used for retrieval.

        Args:
            context_thread: Kept on the instance; not used by ``retrieve``
                in this class yet.
            indexer: Object exposing ``search_units(query, k=...)`` that
                yields ``(unit, score)`` pairs.
        """
        self.context_thread = context_thread
        self.indexer = indexer

    def retrieve(self, query: str, top_k: int = 5) -> RetrievalResult:
        """Retrieve relevant context units.

        Args:
            query: Text to search for.
            top_k: Number of results requested from the indexer (passed as
                ``k``).

        Returns:
            A ``RetrievalResult`` holding the units, their scores in the
            same order, and the original query.
        """
        # Materialise the hits once: if the indexer returns a one-shot
        # iterator, walking it a second time would yield nothing and the
        # scores list would silently come back empty.
        hits = list(self.indexer.search_units(query, k=top_k))

        units = [unit for unit, _score in hits]
        scores = [score for _unit, score in hits]

        # For now only semantic results are returned; a full implementation
        # would combine them with structural retrieval.
        return RetrievalResult(
            units=units,
            scores=scores,
            query=query,
        )
|