| """ |
| Context building for RAG pipeline. |
| |
| This module constructs optimized context from retrieved chunks for LLM consumption, |
| handling token limits and formatting. |
| """ |
|
|
| from typing import List, Optional |
| from dataclasses import dataclass |
| from src.config.settings import get_settings |
| from src.retrieval.retriever import RetrievedChunk |
| from src.utils.text_utils import count_tokens |
| from src.utils.logging import get_logger |
|
|
# Module-wide logger, obtained from the project's get_logger helper using this
# module's __name__.
logger = get_logger(__name__)
|
|
|
|
@dataclass
class RAGContext:
    """LLM-ready context text bundled with bookkeeping about its sources.

    Attributes:
        formatted_context: The context text to embed in the prompt.
        sources: One metadata dict per source included in the context.
        total_tokens: Token count recorded for the included context.
        num_chunks: Number of chunks that were included.
    """

    formatted_context: str
    sources: List[dict]
    total_tokens: int
    num_chunks: int
|
|
|
|
class ContextBuilder:
    """Build token-budgeted, citation-tagged context and prompts from retrieved chunks.

    Chunks are packed in the order given until the token budget is exhausted;
    each included chunk gets a ``[Source N: ...]`` header and a matching
    metadata entry so answers can cite it.
    """

    # Tokens withheld from the context limit before chunks are packed.
    # NOTE(review): presumably headroom for the system prompt, the question and
    # prompt scaffolding -- confirm the intended meaning of this reserve.
    RESERVED_TOKENS = 2000

    # Source types rendered with a "Web" / "Paper" label and a URL line.
    # Any other source type (including "local") gets a plain filename header.
    _WEB_SOURCE_TYPES = ("duckduckgo", "tavily")
    _PAPER_SOURCE_TYPES = ("arxiv", "semantic_scholar", "pubmed")

    # System prompt used when the caller does not supply one.
    DEFAULT_SYSTEM_PROMPT = (
        "You are a knowledgeable research assistant helping users understand documents in a PDF collection.\n"
        "\n"
        "Your task is to answer questions based on the provided context from the documents. Follow these guidelines:\n"
        "\n"
        "1. **Use the context**: Base your answers primarily on the information provided in the context sections.\n"
        "2. **Cite sources**: When referencing information, cite the source using [Source N] format.\n"
        "3. **Be accurate**: If the context doesn't contain enough information to fully answer the question, say so clearly.\n"
        "4. **Be comprehensive**: Synthesize information from multiple sources when relevant.\n"
        "5. **Be concise**: Provide clear, well-organized answers without unnecessary verbosity.\n"
        "\n"
        "If the question cannot be answered from the provided context, explain what information is missing and suggest what might help."
    )

    def __init__(self):
        """Initialize context builder with token limits read from settings."""
        settings = get_settings()
        self.max_context_tokens = settings.max_context_tokens
        # Stored for callers; no method in this class reads it.
        self.max_response_tokens = settings.max_response_tokens

    def _token_budget(self, max_tokens: Optional[int]) -> int:
        """
        Compute how many tokens may be spent on chunk text.

        Args:
            max_tokens: Caller-supplied context limit, or None to use the
                limit from settings. An explicit 0 is honored rather than
                being replaced by the default.

        Returns:
            int: The limit minus RESERVED_TOKENS (may be zero or negative).
        """
        limit = self.max_context_tokens if max_tokens is None else max_tokens
        return limit - self.RESERVED_TOKENS

    def build_context(
        self,
        chunks: List[RetrievedChunk],
        query: str,
        max_tokens: Optional[int] = None,
    ) -> RAGContext:
        """
        Build formatted context from retrieved chunks.

        Chunks are taken in the given order. Packing stops at the first chunk
        that would push the running total over the budget; later chunks are
        not considered even if they are smaller.

        Args:
            chunks: Retrieved chunks sorted by relevance
            query: Original user query (currently not used when packing)
            max_tokens: Maximum tokens for context (default from settings)

        Returns:
            RAGContext: Formatted context with metadata
        """
        available_tokens = self._token_budget(max_tokens)
        if available_tokens <= 0:
            # Surface misconfiguration instead of silently returning nothing.
            logger.warning(
                f"Non-positive context token budget ({available_tokens}); context will likely be empty"
            )

        context_parts = []
        sources = []
        total_tokens = 0

        for chunk in chunks:
            # Prefer the precomputed count; fall back to counting the text.
            chunk_tokens = chunk.token_count or count_tokens(chunk.text)

            if total_tokens + chunk_tokens > available_tokens:
                logger.debug(f"Token limit reached, stopping at {len(sources)} chunks")
                break

            # Citation indices are 1-based and follow inclusion order.
            index = len(sources) + 1
            context_parts.append(self._format_chunk(chunk, index))
            sources.append(self._source_entry(chunk, index))
            total_tokens += chunk_tokens

        included_chunks = len(sources)
        formatted_context = "\n\n".join(context_parts)

        logger.info(f"Built context: {included_chunks} chunks, {total_tokens} tokens")

        return RAGContext(
            formatted_context=formatted_context,
            sources=sources,
            total_tokens=total_tokens,
            num_chunks=included_chunks,
        )

    @staticmethod
    def _source_entry(chunk: RetrievedChunk, index: int) -> dict:
        """
        Build the citation metadata dict for one included chunk.

        Args:
            chunk: Retrieved chunk
            index: Citation index

        Returns:
            dict: Source metadata keyed by index, filename, chunk_id, score,
                page_numbers, page, source_type and url
        """
        # Every stored page number is shifted by one.
        # NOTE(review): presumably 0-based storage -> 1-based display; confirm.
        pages = [p + 1 for p in chunk.page_numbers] if chunk.page_numbers else []
        source_type = chunk.source_type
        return {
            "index": index,
            "filename": chunk.filename,
            "chunk_id": chunk.chunk_id,
            "score": round(chunk.score, 3),
            "page_numbers": pages,
            "page": pages[0] if pages else None,
            "source_type": source_type,
            # Local sources never expose a URL.
            "url": chunk.url if source_type != "local" else None,
        }

    def _format_chunk(self, chunk: RetrievedChunk, index: int) -> str:
        """
        Format a single chunk with source citation.

        Web and paper sources get a labelled header plus a URL line; every
        other source type gets a plain filename header.

        Args:
            chunk: Retrieved chunk
            index: Citation index

        Returns:
            str: Formatted chunk text
        """
        source_type = chunk.source_type

        if source_type in self._WEB_SOURCE_TYPES:
            label = "Web"
        elif source_type in self._PAPER_SOURCE_TYPES:
            label = "Paper"
        else:
            return f"[Source {index}: {chunk.filename}]\n{chunk.text}"

        # A missing URL renders as an empty "URL: " line.
        url = chunk.url or ""
        return f"[Source {index}: {label} - {chunk.filename}]\nURL: {url}\n{chunk.text}"

    def build_prompt(
        self,
        query: str,
        context: RAGContext,
        system_prompt: Optional[str] = None,
    ) -> dict:
        """
        Build the full prompt for the LLM.

        Args:
            query: User query
            context: RAG context with retrieved information
            system_prompt: Optional custom system prompt; None or an empty
                string selects DEFAULT_SYSTEM_PROMPT

        Returns:
            dict: Prompt structure with "system", "user" and "sources" keys
        """
        system = system_prompt or self.DEFAULT_SYSTEM_PROMPT

        user_message = (
            "Based on the following context from the document collection, please answer my question.\n"
            "\n"
            "## Context from Documents\n"
            "\n"
            f"{context.formatted_context}\n"
            "\n"
            "## Question\n"
            "\n"
            f"{query}\n"
            "\n"
            "Please provide a comprehensive answer based on the context above, citing sources where appropriate."
        )

        return {
            "system": system,
            "user": user_message,
            "sources": context.sources,
        }

    def build_streaming_prompt(
        self,
        query: str,
        context: RAGContext,
    ) -> dict:
        """
        Build prompt for streaming responses.

        Currently identical to build_prompt with the default system prompt.

        Args:
            query: User query
            context: RAG context

        Returns:
            dict: Prompt structure for streaming
        """
        return self.build_prompt(query, context)
|
|