# Ragcore / app / core / generator.py
# NinjainPJs's picture
# Initial deploy: RagCore RAG system with hybrid search and Gradio UI
# a34068e
import logging
import time
from collections.abc import AsyncGenerator
from app.core.llm import GeminiService
from app.core.reranker import RerankerService
from app.models.schemas import GeneratedAnswer, RetrievedChunk
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Default prompt template, used by AnswerGenerator._build_prompt for every
# intent other than "summarize". It is filled with str.format, so the only
# placeholders are {context} and {query}. The [1], [2] citation markers the
# rules ask for correspond to the 1-based section numbers produced by
# AnswerGenerator._build_context.
SYSTEM_PROMPT = """You are a helpful assistant answering questions based on the provided context.
CONTEXT:
{context}
RULES:
- Answer based ONLY on the provided context.
- Cite sources using [1], [2], etc. inline after the relevant information.
- If the context doesn't contain enough information, say "I don't have enough information in the provided documents to answer this question."
- Be concise but thorough.
- Use markdown formatting for readability.
QUESTION: {query}
ANSWER:"""
# Prompt template selected when the intent is exactly "summarize". It takes
# the same {context} and {query} placeholders as SYSTEM_PROMPT.
SUMMARY_PROMPT = """You are a helpful assistant. Summarize the following context.
CONTEXT:
{context}
RULES:
- Provide a structured summary using markdown.
- Cite sources using [1], [2], etc.
- Cover the key points from all provided sources.
QUESTION: {query}
SUMMARY:"""
class AnswerGenerator:
    """Rerank retrieved chunks and ask the LLM for a cited answer.

    For each request the generator reranks the candidate chunks, renders the
    survivors as a numbered context block, fills one of the module-level
    prompt templates and returns the LLM output either in one piece
    (``generate_answer``) or incrementally (``generate_answer_stream``).
    """

    def __init__(self, llm: GeminiService, reranker: RerankerService):
        """Store the LLM and reranker services used by every request."""
        self.llm = llm
        self.reranker = reranker

    def _build_context(self, chunks: list[RetrievedChunk]) -> str:
        """Render ``chunks`` as numbered, source-labelled sections.

        Sections are numbered from 1 so they line up with the ``[1]``,
        ``[2]`` citation markers requested by the prompt templates. A falsy
        ``metadata.source`` is rendered as ``unknown``. Sections are
        separated by a blank line; an empty ``chunks`` yields ``""``.
        """
        return "\n\n".join(
            f"[{index}] (Source: {chunk.metadata.source or 'unknown'})\n{chunk.text}"
            for index, chunk in enumerate(chunks, 1)
        )

    def _build_prompt(
        self,
        query: str,
        chunks: list[RetrievedChunk],
        intent: str = "factual",
    ) -> str:
        """Fill the prompt template matching ``intent``.

        Only the exact value ``"summarize"`` selects ``SUMMARY_PROMPT``;
        every other intent uses ``SYSTEM_PROMPT``.
        """
        template = SUMMARY_PROMPT if intent == "summarize" else SYSTEM_PROMPT
        return template.format(context=self._build_context(chunks), query=query)

    def _no_results_answer(self, query: str) -> GeneratedAnswer:
        """Build the fixed reply used when reranking leaves no chunks."""
        return GeneratedAnswer(
            query=query,
            answer="No relevant documents found to answer your question.",
            sources=[],
            generation_time_ms=0,
            model=self.llm.model_name,
        )

    def generate_answer(
        self,
        query: str,
        chunks: list[RetrievedChunk],
        rerank_top_k: int = 5,
        intent: str = "factual",
    ) -> GeneratedAnswer:
        """Rerank ``chunks`` and return the complete LLM answer.

        Args:
            query: The user question.
            chunks: Candidate chunks to rerank.
            rerank_top_k: Passed to the reranker as ``top_k``.
            intent: Prompt selector; see ``_build_prompt``.

        Returns:
            A ``GeneratedAnswer`` whose ``sources`` are the reranked chunks.
            If the reranker returns nothing, a fixed "no relevant documents"
            answer with ``generation_time_ms=0`` is returned and the LLM is
            not called.

        Note:
            ``generation_time_ms`` here covers reranking, prompt building and
            generation, whereas ``generate_answer_stream`` times only the
            streaming phase.
        """
        start = time.perf_counter()

        reranked = self.reranker.rerank(query, chunks, top_k=rerank_top_k)
        if not reranked:
            return self._no_results_answer(query)

        prompt = self._build_prompt(query, reranked, intent)
        answer = self.llm.generate(prompt)

        elapsed = (time.perf_counter() - start) * 1000
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info("Generated answer in %.0fms", elapsed)

        return GeneratedAnswer(
            query=query,
            answer=answer,
            sources=reranked,
            generation_time_ms=elapsed,
            model=self.llm.model_name,
        )

    async def generate_answer_stream(
        self,
        query: str,
        chunks: list[RetrievedChunk],
        rerank_top_k: int = 5,
        intent: str = "factual",
    ) -> AsyncGenerator[str | GeneratedAnswer, None]:
        """Rerank ``chunks`` and stream the LLM answer.

        Yields:
            Each text piece produced by ``llm.generate_stream`` as a ``str``,
            followed by one final ``GeneratedAnswer`` carrying the reranked
            sources and timing. The final object's ``answer`` is empty
            because the text has already been streamed. If the reranker
            returns nothing, the only item yielded is the fixed "no relevant
            documents" ``GeneratedAnswer``.

        Note:
            ``generation_time_ms`` covers only the streaming phase; reranking
            and prompt building are excluded (unlike ``generate_answer``).
        """
        # NOTE(review): rerank is called synchronously inside this coroutine;
        # if it is CPU-heavy it will hold the event loop — confirm whether it
        # should be offloaded to a worker thread.
        reranked = self.reranker.rerank(query, chunks, top_k=rerank_top_k)
        if not reranked:
            yield self._no_results_answer(query)
            return

        prompt = self._build_prompt(query, reranked, intent)

        start = time.perf_counter()
        async for text_chunk in self.llm.generate_stream(prompt):
            yield text_chunk
        elapsed = (time.perf_counter() - start) * 1000
        # Mirror the timing log emitted by the non-streaming path.
        logger.info("Streamed answer in %.0fms", elapsed)

        # Final message with sources; the answer text was already streamed.
        yield GeneratedAnswer(
            query=query,
            answer="",
            sources=reranked,
            generation_time_ms=elapsed,
            model=self.llm.model_name,
        )