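"""Answer composition for the KGB document chatbot.

Formats retrieved excerpts directly (retrieval-only mode), or generates a
natural-language answer with Qwen/Qwen2.5-7B-Instruct via the Hugging Face
Inference API (RAG mode).
"""
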
from typing import List, Tuple
from src.config import Settings


def compose_answer_retrieval_only(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Return formatted excerpts without LLM processing."""
    lines = []
    lines.append("**Top relevant excerpts** (no model used):\n")
    for i, (txt, score) in enumerate(hits, start=1):
        source = "unknown"
        body = txt
        # Chunks may carry a trailing "[Source: ...]" tag; split it off so the
        # excerpt body and its citation can be formatted separately.
        if "[Source:" in txt:
            parts = txt.rsplit("[Source:", 1)
            body = parts[0].strip()
            source = "[Source:" + parts[1]
        lines.append(f"**{i}.** {body}\n\n*{source}* \n*similarity: {score:.3f}*")
    return "\n\n---\n\n".join(lines)


def compose_answer_with_llm(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Use the HF Inference API to generate a natural-language answer from retrieved documents."""
    try:
        from huggingface_hub import InferenceClient

        # Prepare context from the retrieved documents, truncating to the
        # configured budget so the prompt stays within model limits.
        chunks = [t for t, _ in hits]
        context = "\n\n---\n\n".join(chunks)
        if len(context) > settings.max_context_chars:
            context = context[: settings.max_context_chars] + "\n\n[Context truncated]"

        # System prompt for RAG: ground answers in the context and require citations.
        system_message = (
            "You are a knowledgeable assistant analyzing declassified KGB documents. "
            "Answer the user's question based ONLY on the provided context. "
            "Cite sources using [Source: ...] notation when referencing specific documents. "
            "If the context doesn't contain enough information to answer, say so clearly."
        )
        user_message = f"CONTEXT:\n{context}\n\n---\n\nQUESTION: {query}"

        # Initialize the HF Inference Client with the configured token.
        client = InferenceClient(token=settings.hf_token)

        # Call the model with streaming disabled for simplicity.
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        response = client.chat.completions.create(
            model="Qwen/Qwen2.5-7B-Instruct:fastest",
            messages=messages,
            max_tokens=512,
            temperature=0.3,  # lower temperature for more focused answers
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Fall back to retrieval-only output if the import or the API call fails.
        return f"⚠️ LLM Error: {e}\n\n---\n\n" + compose_answer_retrieval_only(query, hits, settings)


def compose_answer(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Main entry point: routes to retrieval-only or LLM-based answering."""
    if settings.mode == "retrieval":
        return compose_answer_retrieval_only(query, hits, settings)
    elif settings.mode == "rag":
        return compose_answer_with_llm(query, hits, settings)
    else:
        # Unknown mode: fall back to retrieval-only output.
        return compose_answer_retrieval_only(query, hits, settings)
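

# Example usage: a minimal sketch for local smoke-testing. The Settings
# construction below is an assumption; the real src.config.Settings may be
# loaded from the environment or a config file rather than built inline. The
# fields used (mode, hf_token, max_context_chars) are the ones this module
# reads, and the demo hits and source names are hypothetical.
if __name__ == "__main__":
    demo_hits = [
        ("Excerpt about surveillance operations. [Source: report_1974_03.txt]", 0.812),
        ("Excerpt about informant networks. [Source: memo_1981_11.txt]", 0.744),
    ]
    demo_settings = Settings(mode="retrieval", hf_token=None, max_context_chars=8000)
    print(compose_answer("What surveillance methods are described?", demo_hits, demo_settings))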