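"""Answer composition for the KGB document chatbot.

Formats retrieved excerpts directly (retrieval-only mode), or generates a
natural-language answer with Qwen/Qwen2.5-7B-Instruct via the Hugging Face
Inference API (RAG mode).
"""
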
from typing import List, Tuple
from src.config import Settings


def compose_answer_retrieval_only(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Return formatted excerpts without LLM processing."""
    lines = []
    lines.append("**Top relevant excerpts** (no model used):\n")
    for i, (txt, score) in enumerate(hits, start=1):
        source = "unknown"
        body = txt
        # Chunks may carry a trailing "[Source: ...]" tag; split it off so the
        # excerpt body and its citation can be formatted separately.
        if "[Source:" in txt:
            parts = txt.rsplit("[Source:", 1)
            body = parts[0].strip()
            source = "[Source:" + parts[1]
        lines.append(f"**{i}.** {body}\n\n*{source}* \n*similarity: {score:.3f}*")
    return "\n\n---\n\n".join(lines)


def compose_answer_with_llm(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Use the HF Inference API to generate a natural-language answer from retrieved documents."""
    try:
        from huggingface_hub import InferenceClient

        # Prepare context from the retrieved documents, truncating to the
        # configured budget so the prompt stays within model limits.
        chunks = [t for t, _ in hits]
        context = "\n\n---\n\n".join(chunks)
        if len(context) > settings.max_context_chars:
            context = context[: settings.max_context_chars] + "\n\n[Context truncated]"

        # System prompt for RAG: ground answers in the context and require citations.
        system_message = (
            "You are a knowledgeable assistant analyzing declassified KGB documents. "
            "Answer the user's question based ONLY on the provided context. "
            "Cite sources using [Source: ...] notation when referencing specific documents. "
            "If the context doesn't contain enough information to answer, say so clearly."
        )
        user_message = f"CONTEXT:\n{context}\n\n---\n\nQUESTION: {query}"

        # Initialize the HF Inference Client with the configured token.
        client = InferenceClient(token=settings.hf_token)

        # Call the model with streaming disabled for simplicity.
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        response = client.chat.completions.create(
            model="Qwen/Qwen2.5-7B-Instruct:fastest",
            messages=messages,
            max_tokens=512,
            temperature=0.3,  # lower temperature for more focused answers
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Fall back to retrieval-only output if the import or the API call fails.
        return f"⚠️ LLM Error: {e}\n\n---\n\n" + compose_answer_retrieval_only(query, hits, settings)


def compose_answer(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Main entry point: routes to retrieval-only or LLM-based answering."""
    if settings.mode == "retrieval":
        return compose_answer_retrieval_only(query, hits, settings)
    elif settings.mode == "rag":
        return compose_answer_with_llm(query, hits, settings)
    else:
        # Unknown mode: fall back to retrieval-only output.
        return compose_answer_retrieval_only(query, hits, settings)
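

# Example usage: a minimal sketch for local smoke-testing. The Settings
# construction below is an assumption; the real src.config.Settings may be
# loaded from the environment or a config file rather than built inline. The
# fields used (mode, hf_token, max_context_chars) are the ones this module
# reads, and the demo hits and source names are hypothetical.
if __name__ == "__main__":
    demo_hits = [
        ("Excerpt about surveillance operations. [Source: report_1974_03.txt]", 0.812),
        ("Excerpt about informant networks. [Source: memo_1981_11.txt]", 0.744),
    ]
    demo_settings = Settings(mode="retrieval", hf_token=None, max_context_chars=8000)
    print(compose_answer("What surveillance methods are described?", demo_hits, demo_settings))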