from typing import List, Tuple

from src.config import Settings
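
# Fields this module reads from Settings (inferred from usage below; the
# actual definition lives in src.config and may contain more):
#   mode: str               -- "retrieval" or "rag"
#   hf_token: str | None    -- Hugging Face API token for InferenceClient
#   max_context_chars: int  -- cap on the concatenated context length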


def compose_answer_retrieval_only(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Return formatted excerpts without LLM processing."""
    lines = []
    lines.append("**Top relevant excerpts** (no model used):\n")
    for i, (txt, score) in enumerate(hits, start=1):
        source = "unknown"
        body = txt
        if "[Source:" in txt:
            parts = txt.rsplit("[Source:", 1)
            body = parts[0].strip()
            source = "[Source:" + parts[1]
        lines.append(f"**{i}.** {body}\n\n*{source}* \n*similarity: {score:.3f}*")
    return "\n\n---\n\n".join(lines)
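

# Illustrative example (the chunk format below is assumed, not taken from the
# real corpus): a hit like
#   ("The subject was observed near... [Source: doc_1951_03.txt]", 0.87)
# renders as:
#   **1.** The subject was observed near...
#
#   *[Source: doc_1951_03.txt]*
#   *similarity: 0.870*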


def compose_answer_with_llm(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Use the HF Inference API to generate a natural-language answer from retrieved documents."""
    try:
        from huggingface_hub import InferenceClient

        # Prepare context from retrieved documents
        chunks = [t for t, _ in hits]
        context = "\n\n---\n\n".join(chunks)
        if len(context) > settings.max_context_chars:
            context = context[:settings.max_context_chars] + "\n\n[Context truncated]"

        # System prompt for RAG
        system_message = (
            "You are a knowledgeable assistant analyzing declassified KGB documents. "
            "Answer the user's question based ONLY on the provided context. "
            "Cite sources using [Source: ...] notation when referencing specific documents. "
            "If the context doesn't contain enough information to answer, say so clearly."
        )
        user_message = f"CONTEXT:\n{context}\n\n---\n\nQUESTION: {query}"

        # Initialize the HF Inference client
        client = InferenceClient(token=settings.hf_token)

        # Call the model with streaming disabled for simplicity
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        response = client.chat.completions.create(
            model="Qwen/Qwen2.5-7B-Instruct:fastest",
            messages=messages,
            max_tokens=512,
            temperature=0.3,  # Lower temperature for more focused answers
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Fall back to retrieval-only output if the LLM call fails
        return f"⚠️ LLM Error: {e}\n\n---\n\n" + compose_answer_retrieval_only(query, hits, settings)


def compose_answer(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Main entry point: routes to a retrieval-only or LLM-based answer."""
    if settings.mode == "rag":
        return compose_answer_with_llm(query, hits, settings)
    # "retrieval" and any unrecognized mode both default to retrieval-only
    return compose_answer_retrieval_only(query, hits, settings)
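

# Minimal usage sketch. Assumptions: the real Settings constructor in
# src.config is not shown here, so a SimpleNamespace with the three fields
# this module reads stands in; the hits and scores are illustrative only.
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_settings = SimpleNamespace(mode="retrieval", hf_token=None, max_context_chars=6000)
    demo_hits = [
        ("Agent reports on border activity... [Source: report_1962_14.txt]", 0.91),
        ("Memo regarding surveillance logistics. [Source: memo_1959_02.txt]", 0.84),
    ]
    print(compose_answer("What do the documents say about border activity?", demo_hits, demo_settings))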