from typing import List, Tuple

from src.config import Settings


def compose_answer_retrieval_only(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Return formatted excerpts without LLM processing."""
    lines = ["**Top relevant excerpts** (no model used):\n"]
    for i, (txt, score) in enumerate(hits, start=1):
        source = "unknown"
        body = txt
        # Split off the trailing "[Source: ...]" tag embedded in the chunk, if present
        if "[Source:" in txt:
            parts = txt.rsplit("[Source:", 1)
            body = parts[0].strip()
            source = "[Source:" + parts[1]
        lines.append(f"**{i}.** {body}\n\n*{source}* \n*similarity: {score:.3f}*")
    return "\n\n---\n\n".join(lines)


def compose_answer_with_llm(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Use the HF Inference API to generate a natural-language answer from retrieved documents."""
    try:
        from huggingface_hub import InferenceClient

        # Prepare context from retrieved documents, truncating to the configured budget
        chunks = [t for t, _ in hits]
        context = "\n\n---\n\n".join(chunks)
        if len(context) > settings.max_context_chars:
            context = context[:settings.max_context_chars] + "\n\n[Context truncated]"

        # System prompt for RAG: ground the model in the retrieved context only
        system_message = (
            "You are a knowledgeable assistant analyzing declassified KGB documents. "
            "Answer the user's question based ONLY on the provided context. "
            "Cite sources using [Source: ...] notation when referencing specific documents. "
            "If the context doesn't contain enough information to answer, say so clearly."
        )
        user_message = f"CONTEXT:\n{context}\n\n---\n\nQUESTION: {query}"

        # Initialize the HF Inference Client
        client = InferenceClient(token=settings.hf_token)

        # Call the model with streaming disabled for simplicity
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        response = client.chat.completions.create(
            model="Qwen/Qwen2.5-7B-Instruct:fastest",
            messages=messages,
            max_tokens=512,
            temperature=0.3,  # Lower temperature for more focused answers
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Fall back to retrieval-only output if the LLM call fails
        return f"⚠️ LLM Error: {e}\n\n---\n\n" + compose_answer_retrieval_only(query, hits, settings)


def compose_answer(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Main entry point: route to the retrieval-only or LLM-based answer."""
    if settings.mode == "rag":
        return compose_answer_with_llm(query, hits, settings)
    # Default to retrieval-only for "retrieval" and any unrecognized mode
    return compose_answer_retrieval_only(query, hits, settings)
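

# --- Usage sketch ---
# A minimal example of how compose_answer might be invoked, under stated
# assumptions: the Settings constructor arguments below are hypothetical
# (this module only reads settings.mode, settings.hf_token, and
# settings.max_context_chars), and the demo hits mimic retriever output
# with "[Source: ...]" tags embedded in each chunk. Adapt to the actual
# src.config.Settings definition.
if __name__ == "__main__":
    demo_settings = Settings(  # hypothetical constructor signature
        mode="retrieval",
        hf_token="",
        max_context_chars=4000,
    )
    demo_hits = [
        ("Residency traffic increased in March. [Source: doc_1974_03.txt]", 0.87),
        ("Cipher routes were changed in April. [Source: doc_1974_04.txt]", 0.81),
    ]
    print(compose_answer("What changed in early 1974?", demo_hits, demo_settings))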