File size: 3,760 Bytes
3ea7b4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
from pathlib import Path
import gradio as gr

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


# ======================
# Config (safe defaults)
# ======================
# All settings are overridable via environment variables so the Space can be
# reconfigured without code changes.
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")  # chat/completions model
EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")  # embedding model
TOP_K = int(os.getenv("TOP_K", "3"))  # number of retrieved chunks per query

# Your knowledge base file in the Space repo
DOC_PATH = Path(os.getenv("DOC_PATH", "challenge_context.txt"))

# Prepended to every query; instructs the LLM to stay grounded in the
# retrieved context and to refuse when the answer is not present.
SYSTEM_GUARDRAILS = (
    "You are Challenge Copilot. Answer ONLY using the provided context. "
    "If the answer is not in the context, say: 'I don’t know based on the current document.' "
    "Then ask the user to add the missing official details to challenge_context.txt."
)


# ======================
# Build index (cached)
# ======================
# Process-level cache: the index/query engine is built once on first use
# (see build_index) and reused for every subsequent chat turn.
_INDEX = None
_QUERY_ENGINE = None

def build_index():
    """Build (or return the cached) query engine over DOC_PATH.

    The engine is memoized in the module-level ``_QUERY_ENGINE`` so the
    index is built only once per process.

    Returns:
        A LlamaIndex query engine configured with MODEL / EMBED_MODEL / TOP_K.

    Raises:
        RuntimeError: if the OPENAI_API_KEY environment variable is not set.
        FileNotFoundError: if DOC_PATH could not be loaded into any document.
    """
    global _INDEX, _QUERY_ENGINE

    # Fast path: already built during an earlier call.
    if _QUERY_ENGINE is not None:
        return _QUERY_ENGINE

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is missing. Add it in the Space Settings → Variables and secrets."
        )

    if not DOC_PATH.exists():
        # Create a placeholder so the Space boots even if you forgot the file
        DOC_PATH.write_text(
            "Add the official Building AI Application Challenge content here.\n",
            encoding="utf-8",
        )

    # LlamaIndex global settings (apply to indexing and querying below)
    Settings.llm = OpenAI(model=MODEL, temperature=0.2)
    Settings.embed_model = OpenAIEmbedding(model=EMBED_MODEL)
    Settings.chunk_size = 800
    Settings.chunk_overlap = 120

    # Load ONLY the target file. Passing input_files avoids scanning the
    # whole directory for .txt files and filtering by name afterwards —
    # unrelated repo files are never read or embedded.
    docs = SimpleDirectoryReader(input_files=[str(DOC_PATH)]).load_data()
    if not docs:
        raise FileNotFoundError(f"Could not load {DOC_PATH.name}. Make sure it exists in the repo.")

    _INDEX = VectorStoreIndex.from_documents(docs)
    _QUERY_ENGINE = _INDEX.as_query_engine(similarity_top_k=TOP_K)
    return _QUERY_ENGINE


def format_sources(resp, max_sources=3, max_chars=220):
    """Render up to *max_sources* retrieved source nodes as numbered lines.

    Args:
        resp: query response; source nodes are read from ``resp.source_nodes``
            (missing attribute is treated as no sources).
        max_sources: maximum number of nodes to render.
        max_chars: maximum snippet length per node.

    Returns:
        One line per node — ``"{i}. {file}{score}: {snippet}"`` — joined by
        newlines, or ``"No sources returned."`` when there are none.
    """
    lines = []
    for i, sn in enumerate(getattr(resp, "source_nodes", [])[:max_sources], start=1):
        fn = sn.node.metadata.get("file_name", "unknown")
        text = sn.node.get_content().replace("\n", " ").strip()
        snippet = text[:max_chars]
        score = getattr(sn, "score", None)
        score_txt = f" (score={score:.3f})" if isinstance(score, (float, int)) else ""
        # Show an ellipsis only when content was actually truncated — the
        # original appended "..." unconditionally, which misleadingly implied
        # elided text even for short snippets.
        suffix = "..." if len(text) > max_chars else ""
        lines.append(f"{i}. {fn}{score_txt}: {snippet}{suffix}")
    return "\n".join(lines) if lines else "No sources returned."


def chat(message, history):
    """Gradio chat callback: answer one user *message* via the RAG engine.

    *history* is supplied by gr.ChatInterface but unused — retrieval is
    stateless per turn. Returns the answer text, optionally followed by a
    sources footer (controlled by the SHOW_SOURCES env var).
    """
    engine = build_index()

    # Guardrails + question in a single grounded prompt.
    full_prompt = f"{SYSTEM_GUARDRAILS}\n\nUser question: {message}\nAnswer using ONLY the context."
    response = engine.query(full_prompt)
    reply = str(response).strip()

    if os.getenv("SHOW_SOURCES", "true").lower() == "true":
        reply = reply + "\n\n---\nSources:\n" + format_sources(response, max_sources=TOP_K)

    return reply


# Gradio chat UI wired to the chat() callback above. Built at import time so
# Hugging Face Spaces (which imports this module) can serve it directly.
demo = gr.ChatInterface(
    fn=chat,
    title="Challenge Copilot — RAG Q&A Bot",
    description="Ask questions about the Building AI Application Challenge using challenge_context.txt (LlamaIndex + OpenAI).",
    examples=[
        "What will I build in this live session?",
        "Who is this best for?",
        "What are the prerequisites?"
    ],
    theme="soft"
)

# Local execution entry point; on Spaces the platform launches `demo` itself.
if __name__ == "__main__":
    demo.launch()