Spaces:

menikev
/

KnowYourRIght-Bot

Sleeping

File size: 6,966 Bytes

21ee664
c6fa86f
5b69f3e
ec071b2
2454a06
ec071b2
2454a06
df40412
ec071b2
 
 
 
 
8307462
ec071b2
8307462
ec071b2
8307462
 
ec071b2
 
8307462
60fc375
ec071b2
8307462
df40412
ccec758
 
a5640e9
ec071b2
 
 
8307462
 
ec071b2
8307462
 
76921a6
a5640e9
 
ec071b2
a5640e9
 
ec071b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8307462
ec071b2
 
 
76921a6
ec071b2
 
82275bb
ec071b2
a508099
ec071b2
8307462
ec071b2
8307462
ec071b2
 
8307462
ec071b2
8307462
c80fd59
ec071b2
 
 
 
 
8307462
ec071b2
8307462
ec071b2
8307462
ec071b2
8307462
 
 
ec071b2
 
 
 
 
 
 
ccec758
8307462
ccec758
8307462
 
ec071b2
af2b4ba
c6fa86f
ec071b2
 
c6fa86f
af2b4ba
8307462
 
2454a06
76921a6
ec071b2
 
df40412
ec071b2
 
 
 
 
 
 
33bd02a
 
ccec758
8307462
ec071b2
 
8307462
ec071b2
 
 
8307462
ccec758
 
ec071b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8307462
ec071b2
76921a6
af2b4ba
 
2454a06
ec071b2
 
af2b4ba
c6fa86f
df40412
 
 
ec071b2
 
60fc375
8307462
5ff90e2
ec071b2
2454a06
76921a6
 
 
ec071b2
 
76921a6
2454a06
60fc375
8307462
 
ec071b2
8307462
 
 
 
 
 
 
 
 
 
 
 
 
76921a6
 
8307462
 
2454a06
60fc375
21ee664
8307462
ec071b2
ccec758
ec071b2

import os
from pathlib import Path
import gradio as gr

from dotenv import load_dotenv
load_dotenv()

from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma   # <-- match ingestion
from langchain_huggingface import (
    HuggingFaceEmbeddings,
    HuggingFaceEndpoint,
)
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# --- 1) CONFIG / SAFETY ---

# Fail fast at import time when the Hugging Face token is absent; the same
# environment variable is passed to the HuggingFaceEndpoint client further down.
if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
    print("HUGGINGFACEHUB_API_TOKEN not found. Add it to your Space secrets.")
    raise SystemExit(1)

# On-disk location and collection name of the persisted Chroma store.
PERSIST_DIR = Path("data/processed/vector_db")
COLLECTION_NAME = "legal_documents"  # <-- MUST match complete_ingestion.py

# Abort when the directory is missing or has no entries at all.
# NOTE(review): iterdir() raises if PERSIST_DIR exists but is a regular file.
if not PERSIST_DIR.exists() or not any(PERSIST_DIR.iterdir()):
    print("⚠️ Vector DB not found. Run complete_ingestion.py first.")
    raise SystemExit(1)

# --- 2) LOAD VECTOR DB / RETRIEVER ---

print("Loading vector database...")
# Embedding model handed to Chroma as its embedding function; forced onto CPU.
# NOTE(review): presumably this must be the same model that was used when the
# collection was ingested — confirm against complete_ingestion.py.
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en",
    model_kwargs={"device": "cpu"},
)

# Open the persisted collection (nothing is ingested here).
vectordb = Chroma(
    persist_directory=str(PERSIST_DIR),
    embedding_function=embedding_model,
    collection_name=COLLECTION_NAME,  # <-- critical: open the right collection
)

# Quick sanity check (helps spot empty/wrong collection immediately).
# Relies on the wrapper's private `_collection` attribute, so it may break
# across langchain/chromadb upgrades.
try:
    count = vectordb._collection.count()
    print(f"✅ Loaded Chroma collection '{COLLECTION_NAME}' with {count} documents.")
    if count == 0:
        raise RuntimeError(
            "Chroma collection is empty. Confirm collection_name matches the one used in complete_ingestion.py"
        )
except Exception as e:
    # Log for the Space console, then re-raise so startup stops here.
    print(f"Chroma sanity check failed: {e}")
    raise

# A slightly more forgiving retriever: MMR search with k=4 and fetch_k=20
# passed through as search kwargs.
retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 20},
)
print("Vector database ready.")

# --- 3) LLM (Hugging Face Inference Endpoint) ---

print("Initializing LLM via Hugging Face Endpoint...")
# The model repo can be overridden with the HF_ENDPOINT_MODEL environment
# variable; otherwise the Mistral 7B Instruct default below is used.
llm = HuggingFaceEndpoint(
    repo_id=os.getenv("HF_ENDPOINT_MODEL", "mistralai/Mistral-7B-Instruct-v0.2"),
    temperature=0.15,
    max_new_tokens=512,
    # Same token whose presence was verified at the top of the file.
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
)
print("LLM initialized.")

# --- 4) PROMPT & RAG CHAIN ---

# Prompt text for the RAG chain. It has exactly two placeholders: {context}
# (the formatted retrieved documents) and {question} (the user's query).
# NOTE(review): rule 3 mentions the user's language choice, but the template
# has no placeholder through which that choice could be supplied.
RAG_PROMPT_TEMPLATE = """
You are an expert Nigerian Legal Assistant. Provide clear, concise explanations.

CONTEXT:
{context}

RULES:
1) Explain and summarize—do not paste raw sections verbatim.
2) Use ONLY the context above. If missing, say you don't know.
3) Conversational tone. Plain English (or Pidgin if user chose it).
4) At the end, list the referenced section(s)/source(s).

QUESTION: {question}

ANSWER:
"""

# Prompt object built from the template string above.
RAG_PROMPT = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

def format_docs(docs):
    """Render retrieved documents as one context string for the prompt.

    Each document becomes a ``Source / Section / Content`` block; blocks are
    separated by a ``---`` rule. Missing ``source`` or ``section`` metadata
    keys fall back to "Unknown Source" / "Unknown Section".

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content``.

    Returns:
        The joined text, or an empty string when ``docs`` is empty.
    """

    def _render(doc):
        # Keep source and section next to the text so the LLM can cite them.
        meta = doc.metadata
        source = meta.get("source", "Unknown Source")
        section = meta.get("section", "Unknown Section")
        return f"Source: {source}\nSection: {section}\nContent: {doc.page_content}"

    separator = "\n\n---\n\n"
    return separator.join(_render(doc) for doc in docs)

# Retrieval-augmented pipeline. The single input (the query string) is fanned
# out: it goes through the retriever and format_docs to become "context", and
# is passed through unchanged as "question". The resulting dict fills
# RAG_PROMPT, the prompt goes to the LLM, and the output is parsed to a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | RAG_PROMPT
    | llm
    | StrOutputParser()
)

# --- 5) APP LOGIC ---

def _build_references(docs, limit=5):
    """Return de-duplicated ``- source — section`` lines for the first *limit* docs."""
    refs = []
    for d in docs[:limit]:
        # `or` also covers metadata keys that are present but empty/None.
        src = d.metadata.get("source") or "Unknown Source"
        sec = d.metadata.get("section") or "Unknown Section"
        line = f"- {src} — {sec}"
        # Several chunks often come from the same section; list each once.
        if line not in refs:
            refs.append(line)
    return refs


def answer_question(user_input, lang_choice, history=None):
    """Answer one chat turn and return the updated conversation.

    Args:
        user_input: Raw text from the question box; ``None`` or blank input
            is ignored.
        lang_choice: ``"english"`` for English; any other value is treated as
            Pidgin (greeting, disclaimer and the instruction given to the LLM).
        history: Prior conversation as a list of ``{"role", "content"}``
            message dicts. It is never mutated; a copy is extended instead.

    Returns:
        A ``(chatbot_messages, state)`` tuple holding the same updated
        message list twice, matching the two Gradio outputs it feeds.

    Errors raised while retrieving or generating are logged and turned into
    an apology message so the UI never receives an exception.
    """
    # Copy so neither a shared default nor the caller's list is mutated.
    history = list(history) if history else []

    try:
        query = (user_input or "").strip()
        if not query:
            return history, history

        # Chatbot uses type='messages'
        history.append({"role": "user", "content": query})

        if query.lower() in {"hi", "hello", "hey"}:
            if lang_choice == "english":
                ans = "Hello! I'm your Nigerian Legal AI Assistant. How can I help you today?"
            else:
                ans = "Howfa! I be your Nigerian Legal AI Assistant. How I fit help you today? No be legal advice o."
            history.append({"role": "assistant", "content": ans})
            return history, history

        print(f"⚡ Running RAG chain for query: {query}")
        docs = retriever.invoke(query)
        print(f"Retrieved {len(docs)} docs")

        if not docs:
            answer = (
                "I could not find any relevant information in the legal documents for your query."
            )
        else:
            # Tell the model which language the user picked. The hint is added
            # only to the LLM question, never to the retrieval query.
            question = query
            if lang_choice != "english":
                question = f"{query}\n\n(Respond in Nigerian Pidgin English.)"

            # Invoke the prompt directly (rather than through rag_chain) so the
            # documents retrieved above are reused instead of being fetched a
            # second time, and the references below match the context exactly.
            answer = (RAG_PROMPT | llm | StrOutputParser()).invoke(
                {"context": format_docs(docs), "question": question}
            )

        # Build references from the retrieved docs
        refs = _build_references(docs)
        if refs:
            answer += "\n\n**References:**\n" + "\n".join(refs)

        # Disclaimer
        if lang_choice == "english":
            answer += (
                "\n\n--- \n*⚠️ Disclaimer: This is AI-generated information and not legal advice. "
                "Please consult a qualified lawyer for professional guidance.*"
            )
        else:
            answer += "\n\n--- \n*⚠️ No be legal advice o, abeg find lawyer for proper advice.*"

        history.append({"role": "assistant", "content": answer.strip()})
        return history, history

    except Exception as e:
        # UI boundary: log for the Space console and show a generic apology.
        print(f"❌ Error: {e}")
        err = "Sorry, an unexpected error occurred. Please try again."
        history.append({"role": "assistant", "content": err})
        return history, history

def _reset():
    return [], []

# --- 6) GRADIO UI ---

def build_ui():
    """Assemble the Gradio Blocks interface and wire up its events.

    Layout: title and description, the chat history, a row with the question
    box and Send button, the response-language selector, and a Clear button.
    Both the Send button and pressing Enter in the textbox call
    ``answer_question`` and then blank the textbox; Clear calls ``_reset``.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    soft_theme = gr.themes.Soft(primary_hue="blue")

    with gr.Blocks(title="KnowYourRight Bot", theme=soft_theme) as demo:
        gr.Markdown("# 📜 KnowYourRight Bot — Nigerian Legal Assistant")
        gr.Markdown("Ask questions about the Nigerian Constitution, Labour Act, FCCPA, Data Protection, and more.")

        chatbot = gr.Chatbot(
            type="messages",
            label="Chat History",
            height=600,
            avatar_images=("user.png", "bot.png"),
        )

        with gr.Row():
            msg = gr.Textbox(
                placeholder="e.g., 'What are my rights as a tenant?'",
                label="Your Question",
                lines=2,
                scale=4,
            )
            submit_btn = gr.Button("▶️ Send", scale=1, variant="primary")

        lang_choice = gr.Radio(["english", "pidgin"], label="Response Language", value="english")
        clear_btn = gr.Button("🗑️ Clear Chat")

        # Per-session conversation history, mirrored into the chatbot.
        chat_state = gr.State([])

        # Both ways of sending a message share the same handlers. The answer
        # listeners are registered first, then the textbox-clearing ones.
        send_events = (submit_btn.click, msg.submit)
        for register in send_events:
            register(
                fn=answer_question,
                inputs=[msg, lang_choice, chat_state],
                outputs=[chatbot, chat_state],
            )
        for register in send_events:
            register(fn=lambda: "", inputs=None, outputs=msg)

        clear_btn.click(fn=_reset, inputs=None, outputs=[chatbot, chat_state])

    return demo

# Script entry point: build the interface and start the Gradio server.
# The app is bound to the module-level name `demo` before launching.
if __name__ == "__main__":
    print("Building Gradio UI...")
    demo = build_ui()
    print("Launching Gradio app...")
    demo.launch()