# widget-RAG-Test
#
# Paused
#
# File size: 3,934 Bytes
#
# d1d1b25

import spaces
import torch
import gradio as gr

from transformers import (
    pipeline,
    BitsAndBytesConfig,
)

from duckduckgo_search import DDGS


# =====================================================
# MODEL SETUP
# =====================================================

# Request 4-bit weights only when a CUDA device is visible; otherwise no
# quantization config is supplied to the model.
if torch.cuda.is_available():
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
else:
    quantization_config = None

llama3_model_id = "meta-llama/Llama-3.1-8B-Instruct"

# The pipeline is built once at import time and shared by every request.
# dtype: bfloat16 when CUDA is available, float32 otherwise.
llama3_pipe = pipeline(
    "text-generation",
    model=llama3_model_id,
    torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.bfloat16,
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
)

print("✅ Model Loaded")


# =====================================================
# SEARCH (HF SPACES SAFE)
# =====================================================

def google_search_results(query: str, max_results: int = 5) -> list:
    """Return text snippets from a live DuckDuckGo search.

    Despite the function name, the search goes through ``DDGS().text``
    (Google scraping does NOT work in Spaces).

    Args:
        query: Search terms. A blank or whitespace-only query returns an
            empty list without touching the network.
        max_results: Upper bound on results requested from the search call.

    Returns:
        The non-empty ``"body"`` text of each result, in the order received.
        Best effort: if the search raises, the error is printed and whatever
        was collected so far (possibly nothing) is returned.
    """
    # Nothing to search for; skip the request instead of letting it fail.
    if not query or not query.strip():
        return []

    outputs = []

    try:
        with DDGS() as ddgs:
            # Consume the results while the session is still open.
            for result in ddgs.text(query, max_results=max_results):
                # A result without a body must not abort the remaining ones.
                body = result.get("body")
                if body:
                    outputs.append(body)

    except Exception as e:
        # Deliberate catch-all: a failed search must not break the demo.
        print("Search error:", e)

    return outputs


# =====================================================
# RAG ENRICHMENT
# =====================================================

def RAG_enrichment(input_question: str):
    """Append web-search snippets to the question to form the RAG prompt.

    Looks the question up with ``google_search_results``, prints what came
    back, and returns the question, a fixed instruction line, and then each
    snippet followed by a blank line.
    """
    snippets = google_search_results(input_question)
    print("Search Results:", snippets)

    prompt_head = (
        input_question
        + "\n\nUse the following real-time information to help answer:\n\n"
    )

    # Each snippet is terminated by a blank line, including the last one.
    return prompt_head + "".join(snippet + "\n\n" for snippet in snippets)


# =====================================================
# LLAMA QA
# =====================================================

@spaces.GPU
def llama_QA(input_question: str, pipe):
    """Answer a question with the given text-generation pipeline.

    The question is embedded in a fixed instruction prompt and sampled with
    temperature 0.7 for up to 512 new tokens.

    Args:
        input_question: The user question, optionally already enriched with
            retrieved context.
        pipe: Callable text-generation pipeline. It is called with the prompt
            plus generation keyword arguments and must return a list whose
            first item has a ``"generated_text"`` entry.

    Returns:
        The generated answer with surrounding whitespace stripped.
    """
    prompt = f"""
You are a helpful chatbot assistant.

Answer clearly and concisely.
If real-time info is missing, answer using available knowledge.

Question:
{input_question}

Answer:
"""

    # Ask for the completion only. Removing the prompt afterwards with
    # str.replace is fragile: it deletes every occurrence of the prompt and
    # silently does nothing if the echoed text differs from the prompt.
    outputs = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )

    response = outputs[0]["generated_text"]

    # Fallback for a pipe that still echoes the prompt: drop the leading copy.
    if response.startswith(prompt):
        response = response[len(prompt):]

    return response.strip()


# =====================================================
# GRADIO WRAPPER
# =====================================================

@spaces.GPU
def gradio_func(input_question):
    """Answer the question twice, without and with web-search enrichment.

    Returns a 4-tuple: the raw question, the enriched prompt, the answer to
    the raw question, and the answer to the enriched prompt.
    """
    print("User Question:", input_question)

    # Baseline: the model sees only the question.
    plain_answer = llama_QA(input_question, llama3_pipe)

    # RAG: the same question with search snippets appended.
    enriched_prompt = RAG_enrichment(input_question)
    rag_answer = llama_QA(enriched_prompt, llama3_pipe)

    return input_question, enriched_prompt, plain_answer, rag_answer


# =====================================================
# UI
# =====================================================

def create_interface():
    """Build and return the Gradio Blocks UI for the comparison demo.

    Layout: a title, a question box, an "Ask" button, then two rows that
    show the inputs and the outputs of the non-RAG and RAG runs side by side.
    Clicking the button calls ``gradio_func`` with the question.
    """
    with gr.Blocks() as app:

        gr.Markdown("# 🔎 Llama3 RAG vs Non-RAG Demo")

        with gr.Row():
            question_box = gr.Textbox(
                label="Enter your question",
                value="what day is today in sydney?",
            )

        ask_button = gr.Button("Ask")

        with gr.Row():
            plain_prompt_box = gr.Textbox(label="Non-RAG Input")
            rag_prompt_box = gr.Textbox(label="RAG Enriched Input")

        with gr.Row():
            plain_answer_box = gr.Textbox(label="Non-RAG Output")
            rag_answer_box = gr.Textbox(label="RAG Output")

        # Order matches the tuple returned by gradio_func.
        result_boxes = [
            plain_prompt_box,
            rag_prompt_box,
            plain_answer_box,
            rag_answer_box,
        ]

        ask_button.click(
            fn=gradio_func,
            inputs=[question_box],
            outputs=result_boxes,
        )

    return app


# =====================================================
# LAUNCH
# =====================================================

# Build the UI and launch it when the module runs. The Blocks object is kept
# under the module-level name `demo`.
# NOTE(review): hosting/reload tooling presumably looks up `demo` by name — confirm before renaming.
demo = create_interface()
demo.launch()