File size: 4,893 Bytes
9f97bdb
9817a92
9f97bdb
 
 
 
 
 
 
 
 
 
 
9817a92
9f97bdb
9817a92
9f97bdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9817a92
9f97bdb
 
 
9817a92
 
2359e74
9f97bdb
2359e74
 
9f97bdb
9817a92
9f97bdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9817a92
 
9f97bdb
 
9817a92
 
 
 
 
9f97bdb
 
9817a92
 
 
2359e74
 
9f97bdb
 
2359e74
 
 
 
 
9f97bdb
2359e74
9f97bdb
 
 
2359e74
9f97bdb
 
2359e74
9f97bdb
 
 
 
2359e74
 
9f97bdb
 
 
 
 
2359e74
 
9f97bdb
 
2359e74
9f97bdb
 
 
2359e74
9f97bdb
 
 
 
2359e74
9f97bdb
9817a92
 
9f97bdb
9817a92
 
2359e74
9f97bdb
 
9817a92
 
 
 
 
 
 
 
 
 
 
9f97bdb
 
9817a92
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import subprocess
import sys

import gradio as gr
from huggingface_hub import hf_hub_download

# Install llama_cpp_python in the Space at startup.
# Run pip via the current interpreter with an argument list (no shell string):
# avoids shell-quoting issues and guarantees the package lands in this env.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "llama_cpp_python==0.3.1"]
)
from llama_cpp import Llama

subprocess.run([sys.executable, "-m", "pip", "install", "requests"])
import requests


def duckduckgo_search(query, max_results=3):
    """
    Query the DuckDuckGo Instant Answer API and return a text summary.

    Parameters
    ----------
    query : str
        The search query.
    max_results : int
        Max number of related topics (and sub-topics per topic) to include.

    Returns
    -------
    str
        Newline-joined result snippets, or a human-readable message when
        nothing was found or the request failed (this function never raises).
    """
    url = "https://api.duckduckgo.com/"
    params = {
        "q": query,
        "format": "json",
        "no_redirect": 1,
        "skip_disambig": 1,
    }

    try:
        # Timeout so a hung request can't stall the whole chat turn.
        resp = requests.get(url, params=params, timeout=10)
        resp.raise_for_status()  # surface HTTP errors instead of parsing junk
        data = resp.json()

        results = []
        # Primary abstract, when the API has one.
        if data.get("AbstractText"):
            results.append(data["AbstractText"])

        # Related topics sometimes carry extra info; grouped topics nest
        # their entries under a "Topics" key.
        for topic in data.get("RelatedTopics", [])[:max_results]:
            if "Text" in topic:
                results.append(topic["Text"])
            elif "Topics" in topic:
                for subtopic in topic["Topics"][:max_results]:
                    text = subtopic.get("Text")
                    if text:  # skip text-less entries instead of appending ""
                        results.append(text)

        return "\n".join(results) if results else "No relevant results found."

    except Exception as e:
        # Best-effort tool: report the failure as text rather than crash chat.
        return f"Error fetching search results: {e}"

def search_web(query):
    """Tool entry point for the agent: delegate to the DuckDuckGo search."""
    results = duckduckgo_search(query)
    return results


# Download 3B GGUF model into HF Space storage.
# hf_hub_download caches the file locally and returns its filesystem path.
model_path = hf_hub_download(
    repo_id="ft-lora/llama3.2-3b-gguf-q4km",
    filename="llama3.2-3b-instruct-finetuned.gguf"
)

# Initialize llama.cpp with smaller context & both CPU cores.
# These settings trade context length for latency on a CPU-only Space.
llm = Llama(
    model_path=model_path,
    n_ctx=1024,        # smaller context -> faster on CPU
    n_threads=2,       # use both vCPUs on HF Spaces
    use_mmap=True,     # memory-mapped loading
    chat_format="llama-3",  # must match the model's chat template
)


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Stream a chat completion for *message*, yielding the growing reply."""
    convo = [{"role": "system", "content": system_message}]
    convo.extend(history)  # history is already a list of {role, content} dicts
    convo.append({"role": "user", "content": message})

    reply = ""
    stream = llm.create_chat_completion(
        messages=convo,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    )
    for chunk in stream:
        piece = chunk["choices"][0]["delta"].get("content", "")
        reply += piece
        yield reply

def agent_respond(question, history, system_message, max_tokens=256, temperature=0.7, top_p=0.95):
    """
    ReAct-style agent turn: first ask the model whether a web search is
    needed, optionally run `search_web`, then stream the final answer.

    Parameters mirror the Gradio ChatInterface contract; yields the
    progressively-built answer string for streaming display.
    """
    messages = [{"role": "system", "content": system_message}] + history

    decision_prompt = (
        f"Question: {question}\n"
        "You are an AI assistant that can use the tool `search_web(query)` to get up-to-date information.\n"
        "Decide if you need to search the web to answer this question.\n"
        "Respond with only 'Yes' or 'No'.\n"
        "Action:"
    )

    # Single non-streaming call for the yes/no decision — we only need the
    # final short string, so accumulating streamed deltas was pointless work.
    decision = llm.create_chat_completion(
        messages=messages + [{"role": "user", "content": decision_prompt}],
        max_tokens=32,  # small budget: the answer is just "Yes"/"No"
        stream=False,
        temperature=temperature,
        top_p=top_p,
    )
    action_response = decision["choices"][0]["message"].get("content") or ""

    # Run the tool only when the model asked for it.
    if "yes" in action_response.lower():
        search_results = search_web(question)
        observation = f"Observation: {search_results}\nAnswer:"
    else:
        observation = "Answer:"

    # Stream the final answer, grounding it on the observation when present.
    final_response = ""
    for chunk in llm.create_chat_completion(
        messages=messages + [{"role": "user", "content": f"{question}\n{observation}"}],
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        delta = chunk["choices"][0]["delta"]
        token = delta.get("content", "")
        final_response += token
        yield final_response


# Chat UI wired to the agent; `type="messages"` makes Gradio pass history
# as a list of {role, content} dicts, which agent_respond expects.
chatbot = gr.ChatInterface(
    agent_respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a helpful Chatbot who will help the user find the right answers.", label="System message"),
        # Smaller default generation length for faster replies
        gr.Slider(minimum=1, maximum=512, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

# Wrap the chat interface in a Blocks app so the layout can be extended later.
demo = gr.Blocks()
with demo:
    chatbot.render()


# Launch only when run as a script (Spaces executes the file directly).
if __name__ == "__main__":
    demo.launch()