# llama-agent-ui / app.py
# Kajlid's picture
# Update app.py
# 2359e74 verified
import subprocess
import sys

import gradio as gr
from huggingface_hub import hf_hub_download

# Install llama_cpp_python in the Space.
# pip is invoked as `<this interpreter> -m pip` with an argument list (no
# shell), so the package lands in the environment that performs the import
# below. The call stays best-effort: a failed install is not raised here and
# shows up as an ImportError on the next line instead.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "llama_cpp_python==0.3.1"]
)
from llama_cpp import Llama

# Same approach for requests, which the web-search tool uses.
subprocess.run([sys.executable, "-m", "pip", "install", "requests"])
import requests
def duckduckgo_search(query, max_results=3):
    """
    Query the DuckDuckGo API and return the text snippets it reports.

    The response's ``AbstractText`` (when non-empty) comes first, followed by
    at most ``max_results`` non-empty snippets taken from ``RelatedTopics``.
    Entries that carry a nested ``Topics`` list instead of their own ``Text``
    are flattened in order.

    Args:
        query: Search string. An empty or whitespace-only query is answered
            locally, without a network request.
        max_results: Upper bound on the number of related-topic snippets.

    Returns:
        The snippets joined by newlines, "No relevant results found." when
        there is nothing to report, or a message starting with
        "Error fetching search results:" when the request or the parsing of
        its response fails. Failures are reported in the returned string
        rather than raised, so a search problem does not abort the caller.
    """
    # Nothing to look up: skip the network round-trip entirely.
    if not query or not str(query).strip():
        return "No relevant results found."

    url = "https://api.duckduckgo.com/"
    params = {
        "q": query,
        "format": "json",
        "no_redirect": 1,
        "skip_disambig": 1,
    }
    try:
        # Without a timeout a stalled connection would block the reply forever.
        resp = requests.get(url, params=params, timeout=10)
        # Report HTTP errors as such instead of as a confusing JSON failure.
        resp.raise_for_status()
        data = resp.json()

        results = []
        # Add AbstractText if available for the source
        abstract = data.get("AbstractText")
        if abstract:
            results.append(abstract)

        # Related topics sometimes have extra info
        related = []
        for topic in data.get("RelatedTopics", []):
            if len(related) >= max_results:
                break
            if "Text" in topic:
                candidates = [topic["Text"]]
            else:
                candidates = [
                    subtopic.get("Text", "")
                    for subtopic in topic.get("Topics", [])
                ]
            # Drop empty snippets and never exceed the max_results budget,
            # even when a single group holds many subtopics.
            room = max_results - len(related)
            related.extend([text for text in candidates if text][:room])
        results.extend(related)

        return "\n".join(results) if results else "No relevant results found."
    except Exception as e:
        # Deliberately broad: any failure is turned into a readable message
        # for the caller instead of propagating.
        return f"Error fetching search results: {e}"
def search_web(query):
    """Look up ``query`` on the web and hand back the summarized text."""
    # The lookup itself is delegated to the DuckDuckGo helper.
    summary = duckduckgo_search(query)
    return summary
# Fetch the 3B GGUF model file from the Hub; hf_hub_download's return value is
# used below as the path llama.cpp loads from.
model_path = hf_hub_download(
    filename="llama3.2-3b-instruct-finetuned.gguf",
    repo_id="ft-lora/llama3.2-3b-gguf-q4km",
)

# llama.cpp settings chosen for speed on the Space's CPU.
_LLAMA_KWARGS = {
    "n_ctx": 1024,  # smaller context -> faster on CPU
    "n_threads": 2,  # use both vCPUs on HF Spaces
    "use_mmap": True,  # memory-mapped loading
    "chat_format": "llama-3",
}

# Single shared model instance used by the chat handlers.
llm = Llama(model_path=model_path, **_LLAMA_KWARGS)
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """
    Stream a chat completion for ``message`` given the prior conversation.

    Args:
        message: The new user message.
        history: Earlier turns as a list of ``{role, content}`` dicts; ``None``
            is treated as an empty history.
        system_message: Text placed in the leading system message.
        max_tokens: Generation limit passed to the model.
        temperature: Sampling temperature passed to the model.
        top_p: Nucleus-sampling threshold passed to the model.

    Yields:
        The reply accumulated so far, once per streamed chunk, so the caller
        can display the text as it grows.
    """
    messages = [{"role": "system", "content": system_message}]
    # history is already a list of {role, content} dicts
    messages.extend(history or [])
    messages.append({"role": "user", "content": message})

    response = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        delta = chunk["choices"][0]["delta"]
        # A delta may omit "content" or carry it as None; `.get(key, "")`
        # would pass the None through and break the concatenation.
        response += delta.get("content") or ""
        yield response
def _should_search(decision_text):
    """Return True when the first whole-word yes/no in the reply is "yes"."""
    import re  # local import: only this helper needs it

    # Whole-word match, so "yesterday" or "know" do not count, and a reply
    # such as "No. Yes would be ..." is decided by its leading "No".
    verdict = re.search(r"\b(yes|no)\b", decision_text, re.IGNORECASE)
    return verdict is not None and verdict.group(1).lower() == "yes"


def agent_respond(question, history, system_message, max_tokens=256, temperature=0.7, top_p=0.95):
    """
    Answer ``question``, optionally consulting a web search first.

    The model is first asked whether a web search is needed. If it says yes,
    ``search_web(question)`` is called and its output is attached to the
    question as an observation. The final answer is then streamed.

    Args:
        question: The new user message.
        history: Earlier turns as a list of ``{role, content}`` dicts; ``None``
            is treated as an empty history.
        system_message: Text placed in the leading system message.
        max_tokens: Generation limit for the final answer.
        temperature: Sampling temperature for the final answer.
        top_p: Nucleus-sampling threshold for both model calls.

    Yields:
        The final answer accumulated so far, once per streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}] + list(history or [])

    prompt = (
        f"Question: {question}\n"
        "You are an AI assistant that can use the tool `search_web(query)` to get up-to-date information.\n"
        "Decide if you need to search the web to answer this question.\n"
        "Respond with only 'Yes' or 'No'.\n"
        "Action:"
    )

    # The decision is never shown to the user, so it is fetched in one piece
    # rather than streamed. It is sampled greedily so that routing does not
    # become random when the user picks a high temperature for the answer.
    decision = llm.create_chat_completion(
        messages=messages + [{"role": "user", "content": prompt}],
        max_tokens=32,  # small for decision
        temperature=0.0,
        top_p=top_p,
    )
    action_response = decision["choices"][0]["message"].get("content") or ""

    # Search if appropriate
    if _should_search(action_response):
        search_results = search_web(question)
        observation = f"Observation: {search_results}\nAnswer:"
    else:
        observation = "Answer:"

    # Ask model to generate final answer
    final_response = ""
    for chunk in llm.create_chat_completion(
        messages=messages + [{"role": "user", "content": f"{question}\n{observation}"}],
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        delta = chunk["choices"][0]["delta"]
        # A delta may omit "content" or carry it as None; treat both as "".
        final_response += delta.get("content") or ""
        yield final_response
# Chat UI wired to the search-capable agent. The extra inputs are listed in
# the order of agent_respond's parameters after (question, history):
# system_message, max_tokens, temperature, top_p.
chatbot = gr.ChatInterface(
    fn=agent_respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful Chatbot who will help the user find the right answers.",
            label="System message",
        ),
        # Smaller default generation length for faster replies
        gr.Slider(
            minimum=1,
            maximum=512,
            value=128,
            step=1,
            label="Max new tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
# Place the chat interface inside a Blocks container.
with gr.Blocks() as demo:
    chatbot.render()

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()