File size: 4,893 Bytes
9f97bdb
9817a92
9f97bdb
 
 
 
 
 
 
 
 
 
 
9817a92
9f97bdb
9817a92
9f97bdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9817a92
9f97bdb
 
 
9817a92
 
2359e74
9f97bdb
2359e74
 
9f97bdb
9817a92
9f97bdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9817a92
 
9f97bdb
 
9817a92
 
 
 
 
9f97bdb
 
9817a92
 
 
2359e74
 
9f97bdb
 
2359e74
 
 
 
 
9f97bdb
2359e74
9f97bdb
 
 
2359e74
9f97bdb
 
2359e74
9f97bdb
 
 
 
2359e74
 
9f97bdb
 
 
 
 
2359e74
 
9f97bdb
 
2359e74
9f97bdb
 
 
2359e74
9f97bdb
 
 
 
2359e74
9f97bdb
9817a92
 
9f97bdb
9817a92
 
2359e74
9f97bdb
 
9817a92
 
 
 
 
 
 
 
 
 
 
9f97bdb
 
9817a92
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import subprocess
import sys

import gradio as gr
from huggingface_hub import hf_hub_download

# Install llama_cpp_python in the Space at startup.
# Run pip via the current interpreter with an argument list (no shell string):
# avoids shell-quoting issues and guarantees the package lands in this env.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "llama_cpp_python==0.3.1"]
)
from llama_cpp import Llama

subprocess.run([sys.executable, "-m", "pip", "install", "requests"])
import requests


def duckduckgo_search(query, max_results=3):
    """
    Query the DuckDuckGo Instant Answer API and return a text summary.

    Parameters
    ----------
    query : str
        The search query.
    max_results : int
        Max number of related topics (and sub-topics per topic) to include.

    Returns
    -------
    str
        Newline-joined result snippets, or a human-readable message when
        nothing was found or the request failed (this function never raises).
    """
    url = "https://api.duckduckgo.com/"
    params = {
        "q": query,
        "format": "json",
        "no_redirect": 1,
        "skip_disambig": 1,
    }

    try:
        # Timeout so a hung request can't stall the whole chat turn.
        resp = requests.get(url, params=params, timeout=10)
        resp.raise_for_status()  # surface HTTP errors instead of parsing junk
        data = resp.json()

        results = []
        # Primary abstract, when the API has one.
        if data.get("AbstractText"):
            results.append(data["AbstractText"])

        # Related topics sometimes carry extra info; grouped topics nest
        # their entries under a "Topics" key.
        for topic in data.get("RelatedTopics", [])[:max_results]:
            if "Text" in topic:
                results.append(topic["Text"])
            elif "Topics" in topic:
                for subtopic in topic["Topics"][:max_results]:
                    text = subtopic.get("Text")
                    if text:  # skip text-less entries instead of appending ""
                        results.append(text)

        return "\n".join(results) if results else "No relevant results found."

    except Exception as e:
        # Best-effort tool: report the failure as text rather than crash chat.
        return f"Error fetching search results: {e}"

def search_web(query):
    """Tool entry point for the agent: delegate to the DuckDuckGo search."""
    results = duckduckgo_search(query)
    return results


# Download 3B GGUF model into HF Space storage.
# hf_hub_download caches the file locally and returns its filesystem path.
model_path = hf_hub_download(
    repo_id="ft-lora/llama3.2-3b-gguf-q4km",
    filename="llama3.2-3b-instruct-finetuned.gguf"
)

# Initialize llama.cpp with smaller context & both CPU cores.
# These settings trade context length for latency on a CPU-only Space.
llm = Llama(
    model_path=model_path,
    n_ctx=1024,        # smaller context -> faster on CPU
    n_threads=2,       # use both vCPUs on HF Spaces
    use_mmap=True,     # memory-mapped loading
    chat_format="llama-3",  # must match the model's chat template
)


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Stream a chat completion for *message*, yielding the growing reply."""
    convo = [{"role": "system", "content": system_message}]
    convo.extend(history)  # history is already a list of {role, content} dicts
    convo.append({"role": "user", "content": message})

    reply = ""
    stream = llm.create_chat_completion(
        messages=convo,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    )
    for chunk in stream:
        piece = chunk["choices"][0]["delta"].get("content", "")
        reply += piece
        yield reply

def agent_respond(question, history, system_message, max_tokens=256, temperature=0.7, top_p=0.95):
    """
    ReAct-style agent turn: first ask the model whether a web search is
    needed, optionally run `search_web`, then stream the final answer.

    Parameters mirror the Gradio ChatInterface contract; yields the
    progressively-built answer string for streaming display.
    """
    messages = [{"role": "system", "content": system_message}] + history

    decision_prompt = (
        f"Question: {question}\n"
        "You are an AI assistant that can use the tool `search_web(query)` to get up-to-date information.\n"
        "Decide if you need to search the web to answer this question.\n"
        "Respond with only 'Yes' or 'No'.\n"
        "Action:"
    )

    # Single non-streaming call for the yes/no decision — we only need the
    # final short string, so accumulating streamed deltas was pointless work.
    decision = llm.create_chat_completion(
        messages=messages + [{"role": "user", "content": decision_prompt}],
        max_tokens=32,  # small budget: the answer is just "Yes"/"No"
        stream=False,
        temperature=temperature,
        top_p=top_p,
    )
    action_response = decision["choices"][0]["message"].get("content") or ""

    # Run the tool only when the model asked for it.
    if "yes" in action_response.lower():
        search_results = search_web(question)
        observation = f"Observation: {search_results}\nAnswer:"
    else:
        observation = "Answer:"

    # Stream the final answer, grounding it on the observation when present.
    final_response = ""
    for chunk in llm.create_chat_completion(
        messages=messages + [{"role": "user", "content": f"{question}\n{observation}"}],
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        delta = chunk["choices"][0]["delta"]
        token = delta.get("content", "")
        final_response += token
        yield final_response


# Chat UI wired to the agent; `type="messages"` makes Gradio pass history
# as a list of {role, content} dicts, which agent_respond expects.
chatbot = gr.ChatInterface(
    agent_respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a helpful Chatbot who will help the user find the right answers.", label="System message"),
        # Smaller default generation length for faster replies
        gr.Slider(minimum=1, maximum=512, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

# Wrap the chat interface in a Blocks app so the layout can be extended later.
demo = gr.Blocks()
with demo:
    chatbot.render()


# Launch only when run as a script (Spaces executes the file directly).
if __name__ == "__main__":
    demo.launch()