Kajlid committed · verified
Commit 9f97bdb · 1 Parent(s): f207fcf

Update app.py with search action

Files changed (1)
app.py +130 -32
app.py CHANGED
@@ -1,54 +1,153 @@
+import subprocess
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
+from huggingface_hub import hf_hub_download
+
+# Install llama_cpp_python in the Space
+subprocess.run("pip install llama_cpp_python==0.3.1", shell=True)
+from llama_cpp import Llama
+
+subprocess.run("pip install requests", shell=True)
+import requests
+
+
+def duckduckgo_search(query, max_results=3):
     """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
+    Perform a DuckDuckGo search and return summarized results.
     """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
-    messages = [{"role": "system", "content": system_message}]
-
-    messages.extend(history)
-
-    messages.append({"role": "user", "content": message})
-
+    url = "https://api.duckduckgo.com/"
+    params = {
+        "q": query,
+        "format": "json",
+        "no_redirect": 1,
+        "skip_disambig": 1
+    }
+
+    try:
+        resp = requests.get(url, params=params)
+        data = resp.json()
+
+        results = []
+        # Add AbstractText if available for the source
+        if data.get("AbstractText"):
+            results.append(data["AbstractText"])
+
+        # Related topics sometimes have extra info
+        for topic in data.get("RelatedTopics", [])[:max_results]:
+            if "Text" in topic:
+                results.append(topic["Text"])
+            elif "Topics" in topic:
+                for subtopic in topic["Topics"][:max_results]:
+                    results.append(subtopic.get("Text", ""))
+
+        return "\n".join(results) if results else "No relevant results found."
+
+    except Exception as e:
+        return f"Error fetching search results: {e}"
+
+def search_web(query):
+    """Perform a web search and return summarized results."""
+    return duckduckgo_search(query)
+
+
+# Download 1B GGUF model into HF Space storage
+model_path = hf_hub_download(
+    repo_id="ft-lora/llama3.2-1b-gguf-auto",        # 1B GGUF repo
+    filename="llama3.2-1b-instruct-finetuned.gguf"  # 1B GGUF file
+)
+
+# Initialize llama.cpp with smaller context & both CPU cores
+llm = Llama(
+    model_path=model_path,
+    n_ctx=1024,      # smaller context -> faster on CPU
+    n_threads=2,     # use both vCPUs on HF Spaces
+    use_mmap=True,   # memory-mapped loading
+    chat_format="llama-3",
+)
+
+
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    messages = [{"role": "system", "content": system_message}]
+
+    # history is already a list of {role, content} dicts
+    for conv in history:
+        messages.append(conv)
+
+    messages.append({"role": "user", "content": message})
     response = ""
 
-    for message in client.chat_completion(
-        messages,
+    for chunk in llm.create_chat_completion(
+        messages=messages,
         max_tokens=max_tokens,
         stream=True,
         temperature=temperature,
         top_p=top_p,
     ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-
+        delta = chunk["choices"][0]["delta"]
+        token = delta.get("content", "")
         response += token
         yield response
 
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
+def agent_respond(question, history, system_message, max_tokens=128, temperature=0.7, top_p=0.95):
+    """
+    Agent loop: The model decides if it needs to search, calls the tool if necessary, then responds.
+    """
+
+    messages = [{"role": "system", "content": system_message}]
+    for conv in history:
+        messages.append(conv)
+
+    prompt = (
+        f"Question: {question}\n"
+        "You are an AI assistant that can use the tool `search_web(query)` to get up-to-date information.\n"
+        "Only search the web if that will give a more reliable response.\n"
+        "Decide if you need to search the web to answer this question.\n"
+        "Respond with only `Yes` or `No`.\n"
+        "Action:"
+    )
+
+    action_response = ""
+    for chunk in llm.create_chat_completion(
+        messages=messages + [{"role": "user", "content": prompt}],
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p
+    ):
+        delta = chunk["choices"][0]["delta"]
+        token = delta.get("content", "")
+        action_response += token
+        # partial response
+        # yield action_response
+
+    # Check if the model decided to search
+    if "yes" in action_response.lower():
+        search_results = search_web(question)
+        observation = f"Observation: {search_results}\nAnswer:"
+    else:
+        observation = "Answer:"
+
+    # Ask the model to give final answer
+    final_response = ""
+    for chunk in llm.create_chat_completion(
+        messages=messages + [{"role": "user", "content": prompt + "\n" + observation}],
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p
+    ):
+        delta = chunk["choices"][0]["delta"]
+        token = delta.get("content", "")
+        final_response += token
+        yield final_response
+
+
 chatbot = gr.ChatInterface(
-    respond,
+    agent_respond,
     type="messages",
     additional_inputs=[
         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        # Smaller default generation length for faster replies
+        gr.Slider(minimum=1, maximum=512, value=128, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
@@ -60,9 +159,8 @@ chatbot = gr.ChatInterface(
     ],
 )
 
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
+demo = gr.Blocks()
+with demo:
     chatbot.render()
 
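The committed search tool calls DuckDuckGo's Instant Answer API directly. For reference, the same request can be exercised outside the Space; in this sketch the query string and the timeout are illustrative additions, not part of the committed code:

import requests

# Same endpoint and parameters that duckduckgo_search() uses above.
resp = requests.get(
    "https://api.duckduckgo.com/",
    params={"q": "llama.cpp", "format": "json", "no_redirect": 1, "skip_disambig": 1},
    timeout=10,  # illustrative; the committed code sets no timeout
)
data = resp.json()
print(data.get("AbstractText") or "No abstract returned.")

Likewise, because agent_respond() is a plain generator, the two-step agent loop (decide whether to search, then answer) can be driven without the Gradio UI. A minimal sketch, assuming the GGUF model has finished downloading; the question is illustrative:

# Each yield is the answer-so-far; keep the last one.
answer = ""
for partial in agent_respond(
    "Who won the 2022 FIFA World Cup?",
    history=[],
    system_message="You are a friendly Chatbot.",
):
    answer = partial
print(answer)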