Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -54,25 +54,31 @@ class LangGraphAgent:
|
|
| 54 |
if not OPENROUTER_API_KEY:
|
| 55 |
raise ValueError("OPENROUTER_API_KEY is not set. Cannot initialize LLM.")
|
| 56 |
|
|
|
|
|
|
|
|
|
|
| 57 |
if llm_choice == "llama":
|
| 58 |
self.llm = ChatOpenAI(
|
| 59 |
-
model="meta-llama/llama-
|
| 60 |
api_key=OPENROUTER_API_KEY,
|
| 61 |
base_url="https://openrouter.ai/api/v1",
|
| 62 |
-
temperature=0.1,
|
| 63 |
-
# max_tokens=150 # Llama 8B might benefit from a smaller max_token for concise answers
|
| 64 |
)
|
| 65 |
-
|
|
|
|
|
|
|
| 66 |
elif llm_choice == "qwen":
|
| 67 |
self.llm = ChatOpenAI(
|
| 68 |
-
model="qwen/
|
| 69 |
api_key=OPENROUTER_API_KEY,
|
| 70 |
base_url="https://openrouter.ai/api/v1",
|
| 71 |
-
temperature=0.1
|
| 72 |
)
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
else:
|
| 75 |
-
raise ValueError(f"Unsupported LLM choice: {llm_choice}. Choose '
|
| 76 |
|
| 77 |
self.tools_map = {tool.name: tool for tool in tools}
|
| 78 |
self.graph = self._build_graph()
|
|
@@ -99,6 +105,10 @@ class LangGraphAgent:
|
|
| 99 |
|
| 100 |
def _should_call_tools(self, state: AgentState) -> str:
|
| 101 |
print("LLM deciding next step...")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
last_message = state["messages"][-1]
|
| 103 |
if hasattr(last_message, "tool_calls") and last_message.tool_calls:
|
| 104 |
print(f"LLM decided to call tools: {last_message.tool_calls}")
|
|
@@ -107,10 +117,15 @@ class LangGraphAgent:
|
|
| 107 |
return "end"
|
| 108 |
|
| 109 |
def _call_llm(self, state: AgentState) -> Dict[str, Any]:
|
| 110 |
-
print("Calling LLM...")
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
print(f"LLM response: {response.content[:100]}...")
|
| 115 |
return {"messages": [response]}
|
| 116 |
|
|
@@ -165,26 +180,45 @@ class LangGraphAgent:
|
|
| 165 |
|
| 166 |
if final_graph_state and final_graph_state["messages"]:
|
| 167 |
for msg in reversed(final_graph_state["messages"]):
|
| 168 |
-
if isinstance(msg, AIMessage) and not msg.tool_calls:
|
| 169 |
answer = msg.content.strip()
|
|
|
|
|
|
|
|
|
|
| 170 |
# Remove common prefixes that LLMs might add despite instructions
|
| 171 |
prefixes_to_remove = [
|
| 172 |
"FINAL ANSWER:", "The answer is", "Here is the answer:",
|
| 173 |
-
"The final answer is", "Answer:", "Solution:"
|
|
|
|
|
|
|
| 174 |
]
|
| 175 |
for prefix in prefixes_to_remove:
|
| 176 |
-
|
|
|
|
| 177 |
answer = answer[len(prefix):].strip()
|
| 178 |
|
| 179 |
-
#
|
| 180 |
-
if
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
print(f"Agent returning answer: {answer}")
|
| 185 |
return answer
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
else:
|
| 189 |
print("Error: Agent did not reach a final state or no messages found.")
|
| 190 |
return "Error: Agent did not produce a conclusive answer."
|
|
@@ -196,7 +230,7 @@ class LangGraphAgent:
|
|
| 196 |
return f"Error during agent execution: {e}"
|
| 197 |
|
| 198 |
# --- Main Evaluation Logic (Modified from starter) ---
|
| 199 |
-
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 200 |
"""
|
| 201 |
Fetches all questions, runs the LangGraphAgent on them, submits all answers,
|
| 202 |
and displays the results.
|
|
@@ -212,17 +246,15 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 212 |
|
| 213 |
if not OPENROUTER_API_KEY:
|
| 214 |
return "Error: OPENROUTER_API_KEY not found. Please set it in your .env file.", None
|
| 215 |
-
|
| 216 |
-
print("Warning: TAVILY_API_KEY not found. Tavily search might not work as expected.")
|
| 217 |
-
# return "Error: TAVILY_API_KEY not found. Please set it in your .env file.", None
|
| 218 |
|
| 219 |
api_url = DEFAULT_API_URL
|
| 220 |
questions_url = f"{api_url}/questions"
|
| 221 |
submit_url = f"{api_url}/submit"
|
| 222 |
|
|
|
|
| 223 |
try:
|
| 224 |
-
|
| 225 |
-
agent = LangGraphAgent(llm_choice="llama")
|
| 226 |
except Exception as e:
|
| 227 |
print(f"Error instantiating agent: {e}")
|
| 228 |
return f"Error initializing agent: {e}", None
|
|
@@ -315,7 +347,6 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 315 |
results_df = pd.DataFrame(results_log)
|
| 316 |
return status_message, results_df
|
| 317 |
|
| 318 |
-
# --- Gradio Interface (Mostly as provided) ---
|
| 319 |
with gr.Blocks() as demo:
|
| 320 |
gr.Markdown("# LangGraph GAIA Agent Evaluation Runner")
|
| 321 |
gr.Markdown(
|
|
@@ -325,20 +356,28 @@ with gr.Blocks() as demo:
|
|
| 325 |
2. **Create a `.env` file** in the root of your space with your API keys:
|
| 326 |
```
|
| 327 |
OPENROUTER_API_KEY="your_openrouter_api_key"
|
| 328 |
-
TAVILY_API_KEY="your_tavily_api_key" # Optional, but
|
| 329 |
```
|
| 330 |
3. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 331 |
-
4.
|
|
|
|
| 332 |
---
|
| 333 |
**Disclaimers:**
|
| 334 |
- Ensure your Hugging Face Space is public for the `agent_code` link to be verifiable.
|
| 335 |
- Submitting all answers can take some time as the agent processes each question.
|
| 336 |
-
-
|
| 337 |
"""
|
| 338 |
)
|
| 339 |
|
| 340 |
gr.LoginButton()
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 343 |
|
| 344 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
|
@@ -346,6 +385,7 @@ with gr.Blocks() as demo:
|
|
| 346 |
|
| 347 |
run_button.click(
|
| 348 |
fn=run_and_submit_all,
|
|
|
|
| 349 |
outputs=[status_output, results_table]
|
| 350 |
)
|
| 351 |
|
|
|
|
| 54 |
if not OPENROUTER_API_KEY:
|
| 55 |
raise ValueError("OPENROUTER_API_KEY is not set. Cannot initialize LLM.")
|
| 56 |
|
| 57 |
+
self.llm_choice = llm_choice
|
| 58 |
+
self.supports_tool_calling = False # Default to false
|
| 59 |
+
|
| 60 |
if llm_choice == "llama":
|
| 61 |
self.llm = ChatOpenAI(
|
| 62 |
+
model="meta-llama/llama-3.1-8b-instruct:free", # Llama 3.1 8B Instruct, the ":free" variant served through OpenRouter
|
| 63 |
api_key=OPENROUTER_API_KEY,
|
| 64 |
base_url="https://openrouter.ai/api/v1",
|
| 65 |
+
temperature=0.1,
|
|
|
|
| 66 |
)
|
| 67 |
+
# Llama 3.1 8B on OpenRouter might not support tool calling via the OpenAI SDK binding method
|
| 68 |
+
self.supports_tool_calling = False
|
| 69 |
+
print("Initialized Llama 3.1 8B Instruct (tool calling assumed NOT supported).")
|
| 70 |
elif llm_choice == "qwen":
|
| 71 |
self.llm = ChatOpenAI(
|
| 72 |
+
model="qwen/qwen-2-7b-instruct:free", # Using a Qwen-2 model as qwq-32b might be older
|
| 73 |
api_key=OPENROUTER_API_KEY,
|
| 74 |
base_url="https://openrouter.ai/api/v1",
|
| 75 |
+
temperature=0.1
|
| 76 |
)
|
| 77 |
+
# Qwen models on OpenRouter might not support tool calling via the OpenAI SDK binding method
|
| 78 |
+
self.supports_tool_calling = False
|
| 79 |
+
print("Initialized Qwen-2 7B Instruct (tool calling assumed NOT supported).")
|
| 80 |
else:
|
| 81 |
+
raise ValueError(f"Unsupported LLM choice: {llm_choice}. Choose 'llama', or 'qwen'.")
|
| 82 |
|
| 83 |
self.tools_map = {tool.name: tool for tool in tools}
|
| 84 |
self.graph = self._build_graph()
|
|
|
|
| 105 |
|
| 106 |
def _should_call_tools(self, state: AgentState) -> str:
|
| 107 |
print("LLM deciding next step...")
|
| 108 |
+
if not self.supports_tool_calling:
|
| 109 |
+
print("Tool calling not supported by the current LLM. Ending interaction.")
|
| 110 |
+
return "end"
|
| 111 |
+
|
| 112 |
last_message = state["messages"][-1]
|
| 113 |
if hasattr(last_message, "tool_calls") and last_message.tool_calls:
|
| 114 |
print(f"LLM decided to call tools: {last_message.tool_calls}")
|
|
|
|
| 117 |
return "end"
|
| 118 |
|
| 119 |
def _call_llm(self, state: AgentState) -> Dict[str, Any]:
|
| 120 |
+
print(f"Calling LLM ({self.llm_choice})...")
|
| 121 |
+
if self.supports_tool_calling:
|
| 122 |
+
print("Binding tools to LLM for function calling.")
|
| 123 |
+
llm_with_tools = self.llm.bind_tools(tools)
|
| 124 |
+
response = llm_with_tools.invoke(state["messages"])
|
| 125 |
+
else:
|
| 126 |
+
print("Invoking LLM without binding tools.")
|
| 127 |
+
response = self.llm.invoke(state["messages"])
|
| 128 |
+
|
| 129 |
print(f"LLM response: {response.content[:100]}...")
|
| 130 |
return {"messages": [response]}
|
| 131 |
|
|
|
|
| 180 |
|
| 181 |
if final_graph_state and final_graph_state["messages"]:
|
| 182 |
for msg in reversed(final_graph_state["messages"]):
|
| 183 |
+
if isinstance(msg, AIMessage) and not msg.tool_calls and msg.content: # Ensure content exists
|
| 184 |
answer = msg.content.strip()
|
| 185 |
+
if not answer: # Skip empty answers after initial stripping
|
| 186 |
+
continue
|
| 187 |
+
|
| 188 |
# Remove common prefixes that LLMs might add despite instructions
|
| 189 |
prefixes_to_remove = [
|
| 190 |
"FINAL ANSWER:", "The answer is", "Here is the answer:",
|
| 191 |
+
"The final answer is", "Answer:", "Solution:",
|
| 192 |
+
"The direct answer is", "Here's the concise answer:",
|
| 193 |
+
"Here you go:", "Certainly, the answer is"
|
| 194 |
]
|
| 195 |
for prefix in prefixes_to_remove:
|
| 196 |
+
# Case-insensitive prefix removal
|
| 197 |
+
if answer.lower().startswith(prefix.lower()):
|
| 198 |
answer = answer[len(prefix):].strip()
|
| 199 |
|
| 200 |
+
# Strip one pair of surrounding quote characters (the opening and closing quote are not required to match)
|
| 201 |
+
if answer.startswith(("\"", "'")) and answer.endswith(("\"", "'")):
|
| 202 |
+
temp_answer = answer[1:-1]
|
| 203 |
+
# Avoid stripping if it's a legitimately quoted string like "'quoted string'" as the answer itself
|
| 204 |
+
if not (temp_answer.startswith(("\"", "'")) and temp_answer.endswith(("\"", "'"))):
|
| 205 |
+
answer = temp_answer
|
| 206 |
+
|
| 207 |
+
if not answer: # Check again if answer became empty after stripping
|
| 208 |
+
continue
|
| 209 |
|
| 210 |
print(f"Agent returning answer: {answer}")
|
| 211 |
return answer
|
| 212 |
+
|
| 213 |
+
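# Fallback: no AI message yielded a non-empty answer after prefix/quote cleanup, so return the most recent non-empty AI message with only whitespace stripped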
|
| 214 |
+
print("No suitable AI message with valid content found after processing. Attempting to return last raw AI message if available.")
|
| 215 |
+
last_ai_msg_content = next((m.content.strip() for m in reversed(final_graph_state["messages"]) if isinstance(m, AIMessage) and m.content and not m.tool_calls), None)
|
| 216 |
+
if last_ai_msg_content:
|
| 217 |
+
print(f"Agent returning last raw AI message as fallback: {last_ai_msg_content}")
|
| 218 |
+
return last_ai_msg_content
|
| 219 |
+
|
| 220 |
+
print("No suitable AI message found for final answer, even as fallback.")
|
| 221 |
+
return "Error: Agent could not extract a valid answer." # More specific error
|
| 222 |
else:
|
| 223 |
print("Error: Agent did not reach a final state or no messages found.")
|
| 224 |
return "Error: Agent did not produce a conclusive answer."
|
|
|
|
| 230 |
return f"Error during agent execution: {e}"
|
| 231 |
|
| 232 |
# --- Main Evaluation Logic (Modified from starter) ---
|
| 233 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None, llm_model_choice: str):
|
| 234 |
"""
|
| 235 |
Fetches all questions, runs the LangGraphAgent on them, submits all answers,
|
| 236 |
and displays the results.
|
|
|
|
| 246 |
|
| 247 |
if not OPENROUTER_API_KEY:
|
| 248 |
return "Error: OPENROUTER_API_KEY not found. Please set it in your .env file.", None
|
| 249 |
+
# TAVILY_API_KEY check is handled by the tool initialization itself with a warning.
|
|
|
|
|
|
|
| 250 |
|
| 251 |
api_url = DEFAULT_API_URL
|
| 252 |
questions_url = f"{api_url}/questions"
|
| 253 |
submit_url = f"{api_url}/submit"
|
| 254 |
|
| 255 |
+
print(f"Attempting to initialize agent with LLM: {llm_model_choice}")
|
| 256 |
try:
|
| 257 |
+
agent = LangGraphAgent(llm_choice=llm_model_choice)
|
|
|
|
| 258 |
except Exception as e:
|
| 259 |
print(f"Error instantiating agent: {e}")
|
| 260 |
return f"Error initializing agent: {e}", None
|
|
|
|
| 347 |
results_df = pd.DataFrame(results_log)
|
| 348 |
return status_message, results_df
|
| 349 |
|
|
|
|
| 350 |
with gr.Blocks() as demo:
|
| 351 |
gr.Markdown("# LangGraph GAIA Agent Evaluation Runner")
|
| 352 |
gr.Markdown(
|
|
|
|
| 356 |
2. **Create a `.env` file** in the root of your space with your API keys:
|
| 357 |
```
|
| 358 |
OPENROUTER_API_KEY="your_openrouter_api_key"
|
| 359 |
+
TAVILY_API_KEY="your_tavily_api_key" # Optional, but TavilySearch tool won't work without it
|
| 360 |
```
|
| 361 |
3. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 362 |
+
4. **Select the LLM model** you want the agent to use.
|
| 363 |
+
5. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 364 |
---
|
| 365 |
**Disclaimers:**
|
| 366 |
- Ensure your Hugging Face Space is public for the `agent_code` link to be verifiable.
|
| 367 |
- Submitting all answers can take some time as the agent processes each question.
|
| 368 |
+
- The agent will use the selected LLM. Note that tool/function calling is currently disabled for both available models (llama and qwen), so the agent answers without calling tools and may perform poorly on tasks that require them.
|
| 369 |
"""
|
| 370 |
)
|
| 371 |
|
| 372 |
gr.LoginButton()
|
| 373 |
|
| 374 |
+
llm_choice_dropdown = gr.Dropdown(
|
| 375 |
+
choices=["llama", "qwen"],
|
| 376 |
+
value="llama", # Default selection; note that tool calling is currently disabled for both models
|
| 377 |
+
label="Select LLM Model",
|
| 378 |
+
info="Choose the Large Language Model for the agent."
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 382 |
|
| 383 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
|
|
|
| 385 |
|
| 386 |
run_button.click(
|
| 387 |
fn=run_and_submit_all,
|
| 388 |
+
inputs=[llm_choice_dropdown], # Add llm_choice_dropdown as an input
|
| 389 |
outputs=[status_output, results_table]
|
| 390 |
)
|
| 391 |
|