File size: 17,898 Bytes
a264557
a170059
 
052aa3e
a170059
3bf855f
a264557
3bf855f
d75d9a2
93dbedf
c1f3f5c
93dbedf
3bf855f
 
b83c856
3bf855f
93dbedf
3bf855f
 
 
 
 
 
 
 
 
c1f3f5c
3bf855f
 
 
93dbedf
3bf855f
 
93dbedf
3bf855f
 
 
 
c1f3f5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bf855f
c1f3f5c
3bf855f
 
c1f3f5c
3bf855f
 
 
c1f3f5c
3bf855f
 
cc67291
3bf855f
 
 
 
 
 
 
 
 
 
c1f3f5c
3bf855f
d4303f4
 
cdedb37
 
 
c1f3f5c
 
 
 
cdedb37
 
d4303f4
c1f3f5c
3bf855f
 
 
 
cc67291
3bf855f
d4303f4
c1f3f5c
 
 
 
cdedb37
c1f3f5c
 
 
cdedb37
 
c1f3f5c
cdedb37
c1f3f5c
 
3bf855f
 
 
 
c1f3f5c
cdedb37
 
 
 
3bf855f
 
c1f3f5c
 
 
 
3bf855f
 
 
 
 
a264557
a170059
3bf855f
c1f3f5c
cdedb37
3bf855f
 
c1f3f5c
 
 
 
 
 
 
 
 
 
cdedb37
c1f3f5c
 
 
 
 
 
 
3bf855f
d4303f4
c1f3f5c
d4303f4
 
c1f3f5c
3bf855f
cdedb37
 
c1f3f5c
 
 
 
 
 
 
 
 
 
 
 
 
cdedb37
cc67291
d4303f4
c1f3f5c
d4303f4
 
 
 
cc67291
cdedb37
cc67291
c1f3f5c
 
 
 
 
 
cdedb37
c1f3f5c
 
 
 
cdedb37
3bf855f
 
 
 
 
 
cdedb37
3bf855f
cdedb37
 
c1f3f5c
3bf855f
 
c1f3f5c
cdedb37
 
c1f3f5c
3bf855f
c1f3f5c
 
 
 
 
 
3bf855f
93dbedf
 
 
 
3bf855f
cc67291
3bf855f
 
cc67291
3bf855f
 
a170059
3bf855f
a170059
 
 
 
 
 
e80aab9
a170059
cc67291
93dbedf
3bf855f
 
a264557
052aa3e
3bf855f
 
a170059
 
93dbedf
a170059
 
 
3bf855f
cc67291
a170059
cc67291
 
a170059
cc67291
 
a170059
3bf855f
052aa3e
a170059
 
cc67291
a170059
cdedb37
3bf855f
d4303f4
 
a170059
052aa3e
 
a170059
 
 
 
93dbedf
052aa3e
a170059
c1f3f5c
cc67291
a170059
 
 
 
c1f3f5c
a170059
 
cc67291
 
 
052aa3e
cc67291
a170059
cc67291
052aa3e
3bf855f
cdedb37
d4303f4
 
cdedb37
c1f3f5c
d4303f4
 
a170059
 
052aa3e
93dbedf
cc67291
e80aab9
 
052aa3e
 
3bf855f
cc67291
 
d4303f4
cc67291
 
3bf855f
052aa3e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
import os
import gradio as gr
import requests
import inspect
import pandas as pd
import re # For parsing LLM output

# --- HF Inference API for LLM ---
from huggingface_hub import InferenceClient

# Chat model served through the HF Inference API; used by ReActAgent below.
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"

try:
    # HF_TOKEN may be None; InferenceClient accepts token=None (anonymous access),
    # though a token avoids rate limits on hosted models.
    hf_token = os.getenv("HF_TOKEN")
    llm_client = InferenceClient(model=LLM_MODEL, token=hf_token)
except Exception as e:
    # Keep the module importable even when client construction fails;
    # downstream code (run_and_submit_all, __main__) checks `llm_client is None`.
    print(f"Error initializing InferenceClient: {e}")
    llm_client = None

# --- Tools ---
from duckduckgo_search import DDGS

def search_tool(query: str) -> str:
    """Search the web via DuckDuckGo and return up to 3 formatted results.

    Each hit is rendered as "Title/Snippet/URL" lines so the LLM can cite it.
    Never raises: no-hit and error cases both return a human-readable message
    the agent loop can surface as an Observation.

    NOTE: ReActAgent builds its tool list with inspect.getdoc(), so this
    docstring doubles as the tool description shown to the LLM (the original
    had no docstring, which made the description render as "None").
    """
    print(f"Tool: search_tool, Query: {query}")
    try:
        with DDGS() as ddgs:
            # Cap at 3 results to keep the Observation short in the prompt.
            results = ddgs.text(query, max_results=3)
            if results:
                return "\n".join(
                    f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}"
                    for r in results
                )
            return ("No results found for your query. This might mean the query "
                    "returned no relevant documents, or there could be a temporary "
                    "issue (e.g., rate limit).")
    except Exception as e:
        print(f"Error in search_tool: {e}")
        return (f"Error performing search: {str(e)}. This could be due to a "
                "network issue, an invalid query, or a rate limit.")

def calculator_tool(expression: str) -> str:
    """Evaluate a basic arithmetic expression and return the result as a string.

    Supports numbers, the operators + - * / %, parentheses, decimals, plus the
    names sqrt(x) and pi. Anything else is rejected before evaluation. Never
    raises: errors come back as descriptive strings so the agent loop can show
    them as Observations.

    NOTE: ReActAgent builds its tool list with inspect.getdoc(), so this
    docstring doubles as the tool description shown to the LLM (the original
    had no docstring, which made the description render as "None").
    """
    print(f"Tool: calculator_tool, Expression: {expression}")
    try:
        # Names available during evaluation; everything else — including all
        # builtins — is blocked below.
        allowed_names = {"sqrt": lambda x: x ** 0.5, "pi": 3.1415926535}

        # Quick character screen: digits, whitespace, operators, parens,
        # dot/percent, and letters (letters are only useful for sqrt/pi and
        # are fully validated via co_names below).
        if not re.match(r"^[0-9\s\+\-\*\/\(\)\.\%a-zA-Z]+$", expression):
            return (f"Error: Invalid characters in expression. Only numbers, "
                    f"basic operators, sqrt, pi allowed. Expression: {expression}")

        # Compile first so every identifier the expression references is
        # visible in co_names and can be checked against the whitelist.
        code = compile(expression, "<string>", "eval")
        for name in code.co_names:
            # BUG FIX: the original tested `name not in __builtins__`, which
            # raises TypeError when __builtins__ is the builtins *module*
            # (the normal case in a script). Only whitelisted names are safe
            # to allow anyway, since eval below runs with empty builtins.
            if name not in allowed_names:
                raise NameError(f"Use of {name} is not allowed")

        # Evaluate with builtins stripped and only the whitelist in scope.
        result = eval(code, {"__builtins__": {}}, allowed_names)
        return str(result)

    except Exception as e:
        print(f"Error in calculator_tool: {e}")
        return (f"Error calculating: {str(e)}. Ensure the expression is valid "
                "and uses allowed functions/operators.")

# --- Agent Definition ---
class ReActAgent:
    """A minimal ReAct (Reason + Act) loop over an HF InferenceClient.

    Each iteration: prompt the LLM for a Thought/Action, execute the named
    tool locally, append the Observation to a scratchpad, and repeat until a
    "Final Answer:" appears or max_iterations is exhausted.
    """

    def __init__(self, llm_client, tools: dict, max_iterations=7):
        """Store the LLM client and tool registry and build the system prompt.

        Args:
            llm_client: object exposing .text_generation() (InferenceClient).
            tools: mapping of tool name -> callable taking one str argument.
            max_iterations: cap on Thought/Action/Observation turns.

        Raises:
            ValueError: if llm_client is None.
        """
        print("ReActAgent initialized.")
        if llm_client is None:
            raise ValueError("LLM client not initialized.")
        self.llm = llm_client
        self.tools = tools
        self.max_iterations = max_iterations

        # Tool descriptions come from each callable's docstring.
        # NOTE(review): inspect.getdoc() returns None for undocumented tools,
        # which would render as "- name: None" in the prompt — ensure every
        # registered tool has a docstring.
        self.tool_descriptions = "\n".join([
            f"- {name}: {inspect.getdoc(func)}"
            for name, func in tools.items()
        ])
        self.tool_names = ", ".join(tools.keys())

        # Refined prompt for better tool usage and stopping
        # This is an f-string: {self.tool_descriptions} is interpolated now,
        # while {'{scratchpad}'} survives as a literal "{scratchpad}"
        # placeholder filled per-turn by str.format in __call__.
        self.react_prompt_template = inspect.cleandoc(f"""
            You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
            Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.

            You will proceed in a Thought, Action, Observation loop.
            1. First, provide a "Thought:" explaining your reasoning for the current question.
            2. Next, provide an "Action:".
                - If you need to search the web, use search_tool[query].
                - If you need to perform a calculation (e.g., arithmetic like 5*5, or math expressions), use calculator_tool[expression].
                - If no tool is needed for this immediate step based on your current thought and the information available, use "Action: None". Only use Action: None if you are certain no tool can help or is required for the current step.
            3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:".
            4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or if you have enough information, a "Final Answer:".

            The final answer itself (the text after "Final Answer:") must be an EXACT, non-empty match to the correct response, without any extra explanations, apologies, or prefixes.

            Available tools:
            {self.tool_descriptions}

            Use the following format FOR THE CURRENT QUESTION ONLY:
            Question: the input question you must answer

            {'{scratchpad}'}

            Thought: [Your reasoning and plan for the current question. If continuing from an observation, reason about that observation.]
            Action: [search_tool[query_for_search] OR calculator_tool[math_expression_to_calculate] OR Action: None]. AFTER THIS, STOP.
            Observation: [The system will provide this. Do NOT generate this part.]
            Thought: [Your reasoning based on the previous observation.]
            Action: [Another action or Action: None]. AFTER THIS, STOP.
            Observation: [The system will provide this.]
            ... (Repeat Thought/Action/STOP/Observation as needed)
            Thought: I have sufficient information to answer the current question.
            Final Answer: [Provide ONLY the precise, non-empty answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".]

            Start your response for the current turn with "Thought:".
        """) # Removed initial "Question: {question}" here, it's now part of the formatted prompt


    def run_llm(self, prompt: str) -> str:
        """Call the LLM once and return its stripped text, or an error string.

        Stop sequences cut generation before the model fabricates an
        "Observation:" or runs past "Final Answer:"; the caller re-detects
        "Final Answer:" in the returned text.
        """
        try:
            stop_tokens = [
                "\nObservation:", "Observation:",
                "\nFinal Answer:", "Final Answer:"
            ]
            
            response = self.llm.text_generation(
                prompt,
                max_new_tokens=350,
                temperature=0.05, # Lowered further for more determinism
                do_sample=True,   # Important if temperature < 1.0
                stop=stop_tokens, # Using `stop` as per FutureWarning
            )
            return response.strip()
        except Exception as e:
            # Return (not raise) so __call__ can treat it like any other turn.
            print(f"Error during LLM call: {e}")
            return f"Error generating response: {str(e)}"

    def __call__(self, question: str) -> str:
        """Run the ReAct loop on one question and return the final answer.

        Returns the extracted "Final Answer:" text, or a standard failure
        message if max_iterations is reached without one.
        """
        print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
        scratchpad_history = "" 
        
        for i in range(self.max_iterations):
            print(f"\nIteration {i+1}")

            # Construct the prompt for the LLM for the current turn
            # The template now has {scratchpad} in the middle, then format instructions, then prompts for Thought/Action.
            # We ensure the LLM starts its generation with a Thought.
            # The initial prompt will be the template + Question + "Thought:"
            # Subsequent prompts will be template + Question + scratchpad_history + "Thought:"
            
            # The main instruction block, question, and current scratchpad history
            # NOTE(review): split("Thought:")[0] cuts at the FIRST occurrence of
            # "Thought:" in the formatted template, which appears early in the
            # numbered instructions — so most of the instruction text after that
            # point is dropped from the prompt. Looks unintentional; confirm
            # whether the split was meant to target the format-example section.
            current_prompt_base = self.react_prompt_template.format(scratchpad=scratchpad_history).split("Thought:")[0]
            current_prompt_text = f"Question: {question}\n" + current_prompt_base
            
            if not scratchpad_history: # First turn
                 current_prompt_text += "Thought:" # Prime for the first thought
            else: # Subsequent turns, scratchpad_history has previous T/A/O
                 current_prompt_text += scratchpad_history + "\nThought:" # Prime for next thought after observation

            print(f"--- PROMPT FOR LLM (Iteration {i+1}, last 300 chars) ---\n...{current_prompt_text[-300:]}\n--- END PROMPT ---")
            llm_output_this_turn = self.run_llm(current_prompt_text)

            print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
            print(llm_output_this_turn)
            print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")

            if not llm_output_this_turn:
                print("LLM returned empty or error, stopping.")
                return "Agent could not determine an answer within the allowed steps."

            # Prepend "Thought:" if LLM didn't include it (due to priming)
            # This ensures scratchpad consistency if the LLM directly starts with the thought content.
            # NOTE(review): scratchpad_history ends with "Observation: <content>\n",
            # so strip().endswith("Observation:") only matches when the observation
            # content is empty — the prepend effectively fires on the first turn
            # only. Confirm whether later turns were also meant to be covered.
            actual_llm_generation = llm_output_this_turn
            if not llm_output_this_turn.strip().startswith("Thought:") and \
               (scratchpad_history.strip().endswith("Observation:") or not scratchpad_history):
                actual_llm_generation = "Thought: " + llm_output_this_turn

            scratchpad_history += actual_llm_generation + "\n"

            # Check for Final Answer in the llm_output_this_turn
            # The llm_output_this_turn could be "Thought: ... Final Answer: ..." if no tool was needed.
            final_answer_segment = actual_llm_generation # Check the full segment for Final Answer
            all_final_answers = re.findall(r"Final Answer:\s*(.*)", final_answer_segment, re.DOTALL | re.IGNORECASE)
            
            if all_final_answers:
                # Take the LAST match, then strip any trailing ReAct markers the
                # model may have leaked after the answer text.
                answer = all_final_answers[-1].strip()
                # Clean common contamination
                if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
                if "Action:" in answer: answer = answer.split("Action:")[0].strip()
                if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
                if "Question:" in answer: answer = answer.split("Question:")[0].strip()
                inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
                if inner_final_answers: answer = inner_final_answers[-1].strip()
                
                if answer: # Only if the answer is not empty after cleaning
                    print(f"Found and extracted Final Answer: '{answer}'")
                    return answer
                else:
                    print("LLM produced 'Final Answer:' but the content was empty or invalid after cleaning. Continuing.")
                    # Scratchpad already has this turn's problematic output. Loop continues.

            # Parse Action from llm_output_this_turn (or actual_llm_generation)
            action_segment = actual_llm_generation # Check the full segment for Action
            action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", action_segment, re.DOTALL)
            action_none_match = re.search(r"Action:\s*None", action_segment, re.IGNORECASE)

            if action_match:
                tool_name = action_match.group(1).strip()
                tool_input = action_match.group(2).strip()
                if tool_name in self.tools:
                    print(f"Executing Tool: {tool_name}, Input: {tool_input}")
                    try:
                        observation_content = self.tools[tool_name](tool_input)
                    except Exception as e:
                        # Tool failures become Observations so the LLM can adapt.
                        observation_content = f"Error executing tool {tool_name}: {e}"
                    print(f"Observation content: {observation_content[:200]}...")
                    scratchpad_history += f"Observation: {observation_content}\n"
                else:
                    print(f"Unknown tool: {tool_name}")
                    scratchpad_history += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
            elif action_none_match:
                print("Action: None detected.")
                scratchpad_history += f"Observation: No action taken, proceeding with reasoning.\n"
            else:
                print("No valid Action (tool use or None) found in LLM output for this turn. LLM might be thinking or its format is off.")
                # If the LLM is supposed to always output an Action (even None) but doesn't,
                # it's a deviation. We add a generic observation to try and get it back on track.
                # This can happen if it only outputs a Thought.
                scratchpad_history += "Observation: LLM did not provide an Action in the expected format. Please provide a Thought and then an Action (or Action: None).\n"


        print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
        standard_failure_message = "Agent could not determine an answer within the allowed steps."
        print(f"Returning standard failure message: {standard_failure_message}")
        return standard_failure_message

# --- Constants ---
# Scoring service for the HF Agents course (unit 4); run_and_submit_all derives
# its /questions and /submit endpoints from this base URL.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# --- Main Execution Logic ---
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch all questions, run the ReAct agent on each, and submit answers.

    Args:
        profile: OAuth profile injected by Gradio's LoginButton; None when the
            user is not logged in.

    Returns:
        A (status_message, results_dataframe) tuple for the Gradio outputs.
        The dataframe is None when setup fails before any question is run.
    """
    space_id = os.getenv("SPACE_ID")
    if profile:
        username = f"{profile.username}"
    else:
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # Build the agent up front; bail out with a readable message on failure.
    try:
        available_tools = {"search_tool": search_tool, "calculator_tool": calculator_tool}
        if llm_client is None:
            return "LLM Client could not be initialized. Check logs and HF_TOKEN.", None
        agent = ReActAgent(llm_client=llm_client, tools=available_tools)
    except Exception as e:
        return f"Error initializing agent: {e}", None

    # Link to this Space's source so the scoring service can record it.
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code not available (SPACE_ID not set)"

    try:
        response = requests.get(questions_url, timeout=20)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty or invalid format.", None
    except Exception as e:
        return f"Error fetching questions: {e}", None

    results_log, answers_payload = [], []
    for item in questions_data:
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or question_text is None:
            continue
        try:
            print(f"\n--- Processing Task ID: {task_id}, Question: {question_text[:100]}... ---")
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
            print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
        except Exception as e:
            # One failing task must not abort the whole run; record and continue.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
            answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})

    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}

    try:
        response = requests.post(submit_url, json=submission_data, timeout=120)
        response.raise_for_status()
        result_data = response.json()
        # BUG FIX: the original assigned `final__status` (double underscore)
        # but returned `final_status`, raising NameError on every SUCCESSFUL
        # submission. One consistent name fixes it.
        final_status = (
            f"Submission Successful!\nUser: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        return final_status, pd.DataFrame(results_log)
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_detail += f" Detail: {e.response.json().get('detail', e.response.text)}"
        except Exception:
            # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed while formatting the error body.
            error_detail += f" Response: {e.response.text[:500]}"
        return f"Submission Failed: {error_detail}", pd.DataFrame(results_log)
    except Exception as e:
        return f"An unexpected error occurred during submission: {e}", pd.DataFrame(results_log)

# --- Gradio Interface ---
# Minimal UI: login button, one run button, a status textbox, and a table of
# per-question answers.
with gr.Blocks() as demo:
    gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
    gr.Markdown( 
        """
        **Instructions & Disclaimers:**
        Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences.
        Check logs for RAW LLM OUTPUT and PROMPT FOR LLM for debugging.
        """
    )
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers")
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
    # NOTE(review): no `inputs=` here — this appears to rely on Gradio
    # auto-injecting the OAuth profile from run_and_submit_all's
    # `gr.OAuthProfile` parameter annotation; confirm against Gradio docs.
    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])

if __name__ == "__main__":
    # Startup diagnostics: report Space environment variables and LLM client
    # health before launching the UI, so misconfiguration is visible in logs.
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")
    if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
    if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")

    if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
    else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
    print("-"*(60 + len(" App Starting ")) + "\n")
    # share=False: running inside a Space, no public tunnel needed.
    demo.launch(debug=True, share=False)