# NOTE: web-viewer residue (file-size banner, per-line commit hashes, line-number gutter) removed; the source starts below.
import os
import time
import gradio as gr
import requests
import pandas as pd
from smolagents import (
    CodeAgent,
    InferenceClientModel,
    DuckDuckGoSearchTool,
    WikipediaSearchTool,
    PythonInterpreterTool,
    VisitWebpageTool,
    tool,
)

# Base URL of the scoring service; run_and_submit_all appends "/questions"
# and "/submit" to it to fetch the tasks and to post the answers.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

@tool
def get_current_date_time() -> str:
    """Returns the current date and time in ISO format."""
    # NOTE(review): the docstring above is presumably what the `tool`
    # decorator exposes to the model as the tool description, so it is kept
    # verbatim -- confirm before rewording it.
    # The import stays inside the function, as in the original, so the tool
    # body carries its own dependency.
    import datetime as _datetime

    # now() is called without a tzinfo, so this is a naive local timestamp.
    current_moment = _datetime.datetime.now()
    return current_moment.isoformat()

class StrictHuggingFaceAgent:
    """Callable wrapper around a smolagents ``CodeAgent``.

    An instance is called with a question string and returns the agent's
    answer as cleaned-up text. Errors that look like rate limiting are
    retried with a linearly growing pause; any other error is reported as an
    ``"Error: ..."`` string instead of being raised, so a single failing
    question never aborts the caller's loop.

    The tuning values below are class attributes so they can be overridden on
    a subclass or on an individual instance.
    """

    # Maximum number of times one question is attempted.
    MAX_RETRIES = 3
    # Pause (seconds) before every agent run.
    REQUEST_DELAY_SECONDS = 2
    # Pause unit (seconds) after a rate-limit error; multiplied by the
    # 1-based attempt number, i.e. 20 s, 40 s, ...
    RATE_LIMIT_BACKOFF_SECONDS = 20
    # Lower-case substrings that mark an exception message as a rate limit.
    RATE_LIMIT_MARKERS = ("429", "rate limit", "too many requests")
    # Characters trimmed from the answer. A trailing period is removed, but a
    # leading one is kept so answers such as ".5" are not turned into "5".
    _LEADING_NOISE = " '\"\n\t"
    _TRAILING_NOISE = " '\"\n\t."

    def __init__(self):
        """Build the model, the tool list and the ``CodeAgent``.

        Raises:
            ValueError: if the ``HF_TOKEN`` environment variable is not set.
        """
        print("Initializing Strict Hugging Face Agent with Few-Shot Prompting...")

        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("HF_TOKEN environment variable not set in Space Secrets.")

        self.model = InferenceClientModel(
            model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
            token=hf_token,
        )

        self.tools = [
            DuckDuckGoSearchTool(),
            WikipediaSearchTool(),
            VisitWebpageTool(),
            PythonInterpreterTool(),
            get_current_date_time,
        ]

        self.agent = CodeAgent(
            tools=self.tools,
            model=self.model,
            max_steps=7,
            additional_authorized_imports=[
                "datetime",
                "re",
                "json",
                "math",
                "collections",
                "pandas",
                "requests",
                "bs4",
            ],
        )
        print("Agent ready.")

    @classmethod
    def _clean_answer(cls, answer) -> str:
        """Return ``answer`` as text without surrounding quotes/whitespace or a trailing period."""
        return str(answer).lstrip(cls._LEADING_NOISE).rstrip(cls._TRAILING_NOISE)

    def _is_rate_limit_error(self, error: Exception) -> bool:
        """Tell whether the message of ``error`` contains one of the rate-limit markers."""
        message = str(error).lower()
        return any(marker in message for marker in self.RATE_LIMIT_MARKERS)

    def __call__(self, question: str) -> str:
        """Run the agent on ``question`` and return its cleaned answer.

        Args:
            question: Full prompt text handed to ``self.agent.run``.

        Returns:
            The cleaned answer, ``"Error: <message>"`` for a non-rate-limit
            failure, or a fixed error string once every attempt has been
            rate limited.
        """
        print(f"\nAgent received question: {question[:80]}...")

        for attempt in range(self.MAX_RETRIES):
            try:
                time.sleep(self.REQUEST_DELAY_SECONDS)
                answer = self.agent.run(question)
                clean_answer = self._clean_answer(answer)
                print(f"Agent answer: {clean_answer}")
                return clean_answer
            except Exception as e:
                if not self._is_rate_limit_error(e):
                    print(f"Agent error processing question: {e}")
                    return f"Error: {str(e)}"
                if attempt == self.MAX_RETRIES - 1:
                    # No attempt follows, so waiting here would only delay
                    # the error that is returned below.
                    print("Rate limit hit on the final attempt; giving up.")
                    break
                wait_time = self.RATE_LIMIT_BACKOFF_SECONDS * (attempt + 1)
                print(f"Rate limit hit! Pausing for {wait_time} seconds before retrying...")
                time.sleep(wait_time)

        return "Error: Rate limit exceeded after maximum retries."

# --- App Runner ---
# Output-format rules and few-shot examples appended verbatim to every question.
_ANSWER_FORMAT_INSTRUCTIONS = (
    "=== CRITICAL OUTPUT INSTRUCTIONS ===\n"
    "You are being evaluated by a strict programmatic regex parser.\n"
    "Your final answer MUST consist of ONLY the exact requested name, number, or string.\n"
    "DO NOT wrap your answer in quotes, DO NOT add a trailing period, and DO NOT provide any explanation or conversational filler.\n\n"
    "Here are examples of perfect submissions:\n"
    "Example 1\n"
    "Question: What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?\n"
    "Answer: Claus\n\n"
    "Example 2\n"
    "Question: How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?\n"
    "Answer: 519\n\n"
    "Example 3\n"
    "Question: .rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI\n"
    "Answer: right\n\n"
    "Failure to follow these instructions perfectly will result in an immediate score of 0."
)

# Pause (seconds) between two questions, to stay within inference quotas.
_QUESTION_COOLDOWN_SECONDS = 15


def _resolve_file_url(item, api_url, task_id):
    """Return the download URL of a task's attachment, or None when it has none."""
    explicit_url = item.get("file_url")
    if explicit_url:
        return explicit_url
    # NOTE(review): the scoring API is understood to describe attachments by
    # `file_name` and to serve them from /files/{task_id}, so a record without
    # `file_url` can still have a file -- confirm against the API description.
    if item.get("file_name"):
        return f"{api_url}/files/{task_id}"
    return None


def _with_file_hint(question_text, file_url):
    """Append the attachment notice to the question when a file URL is known."""
    if not file_url:
        return question_text
    return (
        question_text
        + f"\n\n[IMPORTANT: This task requires analyzing an attached file. You MUST download or read it directly from this URL: {file_url} using your Python tool.]"
    )


def _build_prompt(question_text):
    """Combine the question with the strict output-format instructions."""
    return f"{question_text}\n\n" + _ANSWER_FORMAT_INSTRUCTIONS


def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch all questions, answer them with the agent and submit the answers.

    Args:
        profile: Profile of the logged-in Hugging Face user, or None when
            nobody is logged in.

    Returns:
        A ``(status, results)`` tuple. ``status`` is a human-readable message.
        ``results`` is a ``pandas.DataFrame`` with one row per processed
        question, or None when the run stopped before any question was
        processed. Failures are reported through ``status`` rather than raised.
    """
    if not profile:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None
    username = f"{profile.username}"
    print(f"User logged in: {username}")

    space_id = os.getenv("SPACE_ID")
    if not space_id:
        # The link below is still sent, but it will not point to a real Space.
        print("Warning: SPACE_ID is not set; the submitted agent_code link will be invalid.")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    try:
        agent = StrictHuggingFaceAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "No questions.", None
    except Exception as e:
        return f"Error fetching questions: {e}", None

    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")

    last_index = len(questions_data) - 1
    for index, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")

        if not task_id or not question_text:
            continue

        # Point the agent at the attachment, if the task has one.
        question_text = _with_file_hint(
            question_text, _resolve_file_url(item, api_url, task_id)
        )

        try:
            submitted_answer = agent(_build_prompt(question_text))
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"ERROR: {e}"})

        # Cool down between questions only; nothing follows the last one, so
        # waiting there would merely delay the submission.
        if index < last_index:
            print("Cooling down for 15 seconds to protect quotas...")
            time.sleep(_QUESTION_COOLDOWN_SECONDS)

    if not answers_payload:
        return "No answers.", pd.DataFrame(results_log)

    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    print(f"Submitting {len(answers_payload)} answers...")

    try:
        # Generous 300 s timeout: the scoring server may need time to wake up.
        response = requests.post(submit_url, json=submission_data, timeout=300)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Score: {result_data.get('score')}%\n"
            f"Correct: {result_data.get('correct_count')}/{result_data.get('total_attempted')}\n"
            f"Message: {result_data.get('message')}"
        )
        print("\n" + "="*40)
        print(final_status)
        print("="*40 + "\n")
        return final_status, pd.DataFrame(results_log)
    except Exception as e:
        # Printed as well as returned so the failure shows up in the logs.
        error_msg = f"Submission failed: {e}"
        print(f"\n🚨 {error_msg} 🚨\n")
        return error_msg, pd.DataFrame(results_log)

# --- Build Gradio UI ---
# Declarative UI definition: components are created in the order they appear
# here, and the resulting Blocks app is bound to the module-level name `demo`.
with gr.Blocks() as demo:
    gr.Markdown("# Strict Hugging Face Evaluation Runner (Few-Shot Edition)")
    gr.Markdown(
        """
        **Instructions:**
        1. Ensure your fresh `HF_TOKEN` is set in Space Secrets.
        2. Log in below.
        3. Click 'Run Evaluation & Submit' to start. 
        """
    )
    # NOTE(review): run_and_submit_all takes a `gr.OAuthProfile | None`
    # argument although no inputs are wired below; presumably Gradio supplies
    # it from this login button based on the type annotation -- confirm.
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers")
    # The two output components receive the (status, results) tuple returned
    # by run_and_submit_all, in that order.
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
    
    # No `inputs` are passed: the handler is called without explicit UI values.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )

# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    print("Starting Gradio app...")
    # debug and share are passed straight through to Gradio's launch().
    demo.launch(debug=True, share=False)