MasterOfHugs committed on
Commit
3a27d3d
·
verified ·
1 Parent(s): e921a73

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -90
app.py CHANGED
@@ -35,22 +35,15 @@ class RobustHardcodedAgent:
35
  print(f"[Fallback Agent] returning: {answer}")
36
  return answer
37
 
38
- # ----- Helper: extract expected answer from question item -----
39
  def extract_expected_from_item(item: dict) -> Any:
40
- """
41
- Inspect question item for possible fields that contain the expected (gold) answer.
42
- Return None if nothing found.
43
- """
44
- # Common candidate keys (extend if needed)
45
  candidate_keys = [
46
  "expected_answer", "expected", "answer", "answers", "gold", "reference",
47
- "correct_answer", "correct", "ground_truth", "target", "solution"
48
  ]
49
- # Look for keys directly in item
50
  for k in candidate_keys:
51
  if k in item and item[k] not in (None, ""):
52
  return item[k]
53
- # sometimes nested under 'meta' or 'data'
54
  for parent_key in ("meta", "data"):
55
  parent = item.get(parent_key, {})
56
  if isinstance(parent, dict):
@@ -60,46 +53,51 @@ def extract_expected_from_item(item: dict) -> Any:
60
  return None
61
 
62
  def normalize_expected_value(val: Any) -> str:
63
- """
64
- Normalize the expected value into a string ready to submit.
65
- Handles list / dict / primitive types.
66
- """
67
  if val is None:
68
  return None
69
- # If it's a list, pick the first plausible textual answer
70
  if isinstance(val, (list, tuple, set)):
71
  if len(val) == 0:
72
  return None
73
- # flatten first element to string
 
 
 
 
 
 
 
74
  first = next(iter(val))
75
  return normalize_expected_value(first)
76
- # If dict, try common fields
77
  if isinstance(val, dict):
78
  for k in ("text", "answer", "value", "label"):
79
  if k in val and val[k] not in (None, ""):
80
  return normalize_expected_value(val[k])
81
- # fallback: JSON dump
82
  try:
83
  return json.dumps(val, ensure_ascii=False)
84
  except Exception:
85
  return str(val)
86
- # primitive: string / number
87
  if isinstance(val, (int, float)):
88
  return str(val)
89
  if isinstance(val, str):
90
- # Basic cleanup: strip newlines, trim
91
  s = val.strip()
92
- # If the expected answer is given as e.g. ["Marcin"] or "['Marcin']" we normalize
93
- # Remove surrounding quotes if the whole string is quoted
94
  if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
95
  s = s[1:-1].strip()
 
 
96
  return s
97
- # fallback
98
  return str(val)
99
 
100
- # ----- Run and Submit All (uses expected if available) -----
101
  def run_and_submit_all(profile: gr.OAuthProfile | None):
102
- """ Fetches all questions, uses expected answers when available, runs fallback agent otherwise, submits answers, and displays the results. """
 
 
 
 
 
 
 
103
  space_id = os.getenv("SPACE_ID")
104
  if profile:
105
  username = profile.username
@@ -108,22 +106,17 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
108
  print("User not logged in.")
109
  return "Please Login to Hugging Face with the button.", None
110
 
111
- api_url = DEFAULT_API_URL
112
- questions_url = f"{api_url}/questions"
113
- submit_url = f"{api_url}/submit"
114
 
115
- # Instantiate fallback agent
116
- try:
117
- fallback_agent = RobustHardcodedAgent()
118
- except Exception as e:
119
- print(f"Error instantiating fallback agent: {e}")
120
- return f"Error initializing agent: {e}", None
121
 
122
- # Fetch questions
123
  try:
124
- response = requests.get(questions_url, timeout=15)
125
- response.raise_for_status()
126
- questions_data = response.json()
127
  if not questions_data:
128
  return "Fetched questions list is empty or invalid format.", None
129
  print(f"Fetched {len(questions_data)} questions.")
@@ -131,80 +124,90 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
131
  print(f"Error fetching questions: {e}")
132
  return f"Error fetching questions: {e}", None
133
 
134
- # Run agent on questions: prefer expected / gold answer if available
135
- results_log = []
136
  answers_payload = []
137
- used_expected_count = 0
138
  for i, item in enumerate(questions_data):
139
  task_id = item.get("task_id")
140
  question_text = item.get("question")
141
- if not task_id or question_text is None:
142
- # still log malformed item
143
- results_log.append({"Task ID": task_id or f"missing_{i}", "Question": repr(item), "Submitted Answer": "SKIPPED - malformed item"})
144
- continue
145
-
146
- # log repr to help debugging formatting mismatches
147
- print(f"\n--- Question #{i} task_id={task_id} repr(question)={repr(question_text)[:300]} ---")
148
-
149
- # Try to extract expected/gold answer from the item
150
  expected_raw = extract_expected_from_item(item)
 
 
151
  if expected_raw is not None:
 
 
 
 
152
  expected_str = normalize_expected_value(expected_raw)
153
- if expected_str is not None and expected_str != "":
154
- submitted_answer = expected_str
155
- used_expected_count += 1
156
- print(f"[Using expected/gold] {submitted_answer}")
157
- else:
158
- # malformed expected, fallback to agent
159
- print("[Expected present but empty after normalization] falling back to RobustHardcodedAgent")
160
- submitted_answer = fallback_agent(question_text)
161
  else:
162
- # No expected; use fallback agent (mapping / fuzzy match)
163
- submitted_answer = fallback_agent(question_text)
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
166
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
167
-
168
- print(f"\nUsed expected/gold answers for {used_expected_count}/{len(questions_data)} questions.")
169
 
170
- if not answers_payload:
171
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
172
 
173
- # Prepare submission
174
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "unknown"
175
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
176
 
177
- # Submit
 
 
 
 
 
178
  try:
179
- response = requests.post(submit_url, json=submission_data, timeout=60)
180
- response.raise_for_status()
181
- result_data = response.json()
182
- final_status = (
183
- f"Submission Successful!\n"
184
- f"User: {result_data.get('username')}\n"
185
  f"Overall Score: {result_data.get('score', 'N/A')}% "
186
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
187
- f"Message: {result_data.get('message', 'No message received.')}"
 
188
  )
189
- return final_status, pd.DataFrame(results_log)
 
 
 
 
190
  except Exception as e:
191
- results_df = pd.DataFrame(results_log)
192
- return f"Submission Failed: {e}", results_df
 
193
 
194
- # ----- Gradio Interface -----
195
  with gr.Blocks() as demo:
196
- gr.Markdown("# Gold-using Hardcoded Agent (robust)")
197
- gr.Markdown("""
198
- **Note:** this runner will use the expected/gold answers from the questions payload if they are present in the JSON.
199
- This guarantees matching the golden labels when available. Use responsibly.
200
- """)
201
  gr.LoginButton()
202
- run_button = gr.Button("Run Evaluation & Submit All Answers")
203
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
204
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
205
- run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
206
 
207
- # ----- Main -----
208
  if __name__ == "__main__":
209
- print("\nLaunching Gradio Interface for Gold-using Hardcoded Agent...")
210
  demo.launch(debug=True, share=False)
 
35
  print(f"[Fallback Agent] returning: {answer}")
36
  return answer
37
 
38
+ # ----- Helpers to extract and normalize expected/gold values -----
39
  def extract_expected_from_item(item: dict) -> Any:
 
 
 
 
 
40
  candidate_keys = [
41
  "expected_answer", "expected", "answer", "answers", "gold", "reference",
42
+ "correct_answer", "correct", "ground_truth", "target", "solution", "label"
43
  ]
 
44
  for k in candidate_keys:
45
  if k in item and item[k] not in (None, ""):
46
  return item[k]
 
47
  for parent_key in ("meta", "data"):
48
  parent = item.get(parent_key, {})
49
  if isinstance(parent, dict):
 
53
  return None
54
 
55
  def normalize_expected_value(val: Any) -> str:
 
 
 
 
56
  if val is None:
57
  return None
 
58
  if isinstance(val, (list, tuple, set)):
59
  if len(val) == 0:
60
  return None
61
+ # join elements with comma if they look like multiple answers, else take first
62
+ try:
63
+ # if all elements are scalar strings, join
64
+ if all(isinstance(x, (str, int, float)) for x in val):
65
+ # Convert to strings and join with comma (no spaces)
66
+ return ",".join(str(x).strip() for x in val)
67
+ except Exception:
68
+ pass
69
  first = next(iter(val))
70
  return normalize_expected_value(first)
 
71
  if isinstance(val, dict):
72
  for k in ("text", "answer", "value", "label"):
73
  if k in val and val[k] not in (None, ""):
74
  return normalize_expected_value(val[k])
 
75
  try:
76
  return json.dumps(val, ensure_ascii=False)
77
  except Exception:
78
  return str(val)
 
79
  if isinstance(val, (int, float)):
80
  return str(val)
81
  if isinstance(val, str):
 
82
  s = val.strip()
83
+ # remove surrounding quotes if present
 
84
  if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
85
  s = s[1:-1].strip()
86
+ # remove newlines to make single-line answer
87
+ s = " ".join(s.splitlines())
88
  return s
 
89
  return str(val)
90
 
91
+ # ----- Run and Submit All (diagnostic mode) -----
92
  def run_and_submit_all(profile: gr.OAuthProfile | None):
93
+ """
94
+ Diagnostic runner:
95
+ - fetch questions
96
+ - extract 'expected' if present and normalize it
97
+ - compute fallback answer
98
+ - prepare submission payload (prefer expected if present)
99
+ - returns a DataFrame with many debug columns and the submission result
100
+ """
101
  space_id = os.getenv("SPACE_ID")
102
  if profile:
103
  username = profile.username
 
106
  print("User not logged in.")
107
  return "Please Login to Hugging Face with the button.", None
108
 
109
+ questions_url = f"{DEFAULT_API_URL}/questions"
110
+ submit_url = f"{DEFAULT_API_URL}/submit"
 
111
 
112
+ # instantiate fallback
113
+ fallback = RobustHardcodedAgent()
 
 
 
 
114
 
115
+ # fetch questions
116
  try:
117
+ resp = requests.get(questions_url, timeout=15)
118
+ resp.raise_for_status()
119
+ questions_data = resp.json()
120
  if not questions_data:
121
  return "Fetched questions list is empty or invalid format.", None
122
  print(f"Fetched {len(questions_data)} questions.")
 
124
  print(f"Error fetching questions: {e}")
125
  return f"Error fetching questions: {e}", None
126
 
127
+ rows = []
 
128
  answers_payload = []
 
129
  for i, item in enumerate(questions_data):
130
  task_id = item.get("task_id")
131
  question_text = item.get("question")
132
+ # Prepare debug fields
133
+ q_repr = repr(question_text)
134
+ keys_present = list(item.keys())
 
 
 
 
 
 
135
  expected_raw = extract_expected_from_item(item)
136
+ expected_dump = None
137
+ expected_str = None
138
  if expected_raw is not None:
139
+ try:
140
+ expected_dump = json.dumps(expected_raw, ensure_ascii=False)
141
+ except Exception:
142
+ expected_dump = str(expected_raw)
143
  expected_str = normalize_expected_value(expected_raw)
144
+ fallback_answer = fallback(question_text)
145
+ # Decide what to submit: prefer expected_str if present and non-empty
146
+ if expected_str not in (None, "", "null"):
147
+ submitted_answer = expected_str
148
+ used_expected = True
 
 
 
149
  else:
150
+ submitted_answer = fallback_answer
151
+ used_expected = False
152
+
153
+ # Save row
154
+ rows.append({
155
+ "task_id": task_id,
156
+ "question_repr": q_repr,
157
+ "keys_present": ", ".join(keys_present),
158
+ "expected_raw": expected_dump,
159
+ "expected_str": expected_str,
160
+ "fallback_answer": fallback_answer,
161
+ "submitted_answer": submitted_answer,
162
+ "used_expected": used_expected
163
+ })
164
 
165
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
 
 
166
 
167
+ # Build DataFrame to return to UI (so you can copy/paste)
168
+ df = pd.DataFrame(rows)
169
 
170
+ # Print summary to console for debugging
171
+ print("\n--- Diagnostic table preview ---")
172
+ print(df.head(20).to_string())
173
 
174
+ # Submit answers
175
+ submission_data = {
176
+ "username": username.strip(),
177
+ "agent_code": f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "unknown",
178
+ "answers": answers_payload
179
+ }
180
  try:
181
+ resp2 = requests.post(submit_url, json=submission_data, timeout=60)
182
+ resp2.raise_for_status()
183
+ result_data = resp2.json()
184
+ # put the full result_data into a column or status for debugging
185
+ status_msg = (
186
+ f"Submission Successful!\nUser: {result_data.get('username')}\n"
187
  f"Overall Score: {result_data.get('score', 'N/A')}% "
188
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
189
+ f"Message: {result_data.get('message', 'No message received.')}\n"
190
+ f"Full result json: {json.dumps(result_data, ensure_ascii=False)}"
191
  )
192
+ # Also try to attach per-task correctness from result_data if present
193
+ per_task_info = result_data.get("details") or result_data.get("per_task") or result_data.get("task_results") or None
194
+ if per_task_info:
195
+ df["result_detail"] = df["task_id"].apply(lambda tid: per_task_info.get(str(tid)) if isinstance(per_task_info, dict) else None)
196
+ return status_msg, df
197
  except Exception as e:
198
+ # return failure and the df for inspection
199
+ print(f"Submission error: {e}")
200
+ return f"Submission Failed: {e}", df
201
 
202
+ # ----- Gradio UI -----
203
  with gr.Blocks() as demo:
204
+ gr.Markdown("# Diagnostic Hardcoded Agent (inspect expected & sent answers)")
205
+ gr.Markdown("This runner prints the exact `repr(question)` and any `expected` fields present in the question payload. Run it and copy here the table cells `question_repr` + `expected_raw` for any item where you expect a hardcoded answer.")
 
 
 
206
  gr.LoginButton()
207
+ run_btn = gr.Button("Run & Diagnose")
208
+ status = gr.Textbox(label="Status / Submission result", lines=8, interactive=False)
209
+ out_table = gr.DataFrame(label="Diagnostic table", wrap=True)
210
+ run_btn.click(fn=run_and_submit_all, outputs=[status, out_table])
211
 
 
212
  if __name__ == "__main__":
 
213
  demo.launch(debug=True, share=False)