Update app.py
app.py CHANGED
@@ -35,22 +35,15 @@ class RobustHardcodedAgent:
         print(f"[Fallback Agent] returning: {answer}")
         return answer

-# -----
+# ----- Helpers to extract and normalize expected/gold values -----
 def extract_expected_from_item(item: dict) -> Any:
-    """
-    Inspect question item for possible fields that contain the expected (gold) answer.
-    Return None if nothing found.
-    """
-    # Common candidate keys (extend if needed)
     candidate_keys = [
         "expected_answer", "expected", "answer", "answers", "gold", "reference",
-        "correct_answer", "correct", "ground_truth", "target", "solution"
+        "correct_answer", "correct", "ground_truth", "target", "solution", "label"
     ]
-    # Look for keys directly in item
     for k in candidate_keys:
         if k in item and item[k] not in (None, ""):
             return item[k]
-    # sometimes nested under 'meta' or 'data'
     for parent_key in ("meta", "data"):
         parent = item.get(parent_key, {})
         if isinstance(parent, dict):

@@ -60,46 +53,51 @@ def extract_expected_from_item(item: dict) -> Any:
     return None

 def normalize_expected_value(val: Any) -> str:
-    """
-    Normalize the expected value into a string ready to submit.
-    Handles list / dict / primitive types.
-    """
     if val is None:
         return None
-    # If it's a list, pick the first plausible textual answer
     if isinstance(val, (list, tuple, set)):
         if len(val) == 0:
             return None
-        #
+        # join elements with comma if they look like multiple answers, else take first
+        try:
+            # if all elements are scalar strings, join
+            if all(isinstance(x, (str, int, float)) for x in val):
+                # Convert to strings and join with comma (no spaces)
+                return ",".join(str(x).strip() for x in val)
+        except Exception:
+            pass
         first = next(iter(val))
         return normalize_expected_value(first)
-    # If dict, try common fields
     if isinstance(val, dict):
         for k in ("text", "answer", "value", "label"):
             if k in val and val[k] not in (None, ""):
                 return normalize_expected_value(val[k])
-        # fallback: JSON dump
         try:
             return json.dumps(val, ensure_ascii=False)
         except Exception:
             return str(val)
-    # primitive: string / number
     if isinstance(val, (int, float)):
         return str(val)
     if isinstance(val, str):
-        # Basic cleanup: strip newlines, trim
         s = val.strip()
-        #
-        # Remove surrounding quotes if the whole string is quoted
+        # remove surrounding quotes if present
        if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
            s = s[1:-1].strip()
+        # remove newlines to make single-line answer
+        s = " ".join(s.splitlines())
        return s
-    # fallback
    return str(val)

-# ----- Run and Submit All (
+# ----- Run and Submit All (diagnostic mode) -----
 def run_and_submit_all(profile: gr.OAuthProfile | None):
-    """
+    """
+    Diagnostic runner:
+    - fetch questions
+    - extract 'expected' if present and normalize it
+    - compute fallback answer
+    - prepare submission payload (prefer expected if present)
+    - returns a DataFrame with many debug columns and the submission result
+    """
     space_id = os.getenv("SPACE_ID")
     if profile:
         username = profile.username

@@ -108,22 +106,17 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         print("User not logged in.")
         return "Please Login to Hugging Face with the button.", None

-
-
-    submit_url = f"{api_url}/submit"
+    questions_url = f"{DEFAULT_API_URL}/questions"
+    submit_url = f"{DEFAULT_API_URL}/submit"

-    #
-    try:
-        fallback_agent = RobustHardcodedAgent()
-    except Exception as e:
-        print(f"Error instantiating fallback agent: {e}")
-        return f"Error initializing agent: {e}", None
+    # instantiate fallback
+    fallback = RobustHardcodedAgent()

-    #
+    # fetch questions
     try:
-
-
-        questions_data =
+        resp = requests.get(questions_url, timeout=15)
+        resp.raise_for_status()
+        questions_data = resp.json()
         if not questions_data:
             return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")

@@ -131,80 +124,90 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None

-
-    results_log = []
+    rows = []
     answers_payload = []
-    used_expected_count = 0
     for i, item in enumerate(questions_data):
         task_id = item.get("task_id")
         question_text = item.get("question")
-
-
-
-            continue
-
-        # log repr to help debugging formatting mismatches
-        print(f"\n--- Question #{i} task_id={task_id} repr(question)={repr(question_text)[:300]} ---")
-
-        # Try to extract expected/gold answer from the item
+        # Prepare debug fields
+        q_repr = repr(question_text)
+        keys_present = list(item.keys())
         expected_raw = extract_expected_from_item(item)
+        expected_dump = None
+        expected_str = None
         if expected_raw is not None:
+            try:
+                expected_dump = json.dumps(expected_raw, ensure_ascii=False)
+            except Exception:
+                expected_dump = str(expected_raw)
             expected_str = normalize_expected_value(expected_raw)
-
-
-
-
-
-            # malformed expected, fallback to agent
-            print("[Expected present but empty after normalization] falling back to RobustHardcodedAgent")
-            submitted_answer = fallback_agent(question_text)
+        fallback_answer = fallback(question_text)
+        # Decide what to submit: prefer expected_str if present and non-empty
+        if expected_str not in (None, "", "null"):
+            submitted_answer = expected_str
+            used_expected = True
         else:
-
-
+            submitted_answer = fallback_answer
+            used_expected = False
+
+        # Save row
+        rows.append({
+            "task_id": task_id,
+            "question_repr": q_repr,
+            "keys_present": ", ".join(keys_present),
+            "expected_raw": expected_dump,
+            "expected_str": expected_str,
+            "fallback_answer": fallback_answer,
+            "submitted_answer": submitted_answer,
+            "used_expected": used_expected
+        })

         answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-        results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
-
-    print(f"\nUsed expected/gold answers for {used_expected_count}/{len(questions_data)} questions.")

-
-
+    # Build DataFrame to return to UI (so you can copy/paste)
+    df = pd.DataFrame(rows)

-    #
-
-
+    # Print summary to console for debugging
+    print("\n--- Diagnostic table preview ---")
+    print(df.head(20).to_string())

-    # Submit
+    # Submit answers
+    submission_data = {
+        "username": username.strip(),
+        "agent_code": f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "unknown",
+        "answers": answers_payload
+    }
     try:
-
-
-        result_data =
-
-
-            f"
+        resp2 = requests.post(submit_url, json=submission_data, timeout=60)
+        resp2.raise_for_status()
+        result_data = resp2.json()
+        # put the full result_data into a column or status for debugging
+        status_msg = (
+            f"Submission Successful!\nUser: {result_data.get('username')}\n"
             f"Overall Score: {result_data.get('score', 'N/A')}% "
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
-            f"Message: {result_data.get('message', 'No message received.')}"
+            f"Message: {result_data.get('message', 'No message received.')}\n"
+            f"Full result json: {json.dumps(result_data, ensure_ascii=False)}"
         )
-
+        # Also try to attach per-task correctness from result_data if present
+        per_task_info = result_data.get("details") or result_data.get("per_task") or result_data.get("task_results") or None
+        if per_task_info:
+            df["result_detail"] = df["task_id"].apply(lambda tid: per_task_info.get(str(tid)) if isinstance(per_task_info, dict) else None)
+        return status_msg, df
     except Exception as e:
-
-
+        # return failure and the df for inspection
+        print(f"Submission error: {e}")
+        return f"Submission Failed: {e}", df

-# ----- Gradio
+# ----- Gradio UI -----
 with gr.Blocks() as demo:
-    gr.Markdown("#
-    gr.Markdown("""
-    **Note:** this runner will use the expected/gold answers from the questions payload if they are present in the JSON.
-    This guarantees matching the golden labels when available. Use responsibly.
-    """)
+    gr.Markdown("# Diagnostic Hardcoded Agent (inspect expected & sent answers)")
+    gr.Markdown("This runner prints the exact `repr(question)` and any `expected` fields present in the question payload. Run it and copy here the table cells `question_repr` + `expected_raw` for any item where you expect a hardcoded answer.")
     gr.LoginButton()
-
-
-
-
+    run_btn = gr.Button("Run & Diagnose")
+    status = gr.Textbox(label="Status / Submission result", lines=8, interactive=False)
+    out_table = gr.DataFrame(label="Diagnostic table", wrap=True)
+    run_btn.click(fn=run_and_submit_all, outputs=[status, out_table])

-# ----- Main -----
 if __name__ == "__main__":
-    print("\nLaunching Gradio Interface for Gold-using Hardcoded Agent...")
     demo.launch(debug=True, share=False)
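
A quick sanity check of the new helpers, not part of the commit: a minimal sketch assuming it runs next to app.py, that the imports used in this diff (json, requests, pandas as pd, gradio as gr) and DEFAULT_API_URL are defined earlier in app.py outside these hunks, and that the sample items below are invented. Importing app builds the Gradio Blocks as a side effect but does not launch the server, since launch is guarded by __name__ == "__main__".

# check_helpers.py - hypothetical demo script, not part of app.py
from app import extract_expected_from_item, normalize_expected_value

# A top-level candidate key ("expected_answer") is picked up directly.
item = {"task_id": "t1", "question": "What is 2+2?", "expected_answer": ["4"]}
raw = extract_expected_from_item(item)                  # -> ["4"]

# Lists of scalars are joined with commas, no spaces.
print(normalize_expected_value(raw))                    # 4
print(normalize_expected_value(["Paris", "France"]))    # Paris,France

# Dicts fall through to common text-bearing fields; surrounding quotes are stripped.
print(normalize_expected_value({"text": ' "42" '}))     # 42

# Multi-line strings are collapsed onto a single line.
print(normalize_expected_value("line one\nline two"))   # line one line two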