my_customisedAgent

Runtime error

App Files Files Community

Toumaima committed on May 9

Commit

e197f92

verified ·

1 Parent(s): 42893d3

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -98

app.py CHANGED Viewed

@@ -4,9 +4,8 @@ import requests
 import string
 import warnings
 import pandas as pd
-from huggingface_hub import login
 import re
-import json
 from groq import Groq
 # --- Constants ---
@@ -16,7 +15,7 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
-        self.client = Groq(api_key=os.environ["GROQ_API_KEY"])
         self.agent_prompt = (
             """You are a general AI assistant. I will ask you a question. Report your thoughts, and
             finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
@@ -75,7 +74,7 @@ class BasicAgent:
                     "true": "false", "yes": "no", "black": "white"
                 }
                 opposite = opposites.get(word, f"UNKNOWN_OPPOSITE_OF_{word}")
-                return "FINAL ANSWER: RIGHT"
         return self.format_final_answer("COULD_NOT_SOLVE")
     def query_groq(self, question: str) -> str:
@@ -86,6 +85,7 @@ class BasicAgent:
                 messages=[{"role": "user", "content": full_prompt}]
             )
             answer = response.choices[0].message.content
             if "FINAL ANSWER: " in answer:
                 return answer.split("FINAL ANSWER: ")[-1].strip().upper()
             else:
@@ -103,68 +103,6 @@ class BasicAgent:
             return self.solve_riddle(question)
         return self.query_groq(question)
-# --- Answer Scoring ---
-def question_scorer(model_answer: str, ground_truth: str) -> bool:
-    def normalize_str(input_str, remove_punct=True) -> str:
-        no_spaces = re.sub(r"\s", "", input_str)
-        if remove_punct:
-            translator = str.maketrans("", "", string.punctuation)
-            return no_spaces.lower().translate(translator)
-        else:
-            return no_spaces.lower()
-    def normalize_number_str(number_str: str) -> float | None:
-        for char in ["$", "%", ","]:
-            number_str = number_str.replace(char, "")
-        try:
-            return float(number_str)
-        except ValueError:
-            print(f"String '{number_str}' cannot be normalized to number.")
-            return None
-    def split_string(s: str, char_list: list[str] = [",", ";"]) -> list[str]:
-        pattern = f"[{''.join(map(re.escape, char_list))}]"
-        return [elem.strip() for elem in re.split(pattern, s)]
-    def is_float(val) -> bool:
-        try:
-            float(val)
-            return True
-        except ValueError:
-            return False
-    if model_answer is None:
-        model_answer = "None"
-    if is_float(ground_truth):
-        print(f"Evaluating '{model_answer}' as a number.")
-        normalized = normalize_number_str(model_answer)
-        return normalized == float(ground_truth) if normalized is not None else False
-    elif any(char in ground_truth for char in [",", ";"]):
-        print(f"Evaluating '{model_answer}' as a comma/semicolon-separated list.")
-        gt_elems = split_string(ground_truth)
-        ma_elems = split_string(model_answer)
-        if len(gt_elems) != len(ma_elems):
-            warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
-            return False
-        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
-            if is_float(gt_elem):
-                normalized = normalize_number_str(ma_elem)
-                if normalized != float(gt_elem):
-                    return False
-            else:
-                if normalize_str(ma_elem, remove_punct=False) != normalize_str(gt_elem, remove_punct=False):
-                    return False
-        return True
-    else:
-        print(f"Evaluating '{model_answer}' as a string.")
-        return normalize_str(model_answer) == normalize_str(ground_truth)
-# --- Run and Submit All ---
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
     if profile:
@@ -184,7 +122,6 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         return f"Error initializing agent: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
@@ -196,45 +133,31 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     results_log = []
     answers_payload = []
-    correct_count = 0
-    total_with_gold = 0
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
-        gold_answer = item.get("gold_answer")
-        print (gold_answer)
         if not task_id or question_text is None:
             continue
         try:
             submitted_answer = agent(question_text)
-            is_correct = question_scorer(submitted_answer, gold_answer) if gold_answer else None
-            if is_correct is not None:
-                total_with_gold += 1
-                if is_correct:
-                    correct_count += 1
-            answers_payload.append({
-                "task_id": task_id,
-                "submitted_answer": submitted_answer
-            })
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text,
-                "Submitted Answer": submitted_answer,
                 "Gold Answer": gold_answer,
-                "Correct?": "✅" if is_correct else "❌" if is_correct is not None else "N/A"
             })
         except Exception as e:
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text,
-                "Submitted Answer": f"AGENT ERROR: {e}",
                 "Gold Answer": gold_answer,
-                "Correct?": "❌"
             })
     if not answers_payload:
@@ -251,22 +174,14 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         response.raise_for_status()
         result_data = response.json()
         print(result_data)
-        accuracy_text = ""
-        if total_with_gold > 0:
-            accuracy = (correct_count / total_with_gold) * 100
-            accuracy_text = f"\nLocal Accuracy: {accuracy:.2f}% ({correct_count}/{total_with_gold} correct)"
         final_status = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
-            f"Overall Score (from server): {result_data.get('score', '?')}% "
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
             f"Message: {result_data.get('message', 'No message received.')}"
-            f"{accuracy_text}"
         )
         return final_status, pd.DataFrame(results_log)
     except Exception as e:
         return f"Submission Failed: {e}", pd.DataFrame(results_log)
@@ -285,4 +200,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     print("Launching Gradio Interface for Basic Agent Evaluation...")
-    demo.launch(debug=True, share=False)

 import string
 import warnings
 import pandas as pd
 import re
+from huggingface_hub import login
 from groq import Groq
 # --- Constants ---
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
+        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))
         self.agent_prompt = (
             """You are a general AI assistant. I will ask you a question. Report your thoughts, and
             finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
                     "true": "false", "yes": "no", "black": "white"
                 }
                 opposite = opposites.get(word, f"UNKNOWN_OPPOSITE_OF_{word}")
+                return f"FINAL ANSWER: {opposite.upper()}"
         return self.format_final_answer("COULD_NOT_SOLVE")
     def query_groq(self, question: str) -> str:
                 messages=[{"role": "user", "content": full_prompt}]
             )
             answer = response.choices[0].message.content
+            print(f"[Groq Raw Response]: {answer}")
             if "FINAL ANSWER: " in answer:
                 return answer.split("FINAL ANSWER: ")[-1].strip().upper()
             else:
             return self.solve_riddle(question)
         return self.query_groq(question)
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
     if profile:
         return f"Error initializing agent: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
     results_log = []
     answers_payload = []
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
+        gold_answer = item.get("answer") or item.get("ground_truth")
         if not task_id or question_text is None:
             continue
         try:
             submitted_answer = agent(question_text)
+            print(f"Q: {question_text}")
+            print(f"Predicted: {submitted_answer} | Gold: {gold_answer}")
+            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text,
                 "Gold Answer": gold_answer,
+                "Submitted Answer": submitted_answer
             })
         except Exception as e:
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text,
                 "Gold Answer": gold_answer,
+                "Submitted Answer": f"AGENT ERROR: {e}"
             })
     if not answers_payload:
         response.raise_for_status()
         result_data = response.json()
         print(result_data)
         final_status = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
+            f"Overall Score: {result_data.get('score', '?')}% "
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
             f"Message: {result_data.get('message', 'No message received.')}"
         )
         return final_status, pd.DataFrame(results_log)
     except Exception as e:
         return f"Submission Failed: {e}", pd.DataFrame(results_log)
 if __name__ == "__main__":
     print("Launching Gradio Interface for Basic Agent Evaluation...")
+    demo.launch(debug=True, share=False)