gabejavitt committed on
Commit
ebe46d7
·
verified ·
1 Parent(s): 26b8984

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -1
app.py CHANGED
@@ -1845,6 +1845,92 @@ except Exception as e:
1845
  # =============================================================================
1846
  # RUN AND SUBMIT FUNCTION
1847
  # =============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1848
  def run_and_submit_all(profile: gr.OAuthProfile | None):
1849
  """
1850
  Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -1876,6 +1962,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
1876
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
1877
  print(agent_code)
1878
 
 
1879
  # 2. Fetch Questions
1880
  print(f"\n{'='*70}")
1881
  print(f"πŸ“₯ FETCHING QUESTIONS")
@@ -1902,6 +1989,19 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
1902
  print(f"❌ An unexpected error occurred fetching questions: {e}")
1903
  return f"An unexpected error occurred fetching questions: {e}", None
1904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1905
  # 3. Run your Agent
1906
  print(f"\n{'='*70}")
1907
  print(f"πŸš€ STARTING EVALUATION")
@@ -1919,7 +2019,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
1919
 
1920
  task_id = item.get("task_id")
1921
  question_text = item.get("question")
1922
- correct_answer = item.get("answer", "N/A") # Get correct answer from API
1923
 
1924
  # Look for file locally in files/ directory
1925
  local_file_path = None
 
1845
  # =============================================================================
1846
  # RUN AND SUBMIT FUNCTION
1847
  # =============================================================================
1848
def load_answer_sheet(filepath: str = "answer_sheet.json") -> Dict[str, str]:
    """Load the expected answers from a JSON file.

    The file is expected to hold a single JSON object; keys and values are
    not validated beyond that. Loading is best-effort: every failure is
    reported on stdout and yields an empty dict instead of raising.

    Args:
        filepath: Path of the JSON answer sheet.

    Returns:
        The parsed object, or ``{}`` when the file is missing, unreadable,
        not valid JSON, or its top-level value is not a JSON object.
    """
    try:
        # Open directly instead of checking os.path.exists() first: one
        # filesystem access, and no window between the check and the open.
        with open(filepath, 'r', encoding='utf-8') as f:
            answers = json.load(f)
    except FileNotFoundError:
        print(f"⚠️ Answer sheet not found at {filepath}")
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Error loading answer sheet: {e}")
        return {}

    # A list/str/number at the top level would break dict-style lookups in
    # callers, so treat it as a load failure rather than returning it.
    if not isinstance(answers, dict):
        print(
            "❌ Error loading answer sheet: expected a JSON object, "
            f"got {type(answers).__name__}"
        )
        return {}

    print(f"βœ… Loaded answer sheet with {len(answers)} answers from {filepath}")
    return answers
1862
+
1863
+
1864
def check_answer_correctness(submitted: str, correct: str) -> Tuple[bool, str]:
    """Compare a submitted answer with the expected one using lenient matching.

    Checks are tried in this order, after trimming and lower-casing both
    answers:

    1. exact equality;
    2. numeric equivalence (thousands separators and ``$`` ignored,
       absolute tolerance 0.01);
    3. equality after removing ASCII punctuation (skipped when both answers
       are numbers, so that "3.14" is not confused with "314");
    4. for comma-separated expected answers, equality as unordered sets of
       items, otherwise a report of the missing / extra items;
    5. substring containment, reported as a partial (incorrect) match.

    Args:
        submitted: Answer produced by the agent. ``None`` is treated as "".
        correct: Expected answer. ``None`` is treated as "".

    Returns:
        ``(is_correct, feedback_message)``.
    """
    # Local import: the file-level import block is not guaranteed to have it.
    import string

    # Be tolerant of None / non-string answers instead of crashing on .strip().
    submitted = "" if submitted is None else str(submitted)
    correct = "" if correct is None else str(correct)

    # Normalize both answers
    submitted_norm = submitted.strip().lower()
    correct_norm = correct.strip().lower()

    # Exact match
    if submitted_norm == correct_norm:
        return True, "βœ… EXACT MATCH"

    # Numeric comparison must run on the un-stripped text: removing
    # punctuation first would drop the decimal point and the minus sign.
    submitted_num = _parse_number(submitted_norm)
    correct_num = _parse_number(correct_norm)
    both_numeric = submitted_num is not None and correct_num is not None
    if both_numeric and abs(submitted_num - correct_num) < 0.01:
        return True, "βœ… MATCH (numeric equivalence)"

    # Remove common punctuation and check again. Not applied to two numbers
    # that already compared unequal, where punctuation carries the value.
    if not both_numeric:
        strip_punctuation = str.maketrans('', '', string.punctuation)
        submitted_clean = submitted_norm.translate(strip_punctuation)
        correct_clean = correct_norm.translate(strip_punctuation)
        if submitted_clean == correct_clean:
            return True, "βœ… MATCH (punctuation difference)"

    # Check list-type answers as unordered sets of items
    if ',' in correct_norm:
        correct_items = {item.strip() for item in correct_norm.split(',')}
        submitted_items = {item.strip() for item in submitted_norm.split(',')}

        if correct_items == submitted_items:
            return True, "βœ… MATCH (item order difference)"

        # Sorted so the feedback text is deterministic across runs.
        missing = ', '.join(sorted(correct_items - submitted_items))
        extra = ', '.join(sorted(submitted_items - correct_items))

        if missing and not extra:
            return False, f"❌ MISSING: {missing}"
        if extra and not missing:
            return False, f"❌ EXTRA: {extra}"
        if missing and extra:
            return False, f"❌ MISSING: {missing} | EXTRA: {extra}"

    # Substring containment. Both sides must be non-empty: the empty string
    # is a substring of everything and would always look like a partial match.
    if submitted_norm and correct_norm and (
        submitted_norm in correct_norm or correct_norm in submitted_norm
    ):
        return False, f"❌ PARTIAL MATCH (submitted: '{submitted}' | correct: '{correct}')"

    return False, f"❌ WRONG (submitted: '{submitted}' | correct: '{correct}')"


def _parse_number(text: str):
    """Return *text* as a float, ignoring ',' and '$', or None if not numeric."""
    # NOTE(review): the second character removed by the original code was
    # lost to a broken string literal; '$' is assumed - confirm.
    candidate = text.replace(',', '').replace('$', '').strip()
    try:
        return float(candidate)
    except ValueError:
        return None
1920
+
1921
+
1922
def create_answer_sheet_template(questions: List[Dict], filepath: str = "answer_sheet.json"):
    """Write a blank answer sheet for the given questions.

    The sheet is a JSON object mapping each question's ``task_id`` to an
    empty string, to be filled in by hand. Any existing file at *filepath*
    is overwritten.

    Args:
        questions: Question records; each must contain a ``'task_id'`` key.
        filepath: Destination of the JSON template.

    Raises:
        KeyError: If a question has no ``'task_id'`` entry.
    """
    # One empty slot per task id; built before the file is opened.
    blank_sheet = {question['task_id']: "" for question in questions}

    serialized = json.dumps(blank_sheet, indent=2)
    with open(filepath, 'w', encoding='utf-8') as sheet_file:
        sheet_file.write(serialized)

    slot_count = len(blank_sheet)
    print(f"βœ… Created answer sheet template at {filepath}")
    print(f" Please fill in the correct answers for {slot_count} questions")
1933
+
1934
  def run_and_submit_all(profile: gr.OAuthProfile | None):
1935
  """
1936
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
1962
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
1963
  print(agent_code)
1964
 
1965
+
1966
  # 2. Fetch Questions
1967
  print(f"\n{'='*70}")
1968
  print(f"πŸ“₯ FETCHING QUESTIONS")
 
1989
  print(f"❌ An unexpected error occurred fetching questions: {e}")
1990
  return f"An unexpected error occurred fetching questions: {e}", None
1991
 
1992
+ # Load answer sheet
1993
+ answer_sheet = load_answer_sheet("answer_sheet.json")
1994
+
1995
+ # If answer sheet doesn't exist, create template
1996
+ if not answer_sheet:
1997
+ create_answer_sheet_template(questions, "answer_sheet.json")
1998
+ print("\n⚠️ Please fill in the answer_sheet.json file with correct answers")
1999
+ print(" Then run the script again to check agent performance\n")
2000
+
2001
+ results = []
2002
+ local_correct = 0
2003
+ local_total = 0
2004
+
2005
  # 3. Run your Agent
2006
  print(f"\n{'='*70}")
2007
  print(f"πŸš€ STARTING EVALUATION")
 
2019
 
2020
  task_id = item.get("task_id")
2021
  question_text = item.get("question")
2022
+ correct_answer = answer_sheet[task_id]
2023
 
2024
  # Look for file locally in files/ directory
2025
  local_file_path = None