Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1845,6 +1845,92 @@ except Exception as e:
|
|
| 1845 |
# =============================================================================
|
| 1846 |
# RUN AND SUBMIT FUNCTION
|
| 1847 |
# =============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1848 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 1849 |
"""
|
| 1850 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
|
@@ -1876,6 +1962,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1876 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 1877 |
print(agent_code)
|
| 1878 |
|
|
|
|
| 1879 |
# 2. Fetch Questions
|
| 1880 |
print(f"\n{'='*70}")
|
| 1881 |
print(f"π₯ FETCHING QUESTIONS")
|
|
@@ -1902,6 +1989,19 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1902 |
print(f"β An unexpected error occurred fetching questions: {e}")
|
| 1903 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 1904 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1905 |
# 3. Run your Agent
|
| 1906 |
print(f"\n{'='*70}")
|
| 1907 |
print(f"π STARTING EVALUATION")
|
|
@@ -1919,7 +2019,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1919 |
|
| 1920 |
task_id = item.get("task_id")
|
| 1921 |
question_text = item.get("question")
|
| 1922 |
-
correct_answer =
|
| 1923 |
|
| 1924 |
# Look for file locally in files/ directory
|
| 1925 |
local_file_path = None
|
|
|
|
| 1845 |
# =============================================================================
|
| 1846 |
# RUN AND SUBMIT FUNCTION
|
| 1847 |
# =============================================================================
|
| 1848 |
+
def load_answer_sheet(filepath: str = "answer_sheet.json") -> Dict[str, str]:
    """Load the answer sheet from a JSON file.

    Args:
        filepath: Path of the JSON file to read. The file is expected to hold
            a single JSON object (mapping).

    Returns:
        The parsed mapping, or an empty dict when the file is missing, cannot
        be read or decoded, or does not contain a JSON object. A status line
        is printed in every case; no exception is propagated for these
        failures.
    """
    # Open first and react to the failure instead of probing with
    # os.path.exists(): one fewer filesystem call and no window in which the
    # file can disappear between the check and the open.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            answers = json.load(f)
    except FileNotFoundError:
        print(f"⚠️ Answer sheet not found at {filepath}")
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Error loading answer sheet: {e}")
        return {}

    # Valid JSON is not necessarily an object; callers index the result by
    # key, so anything else is treated as a load failure.
    if not isinstance(answers, dict):
        print(
            "❌ Error loading answer sheet: expected a JSON object, "
            f"got {type(answers).__name__}"
        )
        return {}

    print(f"✅ Loaded answer sheet with {len(answers)} answers from {filepath}")
    return answers
|
| 1862 |
+
|
| 1863 |
+
|
| 1864 |
+
def check_answer_correctness(submitted: str, correct: str) -> Tuple[bool, str]:
|
| 1865 |
+
"""
|
| 1866 |
+
Check if submitted answer matches correct answer with fuzzy matching
|
| 1867 |
+
Returns: (is_correct, feedback_message)
|
| 1868 |
+
"""
|
| 1869 |
+
# Normalize both answers
|
| 1870 |
+
submitted_norm = submitted.strip().lower()
|
| 1871 |
+
correct_norm = correct.strip().lower()
|
| 1872 |
+
|
| 1873 |
+
# Exact match
|
| 1874 |
+
if submitted_norm == correct_norm:
|
| 1875 |
+
return True, "β
EXACT MATCH"
|
| 1876 |
+
|
| 1877 |
+
# Remove common punctuation and check again
|
| 1878 |
+
import string
|
| 1879 |
+
submitted_clean = submitted_norm.translate(str.maketrans('', '', string.punctuation))
|
| 1880 |
+
correct_clean = correct_norm.translate(str.maketrans('', '', string.punctuation))
|
| 1881 |
+
|
| 1882 |
+
if submitted_clean == correct_clean:
|
| 1883 |
+
return True, "β
MATCH (punctuation difference)"
|
| 1884 |
+
|
| 1885 |
+
# Check if it's a number formatting issue
|
| 1886 |
+
try:
|
| 1887 |
+
# Try to parse as numbers
|
| 1888 |
+
submitted_num = float(submitted_clean.replace(',', '').replace('
|
| 1889 |
+
, ''))
|
| 1890 |
+
correct_num = float(correct_clean.replace(',', '').replace('
|
| 1891 |
+
, ''))
|
| 1892 |
+
if abs(submitted_num - correct_num) < 0.01: # Allow small floating point differences
|
| 1893 |
+
return True, "β
MATCH (numeric equivalence)"
|
| 1894 |
+
except (ValueError, AttributeError):
|
| 1895 |
+
pass
|
| 1896 |
+
|
| 1897 |
+
# Check if submitted answer contains correct answer (for list-type answers)
|
| 1898 |
+
if ',' in correct_norm:
|
| 1899 |
+
correct_items = set([item.strip() for item in correct_norm.split(',')])
|
| 1900 |
+
submitted_items = set([item.strip() for item in submitted_norm.split(',')])
|
| 1901 |
+
|
| 1902 |
+
if correct_items == submitted_items:
|
| 1903 |
+
return True, "β
MATCH (item order difference)"
|
| 1904 |
+
|
| 1905 |
+
missing_items = correct_items - submitted_items
|
| 1906 |
+
extra_items = submitted_items - correct_items
|
| 1907 |
+
|
| 1908 |
+
if missing_items and not extra_items:
|
| 1909 |
+
return False, f"β MISSING: {', '.join(missing_items)}"
|
| 1910 |
+
elif extra_items and not missing_items:
|
| 1911 |
+
return False, f"β EXTRA: {', '.join(extra_items)}"
|
| 1912 |
+
elif missing_items and extra_items:
|
| 1913 |
+
return False, f"β MISSING: {', '.join(missing_items)} | EXTRA: {', '.join(extra_items)}"
|
| 1914 |
+
|
| 1915 |
+
# Check case-insensitive substring match
|
| 1916 |
+
if submitted_norm in correct_norm or correct_norm in submitted_norm:
|
| 1917 |
+
return False, f"β PARTIAL MATCH (submitted: '{submitted}' | correct: '{correct}')"
|
| 1918 |
+
|
| 1919 |
+
return False, f"β WRONG (submitted: '{submitted}' | correct: '{correct}')"
|
| 1920 |
+
|
| 1921 |
+
|
| 1922 |
+
def create_answer_sheet_template(questions: List[Dict], filepath: str = "answer_sheet.json"):
    """Create an answer sheet template from questions.

    Writes a JSON object to *filepath* that maps every question's ``task_id``
    to an empty string, ready to be filled in by hand. An existing file at
    *filepath* is overwritten.

    Args:
        questions: Question dicts; only the ``task_id`` key is read. Entries
            without a ``task_id`` are skipped instead of raising ``KeyError``.
        filepath: Destination path of the template.

    Returns:
        None. Two status lines are printed after the file is written.
    """
    answer_template = {
        q["task_id"]: "" for q in questions if q.get("task_id") is not None
    }

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(answer_template, f, indent=2)

    print(f"✅ Created answer sheet template at {filepath}")
    print(f" Please fill in the correct answers for {len(answer_template)} questions")
|
| 1933 |
+
|
| 1934 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 1935 |
"""
|
| 1936 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
|
|
|
| 1962 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 1963 |
print(agent_code)
|
| 1964 |
|
| 1965 |
+
|
| 1966 |
# 2. Fetch Questions
|
| 1967 |
print(f"\n{'='*70}")
|
| 1968 |
print(f"π₯ FETCHING QUESTIONS")
|
|
|
|
| 1989 |
print(f"β An unexpected error occurred fetching questions: {e}")
|
| 1990 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 1991 |
|
| 1992 |
+
# Load answer sheet
|
| 1993 |
+
answer_sheet = load_answer_sheet("answer_sheet.json")
|
| 1994 |
+
|
| 1995 |
+
# If answer sheet doesn't exist, create template
|
| 1996 |
+
if not answer_sheet:
|
| 1997 |
+
create_answer_sheet_template(questions, "answer_sheet.json")
|
| 1998 |
+
print("\nβ οΈ Please fill in the answer_sheet.json file with correct answers")
|
| 1999 |
+
print(" Then run the script again to check agent performance\n")
|
| 2000 |
+
|
| 2001 |
+
results = []
|
| 2002 |
+
local_correct = 0
|
| 2003 |
+
local_total = 0
|
| 2004 |
+
|
| 2005 |
# 3. Run your Agent
|
| 2006 |
print(f"\n{'='*70}")
|
| 2007 |
print(f"π STARTING EVALUATION")
|
|
|
|
| 2019 |
|
| 2020 |
task_id = item.get("task_id")
|
| 2021 |
question_text = item.get("question")
|
| 2022 |
+
correct_answer = answer_sheet[task_id]
|
| 2023 |
|
| 2024 |
# Look for file locally in files/ directory
|
| 2025 |
local_file_path = None
|