Spaces:

bstraehle
/

gaia

Running

App Files Community

bstraehle committed 17 days ago

Commit

72c76b2

verified ·

1 Parent(s): 4a3493d

Update utils/utils.py

Browse files

Files changed (1) hide show

utils/utils.py +14 -25

utils/utils.py CHANGED Viewed

@@ -4,19 +4,19 @@ from docx import Document
 from pptx import Presentation
 from datasets import load_dataset
-QUESTION_TYPE_GAIA      = "gaia"
-QUESTION_TYPE_HLE       = "hle"
-QUESTION_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
-QUESTION_FILE_PATH_HLE  = "files/hle_validation.jsonl"
-def get_questions_from_file(question_type, level):
     file_path = ""
-    if question_type == QUESTION_TYPE_GAIA:
-        file_path = QUESTION_FILE_PATH_GAIA
-    elif question_type == QUESTION_TYPE_HLE:
-        file_path = QUESTION_FILE_PATH_HLE
     df = pd.read_json(file_path, lines=True)
@@ -29,13 +29,7 @@ def get_questions_from_file(question_type, level):
     return result
-def get_questions_from_dataset(question_type, level):
-    # Extract dataset type from file path (e.g., "gaia" or "hle")
-    basename = os.path.splitext(os.path.basename(file_path))[0]
-    print(f"basename={basename}")
-    dataset_type = basename.replace("_validation", "")
-    print(f"basename={dataset_type}")
     # Get space ID from environment, defaulting to "bstraehle/gaia"
     space_id = os.environ.get("SPACE_ID", "bstraehle/gaia")
     # Extract username from space_id
@@ -50,11 +44,11 @@ def get_questions_from_dataset(question_type, level):
     df = dataset.to_pandas()
     # Filter by dataset type using the task_id prefix
-    if dataset_type == "gaia":
-        print(f"dataset_type={dataset_type}")
         df = df[df["task_id"].str.startswith("gaia-")]
-    elif dataset_type == "hle":
-        print(f"dataset_type={dataset_type}")
         df = df[df["task_id"].str.startswith("hle-")]
     # Filter by level if level > 0 (for GAIA benchmark)
@@ -63,11 +57,6 @@ def get_questions_from_dataset(question_type, level):
         df = df[df["Level"] == level]
     result=[]
-    for _, row in df.iterrows():
-        result.append([row["Question"], row["Final answer"], row["file_name"]])
-    return result
 def is_ext(file_path, ext):
     return os.path.splitext(file_path)[1].lower() == ext.lower()

 from pptx import Presentation
 from datasets import load_dataset
+DATASET_TYPE_GAIA      = "gaia"
+DATASET_TYPE_HLE       = "hle"
+DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
+DATASET_FILE_PATH_HLE  = "files/hle_validation.jsonl"
+def get_questions_from_file(dataset_type, level):
     file_path = ""
+    if dataset_type == DATASET_TYPE_GAIA:
+        file_path = DATASET_FILE_PATH_GAIA
+    elif dataset_type == DATASET_TYPE_HLE:
+        file_path = DATASET_FILE_PATH_HLE
     df = pd.read_json(file_path, lines=True)
     return result
+def get_questions_from_dataset(dataset_type, level):
     # Get space ID from environment, defaulting to "bstraehle/gaia"
     space_id = os.environ.get("SPACE_ID", "bstraehle/gaia")
     # Extract username from space_id
     df = dataset.to_pandas()
     # Filter by dataset type using the task_id prefix
+    if dataset_type == DATASET_TYPE_GAIA:
+        print(f"filtering for dataset_type={dataset_type}")
         df = df[df["task_id"].str.startswith("gaia-")]
+    elif dataset_type == DATASET_TYPE_HLE:
+        print(f"filtering for dataset_type={dataset_type}")
         df = df[df["task_id"].str.startswith("hle-")]
     # Filter by level if level > 0 (for GAIA benchmark)
         df = df[df["Level"] == level]
     result=[]
 def is_ext(file_path, ext):
     return os.path.splitext(file_path)[1].lower() == ext.lower()