# gaia/utils/utils.py -- uploaded by bstraehle ("Upload utils.py", commit bdff22f, verified)
import os
import pandas as pd
from docx import Document
from pptx import Presentation
from datasets import load_dataset
# Dataset configuration
# Read from the environment; None when SPACE_AUTHOR_NAME is unset. get_dataset()
# uses it as the owner part of the "<owner>/validation" dataset repo id.
SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME")
# Values accepted for the `dataset_type` argument of the dataset loaders.
DATASET_TYPE_GAIA = "gaia"
DATASET_TYPE_HLE = "hle"
# JSON Lines files read by get_dataset_from_file(); paths are relative
# (presumably to the working directory of the app -- confirm).
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"
# Dataset processing
def get_dataset_from_file(dataset_type, level):
    """Load the rows of one level from a local JSON Lines validation file.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE; selects which
            file is read.
        level: Value compared for equality against the "Level" column.

    Returns:
        A list of ``[question, final_answer, file_name]`` lists, one per
        matching row, in file order.

    Raises:
        ValueError: If ``dataset_type`` is not a known dataset type.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Fail early with a clear message instead of handing an empty path
        # to pandas, which would surface as an unrelated parser/file error.
        raise ValueError(f"Unknown dataset type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [question, answer, file_name]
        for question, answer, file_name in zip(
            df["Question"], df["Final answer"], df["file_name"]
        )
    ]
def get_dataset(dataset_type, level):
    """Load the rows of one level from the "<SPACE_AUTHOR_NAME>/validation" dataset.

    The "validation" split is loaded with ``load_dataset`` and converted to a
    DataFrame. For DATASET_TYPE_GAIA only levels 1, 2 and 3 are kept, for
    DATASET_TYPE_HLE only level 0; any other ``dataset_type`` applies no
    type filter. The result is then narrowed to rows whose "Level" equals
    ``level``.

    Returns:
        A list of ``[question, final_answer, file_name]`` lists, one per
        matching row.
    """
    frame = load_dataset(
        f"{SPACE_AUTHOR_NAME}/validation", split="validation"
    ).to_pandas()

    # Restrict to the levels that belong to the requested dataset type.
    if dataset_type == DATASET_TYPE_GAIA:
        frame = frame[frame["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        frame = frame[frame["Level"] == 0]

    selected = frame[frame["Level"] == level]

    return [
        [record["Question"], record["Final answer"], record["file_name"]]
        for _, record in selected.iterrows()
    ]
# File processing
def is_ext(file_path, ext):
    """Return True if ``file_path`` ends with extension ``ext``, ignoring case.

    ``ext`` is compared against the result of ``os.path.splitext``, so it
    must include the leading dot (e.g. ".csv").
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() == ext.lower()
def read_file_json(file_path):
    """Read a tabular file and return its contents as a JSON string.

    Supported extensions (case-insensitive): .csv, .xls, .xlsx, .json and
    .jsonl. The file is loaded into a pandas DataFrame and serialized with
    ``DataFrame.to_json()``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The JSON string, or "" when the extension is not supported.
    """
    # Resolve the extension once instead of re-splitting the path per branch.
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext == ".json":
        df = pd.read_json(file_path)
    elif ext == ".jsonl":
        # JSON Lines holds one record per line; without lines=True pandas
        # rejects any file with more than one record ("Trailing data").
        df = pd.read_json(file_path, lines=True)
    else:
        return ""

    return df.to_json()
def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    Body-level paragraphs and tables are emitted in document order:
    paragraphs whose style name starts with "Heading" are wrapped as
    "\\n**text**\\n", other non-empty paragraphs are emitted as-is, and each
    table row becomes its stripped cell texts joined with " | ".

    Args:
        file_path: Path of the .docx file.

    Returns:
        The extracted pieces joined with newlines.
    """
    doc = Document(file_path)

    # Map each underlying XML element to its wrapper once. Looking a block up
    # here replaces a rescan of doc.paragraphs / doc.tables per body element
    # (both properties rebuild their list on every access).
    paragraphs = {paragraph._element: paragraph for paragraph in doc.paragraphs}
    tables = {table._element: table for table in doc.tables}

    text = []
    for block in doc.element.body:
        paragraph = paragraphs.get(block)
        if paragraph is not None:
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
            continue

        table = tables.get(block)
        if table is not None:
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))

    return "\n".join(text)
def read_pptx_text(file_path):
    """Extract the text of a .pptx file.

    For every slide, the ``text`` of each shape that has such an attribute
    is collected and joined with newlines; the per-slide strings are then
    joined with a blank line between slides.
    """
    presentation = Presentation(file_path)

    per_slide = []
    for slide in presentation.slides:
        shape_texts = [
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        ]
        per_slide.append("\n".join(shape_texts))

    return "\n\n".join(per_slide)