|
|
import os, re |
|
|
import pandas as pd |
|
|
from docx import Document |
|
|
from pptx import Presentation |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
|
|
|
# Hub namespace used by get_dataset() to build the "<author>/validation"
# dataset repo id; read from the environment (None if unset).
SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME")


# Benchmark identifiers accepted as the `dataset_type` argument below.
DATASET_TYPE_GAIA = "gaia"


DATASET_TYPE_HLE = "hle"


# Local JSON Lines copies of the validation splits, read by
# get_dataset_from_file().
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"


DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"
|
|
|
|
|
|
|
|
|
|
|
def get_dataset_from_file(dataset_type, level):
    """Load a validation dataset from its local JSON Lines file.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE.
        level: only rows whose "Level" column equals this value are kept.

    Returns:
        A list of [question, final_answer, file_name] lists, one per
        matching row.

    Raises:
        ValueError: if dataset_type is not a known dataset identifier.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Previously fell through with file_path == "" and failed inside
        # pd.read_json with an unrelated error; fail fast and clearly instead.
        raise ValueError(f"Unknown dataset_type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]
|
|
|
|
|
def get_dataset(dataset_type, level):
    """Fetch the hosted validation split and filter it by dataset type and level.

    GAIA rows are the ones with Level in {1, 2, 3}; HLE rows carry Level 0.
    Returns a list of [question, final_answer, file_name] lists.
    """
    repo_id = f"{SPACE_AUTHOR_NAME}/validation"
    frame = load_dataset(repo_id, split="validation").to_pandas()

    # Restrict to the rows belonging to the requested benchmark.
    if dataset_type == DATASET_TYPE_GAIA:
        frame = frame[frame["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        frame = frame[frame["Level"] == 0]

    # Then narrow to the exact level the caller asked for.
    frame = frame[frame["Level"] == level]

    return [
        [record["Question"], record["Final answer"], record["file_name"]]
        for _, record in frame.iterrows()
    ]
|
|
|
|
|
|
|
|
|
|
|
def is_ext(file_path, ext):
    """Return True when file_path's extension equals ext (case-insensitive)."""
    _, actual_ext = os.path.splitext(file_path)
    return actual_ext.lower() == ext.lower()
|
|
|
|
|
def read_file_json(file_path):
    """Read a tabular data file and return its contents as a JSON string.

    Supported (case-insensitive) extensions: .csv, .xls, .xlsx, .json, .jsonl.

    Returns:
        The pandas DataFrame serialized via DataFrame.to_json(), or "" for
        unsupported extensions.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext == ".jsonl":
        # JSON Lines requires lines=True; a plain read_json call raises
        # "Trailing data" on any multi-record .jsonl file.
        df = pd.read_json(file_path, lines=True)
    elif ext == ".json":
        df = pd.read_json(file_path)
    else:
        df = None

    return "" if df is None else df.to_json()
|
|
|
|
|
def read_docx_text(file_path):
    """Return the text of a .docx file, preserving document order.

    Headings are wrapped in **bold** markers on their own lines; table rows
    are rendered as " | "-separated cell text.
    """
    doc = Document(file_path)
    chunks = []

    # Walk the raw body elements so paragraphs and tables come out in the
    # order they appear in the document, not grouped by kind.
    for element in doc.element.body:
        if element.tag.endswith("p"):
            # Locate the paragraph object backing this body element.
            for paragraph in doc.paragraphs:
                if paragraph._element == element:
                    if paragraph.style.name.startswith("Heading"):
                        chunks.append("\n**" + paragraph.text + "**\n")
                    elif paragraph.text:
                        chunks.append(paragraph.text)
        elif element.tag.endswith("tbl"):
            # Locate the table object backing this body element.
            for table in doc.tables:
                if table._element == element:
                    for table_row in table.rows:
                        cells = [cell.text.strip() for cell in table_row.cells]
                        chunks.append(" | ".join(cells))

    return "\n".join(chunks)
|
|
|
|
|
def read_pptx_text(file_path):
    """Return all shape text from a .pptx file, slides separated by a blank line."""
    presentation = Presentation(file_path)
    slide_chunks = []

    for slide in presentation.slides:
        # Only some shape types carry text; hasattr filters the rest out.
        shape_texts = [
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        ]
        slide_chunks.append("\n".join(shape_texts))

    return "\n\n".join(slide_chunks)
|
|
|
|
|
|
|
|
|
|
|
def validate_input(question, openai_api_key, gemini_api_key, anthropic_api_key):
    """Return True when the question and API keys pass basic sanity checks.

    Limits: question at most 500 characters, each key at most 150. The
    question may only contain word characters, whitespace, and the
    punctuation listed in the regex below.
    """
    if len(question) > 500:
        return False
    if any(
        len(key) > 150
        for key in (openai_api_key, gemini_api_key, anthropic_api_key)
    ):
        return False

    # Valid only when stripping disallowed characters leaves it unchanged.
    sanitized = re.sub(r'[^\w\s.,!?\'\-()@$%&+/:;"=\[\]]', '', question)
    return sanitized == question