import os

import pandas as pd
from datasets import load_dataset
from docx import Document
from pptx import Presentation

# Dataset configuration

SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME")

DATASET_TYPE_GAIA = "gaia"
DATASET_TYPE_HLE = "hle"

DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"

# Dataset processing


def _rows_to_examples(df) -> list:
    """Return ``[Question, Final answer, file_name]`` lists, one per row of *df*."""
    return [
        [question, answer, file_name]
        for question, answer, file_name in zip(
            df["Question"], df["Final answer"], df["file_name"]
        )
    ]


def get_dataset_from_file(dataset_type, level):
    """Load examples of one level from the local JSON Lines validation file.

    Args:
        dataset_type: ``DATASET_TYPE_GAIA`` or ``DATASET_TYPE_HLE``; selects
            which local file is read.
        level: Value the ``Level`` column must equal for a row to be kept.

    Returns:
        A list of ``[Question, Final answer, file_name]`` lists.

    Raises:
        ValueError: If *dataset_type* is not one of the known dataset types.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Fail early with a clear message instead of handing pandas an empty
        # path and surfacing an unrelated parsing error.
        raise ValueError(f"Unknown dataset type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return _rows_to_examples(df)


def get_dataset(dataset_type, level):
    """Load examples of one level from the ``validation`` split of the hub dataset.

    The dataset repository is ``f"{SPACE_AUTHOR_NAME}/validation"``. GAIA rows
    are those with ``Level`` in 1-3 and HLE rows are those with ``Level`` 0;
    any other *dataset_type* applies no type filter. The result is then
    narrowed to rows whose ``Level`` equals *level*.

    Args:
        dataset_type: ``DATASET_TYPE_GAIA`` or ``DATASET_TYPE_HLE``.
        level: Value the ``Level`` column must equal for a row to be kept.

    Returns:
        A list of ``[Question, Final answer, file_name]`` lists.
    """
    dataset_repo = f"{SPACE_AUTHOR_NAME}/validation"
    dataset = load_dataset(dataset_repo, split="validation")
    df = dataset.to_pandas()

    if dataset_type == DATASET_TYPE_GAIA:
        df = df[df["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        df = df[df["Level"] == 0]

    df = df[df["Level"] == level]

    return _rows_to_examples(df)


# File processing


def is_ext(file_path, ext) -> bool:
    """Return True if *file_path* has extension *ext* (case-insensitive).

    *ext* must include the leading dot, e.g. ``".csv"``.
    """
    return os.path.splitext(file_path)[1].lower() == ext.lower()


def read_file_json(file_path) -> str:
    """Read a tabular file and return its contents serialized as JSON.

    Supports ``.csv``, ``.xls``/``.xlsx``, ``.json`` and ``.jsonl``.

    Returns:
        The ``DataFrame.to_json()`` string, or ``""`` when the extension is
        not one of the supported ones.
    """
    df = None

    if is_ext(file_path, ".csv"):
        df = pd.read_csv(file_path)
    elif is_ext(file_path, ".xls") or is_ext(file_path, ".xlsx"):
        df = pd.read_excel(file_path)
    elif is_ext(file_path, ".jsonl"):
        # JSON Lines holds one JSON document per line; without lines=True
        # pandas rejects any file containing more than one record.
        df = pd.read_json(file_path, lines=True)
    elif is_ext(file_path, ".json"):
        df = pd.read_json(file_path)

    return "" if df is None else df.to_json()


def read_docx_text(file_path) -> str:
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    Paragraphs whose style name starts with ``"Heading"`` are emitted as
    ``"\\n**<text>**\\n"``; other paragraphs are emitted only when non-empty.
    Each table row becomes one line with stripped cell texts joined by
    ``" | "``.

    Returns:
        All emitted pieces joined with newlines.
    """
    doc = Document(file_path)

    # Index paragraphs and tables once by their underlying XML element so each
    # body element is resolved with a single lookup instead of rescanning
    # doc.paragraphs / doc.tables for every element (quadratic in doc size).
    paragraphs_by_element = {p._element: p for p in doc.paragraphs}
    tables_by_element = {t._element: t for t in doc.tables}

    text = []

    # Walk the body children directly to preserve the document order of
    # interleaved paragraphs and tables.
    for block in doc.element.body:
        paragraph = paragraphs_by_element.get(block)
        if paragraph is not None:
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
            continue

        table = tables_by_element.get(block)
        if table is not None:
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))

    return "\n".join(text)


def read_pptx_text(file_path) -> str:
    """Extract the text of a .pptx file.

    For each slide, the ``text`` of every shape that has such an attribute is
    joined with newlines; slides are separated by a blank line.
    """
    prs = Presentation(file_path)

    text = []

    for slide in prs.slides:
        slide_text = [
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        ]
        text.append("\n".join(slide_text))

    return "\n\n".join(text)