File size: 3,162 Bytes
812fce1 260cf1e 812fce1 702ce31 72c76b2 77ef617 72c76b2 77ef617 702ce31 cc82662 77ef617 72c76b2 77ef617 812fce1 d907c10 77ef617 812fce1 98435e5 b7f003c 6895356 812fce1 cc82662 260cf1e acaa214 83557ac 260cf1e 49dc208 72c76b2 49dc208 72c76b2 49dc208 83557ac 49dc208 83557ac 260cf1e 83557ac 0d4f176 83557ac 0d4f176 260cf1e 702ce31 812fce1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import os
import pandas as pd
from docx import Document
from pptx import Presentation
from datasets import load_dataset
# Dataset configuration
# Dataset type identifiers; the loaders in this module compare their
# `dataset_type` argument against these values.
DATASET_TYPE_GAIA = "gaia"
DATASET_TYPE_HLE = "hle"
# Relative paths of the JSON Lines files that get_dataset_from_file reads for
# each dataset type.
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"
# Dataset processing
def get_dataset_from_file(dataset_type, level):
    """Load the rows of one level from a local JSON Lines validation file.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE; selects which
            file is read.
        level: Value matched against the "Level" column.

    Returns:
        A list with one ``[question, final_answer, file_name]`` list per
        matching row, taken from the "Question", "Final answer" and
        "file_name" columns.

    Raises:
        ValueError: If ``dataset_type`` is not one of the known types.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Fail early with a clear message instead of handing an empty path
        # to pandas and surfacing an unrelated parsing/IO error.
        raise ValueError(f"Unsupported dataset type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]
def get_dataset(dataset_type, level):
    """Load the rows of one level from the "<owner>/validation" dataset repo.

    The repo owner is the part before the first "/" of the SPACE_ID
    environment variable (default "bstraehle/gaia").

    Args:
        dataset_type: DATASET_TYPE_GAIA keeps levels 1-3, DATASET_TYPE_HLE
            keeps level 0; any other value applies no type filter.
        level: Value matched against the "Level" column after the type
            filter.

    Returns:
        A list with one ``[question, final_answer, file_name]`` list per
        matching row.
    """
    owner = os.environ.get("SPACE_ID", "bstraehle/gaia").split("/")[0]
    frame = load_dataset(f"{owner}/validation", split="validation").to_pandas()

    # Restrict to the levels that belong to the requested dataset type.
    if dataset_type == DATASET_TYPE_GAIA:
        frame = frame[frame["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        frame = frame[frame["Level"] == 0]

    frame = frame[frame["Level"] == level]

    return [
        [record["Question"], record["Final answer"], record["file_name"]]
        for _, record in frame.iterrows()
    ]
# File processing
def is_ext(file_path, ext):
    """Return True when the extension of ``file_path`` equals ``ext``.

    The comparison is case-insensitive. ``ext`` may be given with or without
    its leading dot (".csv" or "csv"); an empty ``ext`` matches paths that
    have no extension.
    """
    wanted = ext.lower()
    # os.path.splitext always reports the extension with its dot, so add the
    # dot when the caller left it out.
    if wanted and not wanted.startswith("."):
        wanted = "." + wanted
    return os.path.splitext(file_path)[1].lower() == wanted
def read_file_json(file_path):
    """Read a tabular or JSON file and return its content as a JSON string.

    Supported extensions (case-insensitive): .csv, .xls, .xlsx, .json and
    .jsonl. The file is loaded into a pandas DataFrame and serialized with
    ``DataFrame.to_json()``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The JSON string produced by ``DataFrame.to_json()``, or "" when the
        extension is not supported.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext in (".json", ".jsonl"):
        # JSON Lines stores one record per line; it must be read with
        # lines=True, otherwise pandas rejects everything after the first
        # record as trailing data.
        df = pd.read_json(file_path, lines=(ext == ".jsonl"))
    else:
        return ""

    return df.to_json()
def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping body order.

    Body elements are visited in document order. Paragraphs whose style name
    starts with "Heading" are emitted as "\\n**<text>**\\n"; other paragraphs
    are emitted as-is when non-empty. Each table row becomes one line with
    the stripped cell texts joined by " | ".

    Args:
        file_path: Path of the .docx file.

    Returns:
        All emitted pieces joined with newlines.
    """
    doc = Document(file_path)

    # Map each XML element to its wrapper once, instead of rescanning
    # doc.paragraphs / doc.tables for every body element.
    paragraphs_by_element = {p._element: p for p in doc.paragraphs}
    tables_by_element = {t._element: t for t in doc.tables}

    text = []
    for block in doc.element.body:
        if block.tag.endswith("p"):
            paragraph = paragraphs_by_element.get(block)
            if paragraph is None:
                continue
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
        elif block.tag.endswith("tbl"):
            table = tables_by_element.get(block)
            if table is None:
                continue
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))

    return "\n".join(text)
def read_pptx_text(file_path):
    """Extract the text of a .pptx file, slide by slide.

    For every slide, the ``text`` of each shape that has such an attribute is
    collected and joined with newlines; slides are separated by a blank line.

    Args:
        file_path: Path of the .pptx file.

    Returns:
        The per-slide texts joined with "\\n\\n".
    """
    prs = Presentation(file_path)

    slide_texts = []
    for slide in prs.slides:
        # Shapes without a `text` attribute are skipped.
        shape_texts = [
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        ]
        slide_texts.append("\n".join(shape_texts))

    return "\n\n".join(slide_texts)