File size: 3,162 Bytes
812fce1
 
 
 
260cf1e
812fce1
702ce31
 
72c76b2
 
77ef617
72c76b2
 
77ef617
702ce31
 
cc82662
77ef617
 
72c76b2
 
 
 
77ef617
812fce1
 
d907c10
77ef617
812fce1
 
98435e5
b7f003c
6895356
812fce1
 
cc82662
260cf1e
 
 
acaa214
83557ac
260cf1e
49dc208
72c76b2
49dc208
72c76b2
49dc208
83557ac
49dc208
83557ac
260cf1e
83557ac
0d4f176
 
83557ac
0d4f176
260cf1e
702ce31
 
812fce1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import pandas as pd
from docx import Document
from pptx import Presentation
from datasets import load_dataset

# Dataset configuration

# Identifiers for the supported dataset types; the dataset functions below
# compare their `dataset_type` argument against these values.
DATASET_TYPE_GAIA      = "gaia"
DATASET_TYPE_HLE       = "hle"

# Relative paths of the JSON Lines files read by get_dataset_from_file,
# one per dataset type.
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE  = "files/hle_validation.jsonl"

# Dataset processing

def get_dataset_from_file(dataset_type, level):
    """Load the rows of one level from a local validation file.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE; selects which
            JSON Lines file is read.
        level: Value compared for equality against the "Level" column.

    Returns:
        A list with one ``[question, final_answer, file_name]`` list per
        matching row, in file order.

    Raises:
        ValueError: If ``dataset_type`` is not a supported dataset type.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Fail fast with a clear message instead of handing an empty path
        # to pandas, which would fail with an unrelated error.
        raise ValueError(f"Unsupported dataset type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]

def get_dataset(dataset_type, level):
    """Load the rows of one level from the "validation" dataset repository.

    The repository name is "<username>/validation", where the username is
    the part of the SPACE_ID environment variable before the first "/"
    ("bstraehle/gaia" is used when the variable is unset). The "validation"
    split is loaded with ``load_dataset`` and converted to a DataFrame.

    Args:
        dataset_type: DATASET_TYPE_GAIA keeps levels 1-3, DATASET_TYPE_HLE
            keeps level 0; any other value applies no type filter.
        level: Value compared for equality against the "Level" column.

    Returns:
        A list with one ``[question, final_answer, file_name]`` list per
        matching row.
    """
    owner = os.environ.get("SPACE_ID", "bstraehle/gaia").split("/")[0]
    frame = load_dataset(f"{owner}/validation", split="validation").to_pandas()

    # First narrow to the levels that belong to the requested dataset type.
    if dataset_type == DATASET_TYPE_GAIA:
        frame = frame[frame["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        frame = frame[frame["Level"] == 0]

    # Then keep only the requested level.
    frame = frame[frame["Level"] == level]

    return [
        [record["Question"], record["Final answer"], record["file_name"]]
        for _, record in frame.iterrows()
    ]

# File processing

def is_ext(file_path, ext):
    """Return True if the extension of file_path equals ext, ignoring case.

    The extension is whatever os.path.splitext yields (including the leading
    dot), so ext must include the dot as well, e.g. ".csv".
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() == ext.lower()
    
def read_file_json(file_path):
    """Read a tabular file with pandas and return its content as JSON.

    Supported extensions (case-insensitive): .csv, .xls, .xlsx, .json and
    .jsonl. A .jsonl file is parsed as JSON Lines (one record per line).

    Args:
        file_path: Path of the file to read.

    Returns:
        The DataFrame serialized with ``DataFrame.to_json()``, or an empty
        string when the extension is not supported.
    """
    # Resolve the extension once instead of re-splitting the path per check.
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext == ".jsonl":
        # JSON Lines holds one object per line; without lines=True pandas
        # rejects multi-record files with a "Trailing data" error.
        df = pd.read_json(file_path, lines=True)
    elif ext == ".json":
        df = pd.read_json(file_path)
    else:
        return ""

    return df.to_json()

def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    Body elements are visited in document order. Paragraphs whose style name
    starts with "Heading" are emitted as "\\n**<text>**\\n", other non-empty
    paragraphs as their plain text, and each table row as its stripped cell
    texts joined with " | ".

    Args:
        file_path: Path of the .docx file.

    Returns:
        All extracted pieces joined with newlines.
    """
    doc = Document(file_path)

    # Map each underlying XML element to its wrapper once. The previous code
    # rescanned doc.paragraphs / doc.tables for every body element, which
    # made extraction quadratic in the document size.
    paragraphs = {paragraph._element: paragraph for paragraph in doc.paragraphs}
    tables = {table._element: table for table in doc.tables}

    text = []

    for block in doc.element.body:
        paragraph = paragraphs.get(block)
        if paragraph is not None:
            # A style may lack a name; treat that as "not a heading".
            style_name = paragraph.style.name or ""
            if style_name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
            continue

        table = tables.get(block)
        if table is not None:
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))

    return "\n".join(text)

def read_pptx_text(file_path):
    """Extract the text of a .pptx file, slide by slide.

    For every slide, the ``text`` of each shape that exposes such an
    attribute is collected and joined with newlines; the per-slide texts are
    then joined with blank lines.

    Args:
        file_path: Path of the .pptx file.

    Returns:
        The text of all slides as a single string.
    """
    deck = Presentation(file_path)

    per_slide = [
        "\n".join(
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )
        for slide in deck.slides
    ]

    return "\n\n".join(per_slide)