File size: 3,125 Bytes
bdff22f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import pandas as pd
from docx import Document
from pptx import Presentation
from datasets import load_dataset

# Dataset configuration

# Hugging Face account/org name; combined with "/validation" in
# get_dataset() to form the dataset repo id. May be None if the
# environment variable is unset.
SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME")

# Identifiers accepted as `dataset_type` by the get_dataset*() helpers.
DATASET_TYPE_GAIA      = "gaia"
DATASET_TYPE_HLE       = "hle"

# Local JSONL files holding the validation split for each dataset type
# (used by get_dataset_from_file()).
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE  = "files/hle_validation.jsonl"

# Dataset processing

def get_dataset_from_file(dataset_type, level):
    """Load validation questions from a local JSONL file.

    Args:
        dataset_type: One of DATASET_TYPE_GAIA or DATASET_TYPE_HLE.
        level: Difficulty level; rows are filtered on the "Level" column.

    Returns:
        A list of [question, final_answer, file_name] lists.

    Raises:
        ValueError: If dataset_type is not a known dataset type.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Previously fell through with file_path == "" and failed inside
        # pd.read_json with a confusing error; fail fast instead.
        raise ValueError(f"Unknown dataset type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]

def get_dataset(dataset_type, level):
    """Load validation questions from the Hugging Face hub.

    The repo "<SPACE_AUTHOR_NAME>/validation" stores both datasets in a
    single "validation" split: GAIA rows use levels 1-3, HLE rows use
    level 0.

    Args:
        dataset_type: One of DATASET_TYPE_GAIA or DATASET_TYPE_HLE.
        level: Difficulty level; rows are filtered on the "Level" column.

    Returns:
        A list of [question, final_answer, file_name] lists.

    Raises:
        ValueError: If dataset_type is not a known dataset type.
    """
    dataset_repo = f"{SPACE_AUTHOR_NAME}/validation"
    dataset = load_dataset(dataset_repo, split="validation")

    df = dataset.to_pandas()

    # Restrict to the requested dataset's level range first, so a level
    # value belonging to the other dataset cannot leak through.
    if dataset_type == DATASET_TYPE_GAIA:
        df = df[df["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        df = df[df["Level"] == 0]
    else:
        # Previously an unknown type silently skipped the type filter;
        # fail fast instead, consistent with get_dataset_from_file.
        raise ValueError(f"Unknown dataset type: {dataset_type!r}")

    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]

# File processing

def is_ext(file_path, ext):
    """Check whether *file_path* carries the extension *ext*, ignoring case."""
    _, found_ext = os.path.splitext(file_path)
    return found_ext.lower() == ext.lower()
    
def read_file_json(file_path):
    """Read a tabular file (CSV/Excel/JSON/JSONL) and return it as a JSON string.

    Args:
        file_path: Path to the file; the extension selects the parser
            (case-insensitive).

    Returns:
        The DataFrame serialized with DataFrame.to_json(), or "" for an
        unsupported extension (best-effort contract preserved).
    """
    # Compute the extension once instead of re-splitting per comparison.
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext in (".json", ".jsonl"):
        # Bug fix: JSONL needs lines=True, otherwise pd.read_json raises
        # "Trailing data" on any file with more than one record.
        df = pd.read_json(file_path, lines=(ext == ".jsonl"))
    else:
        df = None

    return "" if df is None else df.to_json()

def read_docx_text(file_path):
    """Extract text from a .docx file, preserving document order.

    Heading paragraphs are wrapped in "**" markers; table rows are
    rendered as " | "-separated cell text.
    """
    doc = Document(file_path)
    chunks = []

    # Walk the raw XML body so paragraphs and tables come out in the
    # order they occur in the document.
    for element in doc.element.body:
        if element.tag.endswith("p"):
            # Locate the high-level Paragraph wrapping this XML element.
            for paragraph in (p for p in doc.paragraphs if p._element == element):
                if paragraph.style.name.startswith("Heading"):
                    chunks.append("\n**" + paragraph.text + "**\n")
                elif paragraph.text:
                    chunks.append(paragraph.text)
        elif element.tag.endswith("tbl"):
            for table in doc.tables:
                if table._element != element:
                    continue
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    chunks.append(" | ".join(cells))

    return "\n".join(chunks)

def read_pptx_text(file_path):
    """Concatenate all shape text from every slide of a .pptx file.

    Shapes within a slide are joined with newlines; slides are separated
    by a blank line.
    """
    prs = Presentation(file_path)

    per_slide = []
    for slide in prs.slides:
        shape_texts = [shape.text for shape in slide.shapes if hasattr(shape, "text")]
        per_slide.append("\n".join(shape_texts))

    return "\n\n".join(per_slide)