File size: 3,598 Bytes
f350ea8
812fce1
 
 
260cf1e
812fce1
702ce31
 
d59a286
 
72c76b2
 
77ef617
72c76b2
 
77ef617
702ce31
 
cc82662
77ef617
 
72c76b2
 
 
 
77ef617
812fce1
 
d907c10
77ef617
812fce1
 
98435e5
b7f003c
6895356
812fce1
 
cc82662
d59a286
acaa214
83557ac
260cf1e
49dc208
72c76b2
49dc208
72c76b2
49dc208
83557ac
49dc208
83557ac
260cf1e
83557ac
0d4f176
 
83557ac
0d4f176
260cf1e
702ce31
 
812fce1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8160da4
 
 
 
28688ed
d16721c
f350ea8
1584483
 
 
a80e28b
d16721c
f350ea8
1584483
 
 
 
8160da4
f350ea8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os, re
import pandas as pd
from docx import Document
from pptx import Presentation
from datasets import load_dataset

# Dataset configuration

# Namespace prefix of the "<name>/validation" dataset repo; None when the
# environment variable is not set.
SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME")

# Identifiers accepted as the `dataset_type` argument.
DATASET_TYPE_GAIA = "gaia"
DATASET_TYPE_HLE = "hle"

# Local JSON Lines file for each dataset type.
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"

# Dataset processing

def get_dataset_from_file(dataset_type, level):
    """Load the rows of one level from a local JSON Lines validation file.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE; selects which
            file is read.
        level: Value compared for equality against the "Level" column.

    Returns:
        A list with one ``[question, final_answer, file_name]`` list per
        matching row, taken from the "Question", "Final answer" and
        "file_name" columns.

    Raises:
        ValueError: If ``dataset_type`` is not one of the known types.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Fail early with a clear message instead of handing an empty path
        # to pandas and surfacing an unrelated parsing/IO error.
        raise ValueError(f"Unknown dataset type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]

def get_dataset(dataset_type, level):
    """Load the rows of one level from the "<author>/validation" dataset repo.

    The "validation" split is converted to a DataFrame and filtered first by
    dataset type (GAIA keeps levels 1-3, HLE keeps level 0; any other type is
    left unfiltered) and then by the requested level.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE.
        level: Value compared for equality against the "Level" column.

    Returns:
        A list with one ``[question, final_answer, file_name]`` list per
        matching row.
    """
    repo_id = f"{SPACE_AUTHOR_NAME}/validation"
    frame = load_dataset(repo_id, split="validation").to_pandas()

    # Restrict to the levels that belong to the requested dataset type.
    if dataset_type == DATASET_TYPE_GAIA:
        frame = frame[frame["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        frame = frame[frame["Level"] == 0]

    frame = frame[frame["Level"] == level]

    return [
        [record["Question"], record["Final answer"], record["file_name"]]
        for _, record in frame.iterrows()
    ]

# File processing

def is_ext(file_path, ext):
    """Return True if file_path's extension equals ext, ignoring case.

    ``ext`` is expected to include the leading dot (e.g. ".csv").
    """
    _, suffix = os.path.splitext(file_path)
    wanted = ext.lower()
    return suffix.lower() == wanted
    
def read_file_json(file_path):
    """Read a tabular/JSON file into a DataFrame and return it as JSON text.

    Supported extensions (matched case-insensitively): ".csv", ".xls",
    ".xlsx", ".json" and ".jsonl".

    Args:
        file_path: Path of the file to read.

    Returns:
        The output of ``DataFrame.to_json()`` for the loaded file, or an
        empty string when the extension is not supported (the file is not
        opened in that case).
    """
    # Resolve the extension once instead of re-splitting the path per branch.
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".csv":
        df = pd.read_csv(file_path)
    elif extension in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif extension == ".json":
        df = pd.read_json(file_path)
    elif extension == ".jsonl":
        # JSON Lines holds one JSON document per line; without lines=True
        # pandas parses the file as a single document and fails on it.
        df = pd.read_json(file_path, lines=True)
    else:
        return ""

    return df.to_json()

def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    Body elements are visited in document order. Paragraphs whose style name
    starts with "Heading" are emitted as ``**text**`` surrounded by newlines;
    other paragraphs are emitted only when non-empty. Each table row becomes
    one line with its stripped cell texts joined by " | ".

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        All extracted pieces joined with newlines.
    """
    doc = Document(file_path)

    # Map each underlying XML element to its wrapper once, so every body
    # element is resolved with a dict lookup instead of rescanning all
    # paragraphs/tables (which made the original quadratic).
    paragraphs_by_element = {p._element: p for p in doc.paragraphs}
    tables_by_element = {t._element: t for t in doc.tables}

    text = []

    for block in doc.element.body:
        paragraph = paragraphs_by_element.get(block)
        if paragraph is not None:
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
            continue

        table = tables_by_element.get(block)
        if table is not None:
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))

    return "\n".join(text)

def read_pptx_text(file_path):
    """Extract the text of every slide in a .pptx file.

    For each slide, the ``text`` of every shape that has such an attribute is
    joined with newlines; slides are separated by a blank line.

    Args:
        file_path: Path of the .pptx file to read.

    Returns:
        The combined text of all slides.
    """
    deck = Presentation(file_path)

    per_slide = [
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in deck.slides
    ]

    return "\n\n".join(per_slide)

# Input validation

def validate_input(question, openai_api_key, gemini_api_key, anthropic_api_key):
    """Check the length limits and the allowed character set of user input.

    Args:
        question: Question text; at most 500 characters and restricted to
            word characters, whitespace and a fixed set of punctuation.
        openai_api_key: At most 150 characters.
        gemini_api_key: At most 150 characters.
        anthropic_api_key: At most 150 characters.

    Returns:
        True when every limit is respected and the question contains no
        disallowed character, False otherwise.
    """
    within_limits = (
        len(question) <= 500
        and len(openai_api_key) <= 150
        and len(gemini_api_key) <= 150
        and len(anthropic_api_key) <= 150
    )

    # A single character outside the allow-list invalidates the question.
    has_disallowed_char = (
        re.search(r'[^\w\s.,!?\'\-()@$%&+/:;"=\[\]]', question) is not None
    )

    return within_limits and not has_disallowed_char