File size: 3,967 Bytes
f350ea8
812fce1
 
 
260cf1e
812fce1
702ce31
 
d59a286
 
72c76b2
 
77ef617
72c76b2
 
77ef617
702ce31
 
cc82662
77ef617
 
72c76b2
 
 
 
77ef617
812fce1
 
d907c10
77ef617
812fce1
 
98435e5
b7f003c
6895356
812fce1
 
cc82662
d59a286
acaa214
83557ac
260cf1e
49dc208
72c76b2
49dc208
72c76b2
49dc208
83557ac
49dc208
83557ac
260cf1e
83557ac
0d4f176
 
83557ac
0d4f176
260cf1e
702ce31
 
812fce1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8160da4
 
 
 
f350ea8
d16721c
f350ea8
361b5c6
 
f350ea8
 
 
d16721c
f350ea8
 
 
 
 
 
 
8160da4
f350ea8
 
8160da4
f350ea8
 
8160da4
f350ea8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os, re
import pandas as pd
from docx import Document
from pptx import Presentation
from datasets import load_dataset

# Dataset configuration

# Hugging Face account/space author name, read from the environment at import
# time; used below to build the "<author>/validation" dataset repo id.
SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME")

# Identifiers for the two supported benchmarks.
DATASET_TYPE_GAIA      = "gaia"
DATASET_TYPE_HLE       = "hle"

# Local JSONL fallback files, one record per line.
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE  = "files/hle_validation.jsonl"

# Dataset processing

def get_dataset_from_file(dataset_type, level):
    """Load validation rows for one dataset/level from a local JSONL file.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE.
        level: value matched against the "Level" column.

    Returns:
        A list of [question, final_answer, file_name] triples.

    Raises:
        ValueError: if dataset_type is not a known dataset identifier.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Previously fell through with file_path == "" and failed inside
        # pd.read_json with a cryptic error; fail fast instead.
        raise ValueError(f"Unknown dataset_type: {dataset_type!r}")

    # One JSON record per line (JSON Lines format).
    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]

def get_dataset(dataset_type, level):
    """Fetch the hosted validation split and return rows for one dataset/level.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE.
        level: value matched against the "Level" column.

    Returns:
        A list of [question, final_answer, file_name] triples.
    """
    repo_id = f"{SPACE_AUTHOR_NAME}/validation"
    frame = load_dataset(repo_id, split="validation").to_pandas()

    # Restrict to the requested benchmark first: GAIA rows use levels 1-3,
    # HLE rows are stored under level 0.  This keeps an out-of-range `level`
    # from matching rows of the other dataset.
    if dataset_type == DATASET_TYPE_GAIA:
        frame = frame[frame["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        frame = frame[frame["Level"] == 0]

    frame = frame[frame["Level"] == level]

    return [
        [entry["Question"], entry["Final answer"], entry["file_name"]]
        for _, entry in frame.iterrows()
    ]

# File processing

def is_ext(file_path, ext):
    """Return True if file_path ends with extension ext (e.g. ".csv"), case-insensitively."""
    _, actual_ext = os.path.splitext(file_path)
    return actual_ext.lower() == ext.lower()
    
def read_file_json(file_path):
    """Read a tabular file (CSV, Excel, JSON, or JSONL) into a JSON string.

    Args:
        file_path: path whose extension selects the parser.

    Returns:
        The table serialized via DataFrame.to_json(), or "" for
        unsupported extensions.
    """
    df = None

    if is_ext(file_path, ".csv"):
        df = pd.read_csv(file_path)
    elif is_ext(file_path, ".xls") or is_ext(file_path, ".xlsx"):
        df = pd.read_excel(file_path)
    elif is_ext(file_path, ".json"):
        df = pd.read_json(file_path)
    elif is_ext(file_path, ".jsonl"):
        # JSON Lines needs lines=True (as the jsonl reader elsewhere in this
        # module does); the default parser rejects multi-record files with
        # a "Trailing data" ValueError.
        df = pd.read_json(file_path, lines=True)

    return "" if df is None else df.to_json()

def read_docx_text(file_path):
    """Extract text from a .docx file in document order.

    Heading paragraphs are wrapped in "**" markers, table rows become
    " | "-separated lines, other non-empty paragraphs pass through as-is.

    Args:
        file_path: path to the .docx file.

    Returns:
        The extracted text joined with newlines.
    """
    doc = Document(file_path)

    # Map each underlying XML element to its python-docx wrapper once,
    # instead of rescanning doc.paragraphs / doc.tables for every body
    # element — the original approach was O(n^2) in document size.
    # (lxml elements hash/compare by identity, matching the original
    # `_element == block` checks.)
    paragraph_by_element = {p._element: p for p in doc.paragraphs}
    table_by_element = {t._element: t for t in doc.tables}

    text = []

    for block in doc.element.body:
        if block.tag.endswith("p"):
            paragraph = paragraph_by_element.get(block)
            if paragraph is None:
                continue
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
        elif block.tag.endswith("tbl"):
            table = table_by_element.get(block)
            if table is None:
                continue
            for row in table.rows:
                row_text = [cell.text.strip() for cell in row.cells]
                text.append(" | ".join(row_text))

    return "\n".join(text)

def read_pptx_text(file_path):
    """Extract all shape text from a .pptx file.

    Args:
        file_path: path to the .pptx file.

    Returns:
        Shape texts joined per slide with newlines; slides separated by
        blank lines.
    """
    deck = Presentation(file_path)

    slides_text = []
    for slide in deck.slides:
        # Only text-bearing shapes expose a .text attribute.
        shape_texts = [shape.text for shape in slide.shapes if hasattr(shape, "text")]
        slides_text.append("\n".join(shape_texts))

    return "\n\n".join(slides_text)

# Input validation

def validate_input(question, openai_api_key, gemini_api_key, anthropic_api_key, file_name):
    """Validate user-supplied form fields by length.

    Args:
        question: free-text question (max 500 chars).
        openai_api_key: OpenAI API key (max 150 chars).
        gemini_api_key: Gemini API key (max 150 chars).
        anthropic_api_key: Anthropic API key (max 150 chars).
        file_name: uploaded file name (max 150 chars).

    Returns:
        True if every field is within its limit, False otherwise.
    """
    # The original called a non-existent str.len() method, compared
    # file_name (a string) directly to 150, and used bare `or`
    # continuations that were a SyntaxError; all fixed here.
    if (len(question) > 500
            or len(openai_api_key) > 150
            or len(gemini_api_key) > 150
            or len(anthropic_api_key) > 150
            or len(file_name) > 150):
        return False

    # TODO(review): content sanitization (strip HTML/script/control chars,
    # cap word length/count) was sketched here but never enabled.

    return True