|
|
import os, re |
|
|
import pandas as pd |
|
|
from docx import Document |
|
|
from pptx import Presentation |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
|
|
|
# Hub namespace used by get_dataset() to build the "<author>/validation"
# dataset repo id; read from the environment (None if unset).
SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME")


# Benchmark identifiers accepted as the `dataset_type` argument below.
DATASET_TYPE_GAIA = "gaia"


DATASET_TYPE_HLE = "hle"


# Local JSON Lines copies of the validation splits, read by
# get_dataset_from_file().
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"


DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"
|
|
|
|
|
|
|
|
|
|
|
def get_dataset_from_file(dataset_type, level):
    """Load a validation dataset from its local JSON Lines file.

    Args:
        dataset_type: DATASET_TYPE_GAIA or DATASET_TYPE_HLE.
        level: only rows whose "Level" column equals this value are kept.

    Returns:
        A list of [question, final_answer, file_name] lists, one per
        matching row.

    Raises:
        ValueError: if dataset_type is not a known dataset identifier.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Previously fell through with file_path == "" and failed inside
        # pd.read_json with an unrelated error; fail fast and clearly instead.
        raise ValueError(f"Unknown dataset_type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]

    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]
|
|
|
|
|
def get_dataset(dataset_type, level):
    """Fetch the hosted validation split and filter it by dataset type and level.

    GAIA rows are the ones with Level in {1, 2, 3}; HLE rows carry Level 0.
    Returns a list of [question, final_answer, file_name] lists.
    """
    repo_id = f"{SPACE_AUTHOR_NAME}/validation"
    frame = load_dataset(repo_id, split="validation").to_pandas()

    # Restrict to the rows belonging to the requested benchmark.
    if dataset_type == DATASET_TYPE_GAIA:
        frame = frame[frame["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        frame = frame[frame["Level"] == 0]

    # Then narrow to the exact level the caller asked for.
    frame = frame[frame["Level"] == level]

    return [
        [record["Question"], record["Final answer"], record["file_name"]]
        for _, record in frame.iterrows()
    ]
|
|
|
|
|
|
|
|
|
|
|
def is_ext(file_path, ext):
    """Return True when file_path's extension equals ext (case-insensitive)."""
    _, actual_ext = os.path.splitext(file_path)
    return actual_ext.lower() == ext.lower()
|
|
|
|
|
def read_file_json(file_path):
    """Read a tabular data file and return its contents as a JSON string.

    Supported (case-insensitive) extensions: .csv, .xls, .xlsx, .json, .jsonl.

    Returns:
        The pandas DataFrame serialized via DataFrame.to_json(), or "" for
        unsupported extensions.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext == ".jsonl":
        # JSON Lines requires lines=True; a plain read_json call raises
        # "Trailing data" on any multi-record .jsonl file.
        df = pd.read_json(file_path, lines=True)
    elif ext == ".json":
        df = pd.read_json(file_path)
    else:
        df = None

    return "" if df is None else df.to_json()
|
|
|
|
|
def read_docx_text(file_path):
    """Return the text of a .docx file, preserving document order.

    Headings are wrapped in **bold** markers on their own lines; table rows
    are rendered as " | "-separated cell text.
    """
    doc = Document(file_path)
    chunks = []

    # Walk the raw body elements so paragraphs and tables come out in the
    # order they appear in the document, not grouped by kind.
    for element in doc.element.body:
        if element.tag.endswith("p"):
            # Locate the paragraph object backing this body element.
            for paragraph in doc.paragraphs:
                if paragraph._element == element:
                    if paragraph.style.name.startswith("Heading"):
                        chunks.append("\n**" + paragraph.text + "**\n")
                    elif paragraph.text:
                        chunks.append(paragraph.text)
        elif element.tag.endswith("tbl"):
            # Locate the table object backing this body element.
            for table in doc.tables:
                if table._element == element:
                    for table_row in table.rows:
                        cells = [cell.text.strip() for cell in table_row.cells]
                        chunks.append(" | ".join(cells))

    return "\n".join(chunks)
|
|
|
|
|
def read_pptx_text(file_path):
    """Return all shape text from a .pptx file, slides separated by a blank line."""
    presentation = Presentation(file_path)
    slide_chunks = []

    for slide in presentation.slides:
        # Only some shape types carry text; hasattr filters the rest out.
        shape_texts = [
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        ]
        slide_chunks.append("\n".join(shape_texts))

    return "\n\n".join(slide_chunks)
|
|
|
|
|
|
|
|
|
|
|
def validate_input(question, openai_api_key, gemini_api_key, anthropic_api_key):
    """Return True when the question and API keys pass basic sanity checks.

    Limits: question at most 500 characters, each key at most 150. The
    question may only contain word characters, whitespace, and the
    punctuation listed in the regex below.
    """
    if len(question) > 500:
        return False
    if any(
        len(key) > 150
        for key in (openai_api_key, gemini_api_key, anthropic_api_key)
    ):
        return False

    # Valid only when stripping disallowed characters leaves it unchanged.
    sanitized = re.sub(r'[^\w\s.,!?\'\-()@$%&+/:;"=\[\]]', '', question)
    return sanitized == question