# agents_gaia/agents/file_reader.py
# Repository viewer header: "Isics's picture" - initial commit 32844c7
import json
import pandas as pd
import pypdf
import yaml
from smolagents import CodeAgent, Model, tool
from config import authorized_libraries
@tool
def read_yaml(path: str) -> str:
    """
    Reads a YAML file and returns its parsed contents.

    Args:
        path (str): path to YAML file.

    Returns:
        str: contents of YAML file, exactly as produced by the YAML parser (no string conversion is applied).

    Example:
        >>> result = read_yaml("path/to/file.yaml")
    """
    # NOTE(review): the annotation promises ``str`` but the parsed object
    # (typically a dict or list) is returned unchanged. Left as-is because
    # existing callers may index into the result -- confirm before converting.
    #
    # Binary mode lets PyYAML pick the encoding itself (UTF-8 unless a UTF-16
    # BOM is present) instead of depending on the platform's locale encoding,
    # which breaks non-ASCII documents on non-UTF-8 systems.
    with open(path, "rb") as stream:
        # NOTE(security): FullLoader can still build some non-trivial Python
        # objects. If these files come from untrusted sources, prefer
        # yaml.safe_load -- not swapped here to keep parsing results identical.
        return yaml.load(stream, Loader=yaml.FullLoader)
@tool
def read_json(path: str) -> str:
    """
    Reads a JSON file and returns its parsed contents.

    Args:
        path (str): path to JSON file.

    Returns:
        str: contents of JSON file, exactly as produced by the JSON parser (no string conversion is applied).

    Example:
        >>> result = read_json("path/to/file.json")
    """
    # NOTE(review): the annotation promises ``str`` but the decoded object
    # (typically a dict or list) is returned unchanged. Left as-is because
    # existing callers may index into the result -- confirm before converting.
    #
    # Binary mode hands raw bytes to the json module, which detects
    # UTF-8/UTF-16/UTF-32 (including a leading BOM) on its own. Text mode
    # would decode with the platform locale and choke on a BOM.
    with open(path, "rb") as stream:
        return json.load(stream)
@tool
def read_txt(path: str) -> str:
    """
    Reads a txt file and returns the contents as a string.

    Args:
        path (str): path to a text file.

    Returns:
        str: contents of the text file.

    Example:
        >>> result = read_txt("path/to/textfile.txt")
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (e.g. cp1252 on Windows).
    with open(path, "r", encoding="utf-8") as stream:
        return stream.read()
@tool
def read_csv(path: str) -> str:
    """
    Loads a CSV file and renders all of its rows as a markdown table.
    Handy for getting a look at both the layout and the values of a comma-separated file.

    Args:
        path (str): location of the CSV file (e.g., 'data.csv').

    Returns:
        str: the CSV content as a markdown string, or a message beginning with "Error reading CSV" if loading or rendering fails.
    """
    try:
        table = pd.read_csv(path)
        rendered = table.to_markdown(index=False)
    except Exception as exc:
        # Failures are reported as text so the calling agent can read them.
        return f"Error reading CSV: {exc!s}"
    return rendered
@tool
def read_excel(path: str) -> str:
    """
    Loads the first sheet of an Excel workbook and renders it as a markdown table.

    Args:
        path (str): location of the .xlsx file.

    Returns:
        str: the first sheet as a markdown string, or a message beginning with "Error reading Excel" if loading or rendering fails.
    """
    try:
        # No sheet is named, so pandas reads its default (the first) sheet.
        sheet = pd.read_excel(path, engine='openpyxl')
        rendered = sheet.to_markdown(index=False)
    except Exception as exc:
        # Failures are reported as text so the calling agent can read them.
        return f"Error reading Excel: {exc!s}"
    return rendered
@tool
def read_pdf(path: str) -> str:
    """
    Pulls the text out of a PDF file, page by page.

    Args:
        path (str): location of the PDF file.

    Returns:
        str: the extracted text, each non-empty page preceded by a "--- Page N ---" header, or a message beginning with "Error reading PDF" if extraction fails.
    """
    try:
        document = pypdf.PdfReader(path)
        # Pages that yield no text are skipped; numbering still follows the
        # page's real position in the document (1-based).
        sections = [
            f"--- Page {number} ---\n{extracted}"
            for number, page in enumerate(document.pages, start=1)
            if (extracted := page.extract_text())
        ]
        return "\n".join(sections)
    except Exception as exc:
        # Failures are reported as text so the calling agent can read them.
        return f"Error reading PDF: {exc!s}"
@tool
def inspect_csv(path: str) -> str:
    """
    Summarizes a CSV file: its column names, total row count and first 5 rows.
    Call this to learn the data layout before writing code that processes the whole file.

    Args:
        path (str): location of the CSV file.

    Returns:
        str: the summary text with the preview rows as a markdown table, or a message beginning with "Error inspecting CSV" if anything fails.
    """
    try:
        frame = pd.read_csv(path)
        column_names = list(frame.columns)
        row_count = len(frame)
        preview = frame.head(5).to_markdown(index=False)
        header = (
            f"Columns: {column_names}\n"
            f"Total Rows: {row_count}\n\n"
            "First 5 rows:\n"
        )
        return header + preview
    except Exception as exc:
        # Failures are reported as text so the calling agent can read them.
        return f"Error inspecting CSV: {exc!s}"
def create_file_reader(model: Model) -> CodeAgent:
    """Build the ``files_manager`` agent that exposes this module's file readers.

    Args:
        model: the model instance handed to the ``CodeAgent``.

    Returns:
        A ``CodeAgent`` configured with the seven reader tools defined in this
        module, the base tools enabled, the imports listed in
        ``config.authorized_libraries`` allowed, silent verbosity and a limit
        of 8 steps.
    """
    reader_tools = [
        read_yaml,
        read_json,
        read_txt,
        read_csv,
        read_pdf,
        inspect_csv,
        read_excel,
    ]
    agent = CodeAgent(
        model=model,
        tools=reader_tools,
        add_base_tools=True,
        additional_authorized_imports=authorized_libraries,
        name="files_manager",
        description="Reads a file and returns the contents as a string, multiple formats accepted.",
        verbosity_level=0,
        max_steps=8,
    )
    return agent