Spaces:
Build error
Build error
Commit ·
fb8f1a6
1
Parent(s): 3a2d999
Add tabular data interaction tools for CSV, Excel, and Parquet files, including summary, head, select, filter, describe, and to_json operations.
Browse files
tools.py
CHANGED
|
@@ -15,6 +15,10 @@ from typing import Dict
|
|
| 15 |
import shutil
|
| 16 |
import subprocess as sp
|
| 17 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
#=========================================
|
| 20 |
# Search Tools
|
|
@@ -489,3 +493,104 @@ def execute_source_file(path: str, timeout: int=10) -> str:
|
|
| 489 |
|
| 490 |
finally:
|
| 491 |
shutil.rmtree(work)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
import shutil
|
| 16 |
import subprocess as sp
|
| 17 |
import tempfile
|
| 18 |
+
import pandas as pd
|
| 19 |
+
import textwrap
|
| 20 |
+
import io
|
| 21 |
+
import json
|
| 22 |
|
| 23 |
#=========================================
|
| 24 |
# Search Tools
|
|
|
|
| 493 |
|
| 494 |
finally:
|
| 495 |
shutil.rmtree(work)
|
| 496 |
+
|
| 497 |
+
#=========================================
# Tabular data tools
#=========================================
# Hard cap on the size (in UTF-8 bytes) of any tool result returned to the
# caller; anything larger is clipped by _safe_truncate before being returned.
MAX_BYTES_RETURN = 200000

# Helper functions
|
| 503 |
+
def _load_table(path: Path, sheet: str) -> pd.DataFrame:
    """
    Load a table from a file into a DataFrame.

    Args:
        path (Path): The path to the file.
        sheet (str): The worksheet name (only used for Excel files).

    Returns:
        pd.DataFrame: The loaded table.

    Raises:
        ValueError: If the file extension is not supported.
    """
    ext = path.suffix.lower()
    if ext == ".csv":
        return pd.read_csv(path)
    if ext == ".tsv":
        # BUG FIX: .tsv is tab-separated; the previous code used the default
        # comma separator, which loaded each row as a single column.
        return pd.read_csv(path, sep="\t")
    if ext in (".xlsx", ".xls"):
        return pd.read_excel(path, sheet_name=sheet)
    if ext == ".parquet":
        # BUG FIX: the original `ext in (".parquet")` tested substring
        # membership in a *string* (the parens are not a tuple), so partial
        # extensions such as ".parq" also matched.
        return pd.read_parquet(path)
    raise ValueError(f"Unsupported file extension: {ext}")
|
| 520 |
+
|
| 521 |
+
def _safe_truncate(text: str, limit: int = MAX_BYTES_RETURN) -> tuple[str, bool]:
    """
    Clip text so that its UTF-8 encoding fits within a byte budget.

    Args:
        text (str): The text to clip.
        limit (int): Maximum allowed size of the encoded text, in bytes.

    Returns:
        tuple[str, bool]: The (possibly clipped) text and a flag that is
        True when clipping occurred.
    """
    encoded = text.encode("utf-8")
    if len(encoded) <= limit:
        return text, False
    # The byte cut may land mid-codepoint; errors="ignore" silently drops
    # the dangling partial character on decode.
    return encoded[:limit].decode("utf-8", errors="ignore"), True
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
@tool
def interact_tabular(file_path: str, operation: str = "summary", sheet: str = "Sheet1") -> str:
    """
    Interact with a tabular data file, such as a CSV, Excel, or Parquet file.

    Args:
        file_path (str): The path to the file.
        operation (str): The operation to perform: summary | head [N] |
            select col1,col2 | filter <expr> | describe | to_json
        sheet (str): The worksheet to load (Excel files only).

    Returns:
        str: The operation result followed by a JSON info footer.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the operation is empty or unsupported.
    """
    path = Path(file_path).expanduser().resolve(strict=True)
    df = _load_table(path, sheet)
    # BUG FIX: only the operation *keyword* is case-insensitive. The original
    # lowercased the whole string, mangling case-sensitive arguments (column
    # names in `select`, identifiers in `filter` expressions).
    pieces = operation.split(maxsplit=1)
    if not pieces:
        raise ValueError(f"Unsupported operation: {operation}")
    op = pieces[0].lower()
    args = pieces[1:]
    if op == "summary":
        result = textwrap.dedent(f"""\
            rows: {len(df)}
            columns: {", ".join(df.columns)}
            dtypes: {df.dtypes.to_string()}
            """)
    elif op == "head":
        n = int(args[0]) if args else 5
        buf = io.StringIO()
        df.head(n).to_json(buf, orient="records", lines=True)
        result = buf.getvalue()
    elif op == "select":
        cols = [c.strip() for c in args[0].split(",")]
        buf = io.StringIO()
        df[cols].to_json(buf, orient="records", lines=True)
        result = buf.getvalue()
    elif op == "filter":
        buf = io.StringIO()
        df.query(args[0], engine="python").to_json(buf, orient="records", lines=True)
        result = buf.getvalue()
    elif op == "describe":
        buf = io.StringIO()
        # BUG FIX: keep the statistic labels (count/mean/std/...) — with the
        # bare orient="records" the index was dropped, leaving unlabeled rows.
        df.describe(include="all").reset_index().to_json(buf, orient="records", lines=True)
        result = buf.getvalue()
    elif op == "to_json":
        buf = io.StringIO()
        df.to_json(buf, orient="records", lines=True)
        result = buf.getvalue()
    else:
        raise ValueError(f"Unsupported operation: {operation}")

    result, truncated = _safe_truncate(result)

    info = {
        "file": str(path),
        "sheet": sheet,
        "truncated": truncated,
        # BUG FIX: `result.count("\n") - 1` under-counted, because pandas
        # `lines=True` output carries no trailing newline; count the
        # non-empty lines instead.
        "rows_returned": sum(1 for line in result.splitlines() if line.strip()),
    }
    return (
        f"OPERATION: {operation}\n"
        f"RESULT:\n{result}\n"
        f"INFO:\n{json.dumps(info, indent=2)}"
    )
|