serverdaun committed on
Commit
fb8f1a6
·
1 Parent(s): 3a2d999

Add tabular data interaction tools for CSV, Excel, and Parquet files, including summary, head, select, filter, describe, and to_json operations.

Browse files
Files changed (1) hide show
  1. tools.py +105 -0
tools.py CHANGED
@@ -15,6 +15,10 @@ from typing import Dict
15
  import shutil
16
  import subprocess as sp
17
  import tempfile
 
 
 
 
18
 
19
  #=========================================
20
  # Search Tools
@@ -489,3 +493,104 @@ def execute_source_file(path: str, timeout: int=10) -> str:
489
 
490
  finally:
491
  shutil.rmtree(work)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  import shutil
16
  import subprocess as sp
17
  import tempfile
18
+ import pandas as pd
19
+ import textwrap
20
+ import io
21
+ import json
22
 
23
  #=========================================
24
  # Search Tools
 
493
 
494
  finally:
495
  shutil.rmtree(work)
496
+
497
+ #=========================================
498
+ # Tabular data tools
499
+ #=========================================
500
+ MAX_BYTES_RETURN = 200000
501
+
502
+ # Helper functions
503
def _load_table(path: Path, sheet: str) -> pd.DataFrame:
    """
    Load a tabular file into a DataFrame, choosing the reader from the file extension.

    Args:
        path (Path): The path to the file. Its suffix (compared case-insensitively)
            selects the reader.
        sheet (str): The worksheet to load. Only used for Excel files.
    Returns:
        pd.DataFrame: The loaded table.
    Raises:
        ValueError: If the extension is not .csv, .tsv, .xlsx, .xls or .parquet.
    """
    ext = path.suffix.lower()
    if ext == ".csv":
        return pd.read_csv(path)
    if ext == ".tsv":
        # TSV is tab-delimited; with read_csv's default comma separator every
        # row would collapse into a single column.
        return pd.read_csv(path, sep="\t")
    if ext in (".xlsx", ".xls"):
        return pd.read_excel(path, sheet_name=sheet)
    # Compare with ==, not `in (".parquet")`: the latter is a substring test on a
    # plain string, so "" (no extension) or ".p" would be routed to read_parquet.
    if ext == ".parquet":
        return pd.read_parquet(path)
    raise ValueError(f"Unsupported file extension: {ext}")
520
+
521
def _safe_truncate(text: str, limit: int = MAX_BYTES_RETURN) -> tuple[str, bool]:
    """
    Cap a string at a maximum UTF-8 encoded size.

    Args:
        text (str): The text to cap.
        limit (int): Maximum size of the UTF-8 encoding, in bytes.
    Returns:
        tuple[str, bool]: The (possibly shortened) text, and True when bytes were cut off.
    """
    encoded = text.encode("utf-8")
    over_limit = len(encoded) > limit
    kept = encoded[:limit] if over_limit else encoded
    # A cut can land in the middle of a multi-byte character; errors="ignore"
    # drops the dangling partial sequence instead of raising.
    return kept.decode("utf-8", errors="ignore"), over_limit
535
+
536
+
537
def _to_json_lines(frame: pd.DataFrame) -> str:
    """Serialize a DataFrame as JSON Lines (one JSON object per row)."""
    return frame.to_json(orient="records", lines=True)


@tool
def interact_tabular(file_path: str, operation: str = "summary", sheet: str = "Sheet1") -> str:
    """
    Interact with a tabular data file, such as a CSV, Excel, or Parquet file.

    Args:
        file_path (str): The path to the file.
        operation (str): The operation to perform: summary | head [N] | select col1,col2 | filter <expr>
            | describe | to_json. The operation keyword is case-insensitive; its argument
            (column names, filter expression) is used exactly as written.
        sheet (str): The sheet to load (only used for Excel files).
    Returns:
        str: A report with the operation, its result (JSON Lines for every operation
            except summary) and an INFO section (file, sheet, truncated, rows_returned).
    Raises:
        ValueError: If the operation is empty or unknown, or if select/filter is given
            without an argument.
    """
    path = Path(file_path).expanduser().resolve(strict=True)
    df = _load_table(path, sheet)

    # Split first and lower-case only the keyword: lower-casing the whole string
    # would corrupt case-sensitive column names and string literals in filters.
    parts = operation.strip().split(maxsplit=1)
    if not parts:
        raise ValueError(f"Unsupported operation: {operation}")
    op = parts[0].lower()
    arg = parts[1].strip() if len(parts) > 1 else ""

    # `frame` is the table being serialized; it stays None for the text-only summary.
    frame = None
    if op == "summary":
        # Column labels are not necessarily strings (e.g. integer headers).
        # dtypes go on their own lines so to_string()'s alignment is preserved.
        result = (
            f"rows: {len(df)}\n"
            f"columns: {', '.join(map(str, df.columns))}\n"
            f"dtypes:\n{df.dtypes.to_string()}\n"
        )
    elif op == "head":
        frame = df.head(int(arg) if arg else 5)
    elif op == "select":
        cols = [c.strip() for c in arg.split(",") if c.strip()]
        if not cols:
            raise ValueError("select requires a comma-separated list of columns")
        frame = df[cols]
    elif op == "filter":
        if not arg:
            raise ValueError("filter requires an expression")
        # SECURITY NOTE(review): the expression is evaluated by pandas' query engine.
        # Do not expose this operation to untrusted callers without validating `arg`.
        frame = df.query(arg, engine="python")
    elif op == "describe":
        # orient="records" drops the index, which for describe() holds the statistic
        # names (count, mean, ...); move it into a column so rows stay labelled.
        frame = df.describe(include="all").rename_axis("statistic").reset_index()
    elif op == "to_json":
        frame = df
    else:
        raise ValueError(f"Unsupported operation: {operation}")

    if frame is not None:
        result = _to_json_lines(frame)

    result, truncated = _safe_truncate(result)

    if frame is None:
        rows_returned = 0
    elif truncated:
        # Only newline-terminated records survived the cut intact.
        rows_returned = result.count("\n")
    else:
        rows_returned = len(frame)

    info = {
        "file": str(path),
        "sheet": sheet,
        "truncated": truncated,
        "rows_returned": rows_returned,
    }
    return (
        f"OPERATION: {operation}\n"
        f"RESULT:\n{result}\n"
        f"INFO:\n{json.dumps(info, indent=2)}"
    )