| """ |
| tools/read_file.py —— 工具④:读取本地文件(万能读取器) |
| |
| 有些题目会附带一个文件(Excel 表格、PDF、Word 文档、Python 代码、纯文本等)。 |
| 这个工具负责把这些文件的内容读成文字交给大模型。它会先看文件后缀名,再决定用什么方式读: |
| 不同格式的文件读法不一样,所以下面用一连串 if 分别处理。 |
| |
| 注意:图片要用 visual_qa(看图)、音频要用 transcribe_audio(转写),它们不归这个工具管。 |
| """ |
|
|
| import os |
|
|
| from langchain_core.tools import tool |
|
|
|
|
# read_file: turn a local file into text, choosing the reader from the file extension.
#
# Parameters:
#   file_path -- path of the file to read; the extension is matched case-insensitively.
# Returns:
#   The extracted text. Failures are reported as strings rather than raised:
#   "File not found: ..." when the path does not exist, and
#   "Error reading file '...': ..." when any reader raises.
#
# NOTE(review): `@tool` presumably exposes the function docstring as the tool description
# shown to the model, so its wording is kept unchanged and implementation notes are
# written as comments instead -- confirm against the langchain_core version in use.
@tool
def read_file(file_path: str) -> str:
    """Read a local file and return its content as text. Handles spreadsheets
    (.xlsx/.xls/.csv/.tsv), PDFs (.pdf), Word documents (.docx) and any plain-text or code
    file (.txt/.py/.json/.md/...). For images use `visual_qa`; for audio use
    `transcribe_audio`. Returns the full text so you can reason over it or parse it with
    `python_repl`."""
    if not os.path.exists(file_path):
        return f"File not found: {file_path}"

    ext = os.path.splitext(file_path)[1].lower()
    try:
        # Third-party readers are imported inside their branch so that a missing optional
        # package only affects the file types that need it (the ImportError is reported
        # through the `except` below like any other read failure).
        if ext in (".xlsx", ".xls"):
            import pandas as pd

            # sheet_name=None loads every sheet into a {sheet name: DataFrame} mapping.
            sheets = pd.read_excel(file_path, sheet_name=None)
            return "\n\n".join(
                f"## Sheet: {name}\n{df.to_csv(index=False)}" for name, df in sheets.items()
            )

        if ext in (".csv", ".tsv"):
            import pandas as pd

            sep = "\t" if ext == ".tsv" else ","
            # Read every cell as a string and disable NA detection so values come back
            # exactly as written: without this, type inference rewrites "007" as 7,
            # "0.10" as 0.1, and turns "NA"/"null" cells into empty fields.
            table = pd.read_csv(file_path, sep=sep, dtype=str, keep_default_na=False)
            # Output is always comma-separated, including for .tsv input.
            return table.to_csv(index=False)

        if ext == ".pdf":
            from pypdf import PdfReader

            reader = PdfReader(file_path)
            # `or ""` guards against a page whose extraction yields nothing.
            return "\n".join((page.extract_text() or "") for page in reader.pages)

        if ext == ".docx":
            import docx

            document = docx.Document(file_path)
            parts = [paragraph.text for paragraph in document.paragraphs]
            # `document.paragraphs` does not include text that lives inside tables, so
            # table rows are appended after the paragraph text (one line per row, cells
            # separated by tabs). The relative order of paragraphs and tables in the
            # document is not preserved.
            for table in document.tables:
                parts.append("")  # blank line to set each table apart
                for row in table.rows:
                    parts.append("\t".join(cell.text for cell in row.cells))
            return "\n".join(parts)

        # Fallback: treat anything else as text. "utf-8-sig" decodes UTF-8 and drops a
        # leading byte-order mark if present, so it does not leak into the returned text;
        # errors="replace" keeps undecodable bytes from aborting the read.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error reading file '{file_path}': {e}"
|
|