Spaces:

BiGuan
/

Agent

Sleeping

App Files Files Community

Agent / tools /read_file.py

BiGuan

Upload 13 files

e15103f verified 30 days ago

Raw

History Blame Contribute Delete

3.25 kB

	"""
	tools/read_file.py —— 工具④：读取本地文件（万能读取器）

	有些题目会附带一个文件（Excel 表格、PDF、Word 文档、Python 代码、纯文本等）。
	这个工具负责把这些文件的内容读成文字交给大模型。它会先看文件后缀名，再决定用什么方式读：
	不同格式的文件读法不一样，所以下面用一连串 if 分别处理。

	注意：图片要用 visual_qa（看图）、音频要用 transcribe_audio（转写），它们不归这个工具管。
	"""

	import os

	from langchain_core.tools import tool


	@tool
	def read_file(file_path: str) -> str:
	    """Read a local file and return its content as text. Handles spreadsheets
	    (.xlsx/.xls/.csv/.tsv), PDFs (.pdf), Word documents (.docx) and any plain-text or code
	    file (.txt/.py/.json/.md/...). For images use `visual_qa`; for audio use
	    `transcribe_audio`. Returns the full text so you can reason over it or parse it with
	    `python_repl`."""
	    # NOTE: the docstring above is deliberately left untouched: langchain's `@tool`
	    # decorator exposes it to the model as the tool description, so its wording is
	    # runtime text rather than documentation.
	    #
	    # Contract: `file_path` is a path on the local filesystem. The function always
	    # returns a string and never raises: a missing path yields "File not found: ..."
	    # and any failure while reading yields "Error reading file '...': ...".
	    # The reader is selected purely from the file extension (case-insensitive).

	    # Check existence first so a missing file gives a clear message rather than a
	    # library-specific error from one of the readers below.
	    if not os.path.exists(file_path):
	        return f"File not found: {file_path}"

	    ext = os.path.splitext(file_path)[1].lower()  # e.g. ".xlsx"; lower-cased so ".XLSX" matches

	    try:
	        # The format-specific libraries are imported inside their branches, so each one
	        # is only imported when a file of that type is actually read.

	        # --- Excel workbooks ---
	        if ext in (".xlsx", ".xls"):
	            import pandas as pd

	            # sheet_name=None loads every worksheet, giving a {sheet name: DataFrame} dict.
	            sheets = pd.read_excel(file_path, sheet_name=None)
	            # Render each sheet as CSV text under its own heading and concatenate them.
	            return "\n\n".join(
	                f"## Sheet: {name}\n{df.to_csv(index=False)}" for name, df in sheets.items()
	            )

	        # --- CSV / TSV text tables ---
	        if ext in (".csv", ".tsv"):
	            import pandas as pd

	            # Pick the input delimiter from the extension. The output is always
	            # comma-separated, because `to_csv` is called with its default separator.
	            sep = "\t" if ext == ".tsv" else ","
	            return pd.read_csv(file_path, sep=sep).to_csv(index=False)

	        # --- PDF documents ---
	        if ext == ".pdf":
	            from pypdf import PdfReader

	            reader = PdfReader(file_path)
	            # Extract page by page; a page with no extractable text contributes "".
	            return "\n".join((page.extract_text() or "") for page in reader.pages)

	        # --- Word documents ---
	        if ext == ".docx":
	            import docx

	            document = docx.Document(file_path)
	            lines = [p.text for p in document.paragraphs]
	            # `document.paragraphs` does not include text that lives inside tables, so
	            # table content used to be dropped silently. Append every table row after
	            # the body paragraphs, with the cells of a row separated by tabs. Documents
	            # without tables produce exactly the same output as before.
	            lines.extend(
	                "\t".join(cell.text for cell in row.cells)
	                for table in document.tables
	                for row in table.rows
	            )
	            return "\n".join(lines)

	        # --- Anything else: treat it as plain text / source code ---
	        # errors="replace" substitutes a placeholder for undecodable bytes instead of raising.
	        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
	            return f.read()
	    except Exception as e:
	        # Tool boundary: report any read failure as text so the calling agent keeps running.
	        return f"Error reading file '{file_path}': {e}"