"""
tools/read_file.py —— 工具④：读取本地文件（万能读取器）

有些题目会附带一个文件（Excel 表格、PDF、Word 文档、Python 代码、纯文本等）。
这个工具负责把这些文件的内容读成文字交给大模型。它会先看文件后缀名，再决定用什么方式读：
不同格式的文件读法不一样，所以下面用一连串 if 分别处理。

注意：图片要用 visual_qa（看图）、音频要用 transcribe_audio（转写），它们不归这个工具管。
"""

import os

from langchain_core.tools import tool


@tool
def read_file(file_path: str) -> str:
    """Read a local file and return its content as text. Handles spreadsheets
    (.xlsx/.xls/.csv/.tsv), PDFs (.pdf), Word documents (.docx) and any plain-text or code
    file (.txt/.py/.json/.md/...). For images use `visual_qa`; for audio use
    `transcribe_audio`. Returns the full text so you can reason over it or parse it with
    `python_repl`."""
    # NOTE: LangChain's @tool uses the docstring above as the tool description shown to the
    # model, so its wording is deliberately left untouched; extra notes live in comments.
    #
    # Contract: this function does not raise for read failures. A missing file, a missing
    # optional library (pandas / pypdf / python-docx are imported lazily inside the `try`),
    # or a parse error is reported as a human-readable string instead.

    # Bail out early when the path does not exist so the caller gets a clear message.
    if not os.path.exists(file_path):
        return f"File not found: {file_path}"

    # Lower-cased extension (e.g. ".xlsx") selects the reader below.
    ext = os.path.splitext(file_path)[1].lower()
    try:
        # --- Excel workbooks ---
        if ext in (".xlsx", ".xls"):
            import pandas as pd

            # sheet_name=None loads every worksheet as a {sheet name: DataFrame} dict.
            sheets = pd.read_excel(file_path, sheet_name=None)
            # Render each sheet as CSV text under its own heading.
            return "\n\n".join(
                f"## Sheet: {name}\n{df.to_csv(index=False)}" for name, df in sheets.items()
            )

        # --- CSV / TSV text tables ---
        if ext in (".csv", ".tsv"):
            import pandas as pd

            # Comma for CSV, tab for TSV.
            sep = "\t" if ext == ".tsv" else ","
            # Open the file ourselves so undecodable bytes are replaced rather than aborting
            # the read (same tolerance as the plain-text fallback below). "utf-8-sig" behaves
            # like UTF-8 but also drops a leading byte-order mark if one is present.
            with open(
                file_path, "r", encoding="utf-8-sig", errors="replace", newline=""
            ) as handle:
                # dtype=str + keep_default_na=False keep every field exactly as written:
                # no stripped leading zeros ("007" -> 7), no ints turned into floats, and
                # literal strings such as "NA" or "null" are not blanked out as missing.
                table = pd.read_csv(handle, sep=sep, dtype=str, keep_default_na=False)
            return table.to_csv(index=False)

        # --- PDF documents ---
        if ext == ".pdf":
            from pypdf import PdfReader

            reader = PdfReader(file_path)
            # Extract text page by page; a page with no extractable text contributes "".
            return "\n".join((page.extract_text() or "") for page in reader.pages)

        # --- Word documents ---
        if ext == ".docx":
            import docx

            document = docx.Document(file_path)
            # Body paragraphs first, exactly as before.
            parts = ["\n".join(p.text for p in document.paragraphs)]
            # `document.paragraphs` does not include text that lives inside tables, so
            # append each top-level table, one row per line with tab-separated cells.
            # Documents without tables produce the same output as the paragraphs alone.
            for index, table in enumerate(document.tables, start=1):
                rows = ("\t".join(cell.text for cell in row.cells) for row in table.rows)
                parts.append(f"## Table {index}\n" + "\n".join(rows))
            return "\n\n".join(parts)

        # --- Anything else: treat it as plain text / source code ---
        # errors="replace" substitutes a placeholder for undecodable bytes instead of raising.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Deliberate catch-all at the tool boundary: turn any failure into a message.
        return f"Error reading file '{file_path}': {e}"