"""GAIA 첨부 파일 처리 + 질문↔task_id 인덱스.

CodeAgent의 시그니처 제약(__call__이 question만 받음) 때문에 task_id를 직접
주입할 수 없어, 모듈 전역 mutable 컨테이너 + prefetch 인덱스로 우회한다.

흐름:
    1) BasicAgent.__init__ 단계에 prefetch_question_index() → /questions 1회 호출
       해서 {질문본문: task_id} 사전을 만들고 set_question_index() 로 등록.
    2) BasicAgent.__call__ 진입 시 set_current_task(question) 으로 현재 문제의
       task_id와 질문 본문을 _CURRENT_TASK 에 세팅.
    3) 에이전트가 get_attached_file() 을 인자 없이 호출하면 _CURRENT_TASK 의
       task_id로 채점 서버에서 파일을 받아오고, 타입별로 처리:
       - 텍스트/CSV/JSON/code: UTF-8 디코딩
       - Excel(.xlsx): 시트별 CSV
       - PDF: 페이지별 텍스트 추출 (pypdf)
       - 이미지: VLM(Qwen2.5-VL-7B)으로 현재 질문 컨텍스트에 맞춰 분석
       - 오디오: Whisper(large-v3) 전사
"""
import io
import re
import requests
from smolagents import tool

# Scoring server URL, defined here as well (same value as in app.py).
# Kept separate so the tools module still makes sense when used on its own.
_DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Mutable container refreshed on entry to BasicAgent.__call__.
# The question is used as context (prompt) when calling the image VLM.
_CURRENT_TASK = {"id": None, "question": None}
# Dict mapping question.strip() -> task_id.
_QUESTION_INDEX: dict = {}


def prefetch_question_index() -> dict:
    """Build a {question text: task_id} dict from one call to the scoring server's /questions.

    Question texts are stripped before being used as keys. Entries lacking a
    question or a task_id are skipped. When two entries share the same text but
    carry different task_ids, a warning is printed and the later one wins.

    Returns:
        The lookup dict, or an empty dict if anything goes wrong (request
        failure, bad status, unexpected payload) so the agent can still attempt
        questions that have no attachment.
    """
    try:
        response = requests.get(f"{_DEFAULT_API_URL}/questions", timeout=15)
        response.raise_for_status()
        index = {}
        for entry in response.json():
            text = (entry.get("question") or "").strip()
            task_id = entry.get("task_id")
            if not (text and task_id):
                continue
            if text in index and index[text] != task_id:
                previous = index[text]
                print(
                    "Warning: duplicate question text in prefetch index — "
                    f"task_id {previous!r} will be overwritten by {task_id!r}"
                )
            index[text] = task_id
    except Exception as e:
        # Best effort by design: a partially built index is discarded.
        print(f"Warning: could not prefetch question index: {e}")
        return {}
    return index


def set_question_index(idx: dict) -> None:
    """Install the prefetched {question text: task_id} dict as the module-level index.

    Called from BasicAgent.__init__ with the result of the prefetch step.

    Args:
        idx: Mapping of stripped question text to task_id. ``None`` is accepted
            and treated as an empty index, so later lookups simply miss instead
            of raising ``AttributeError`` on ``None.get``.
    """
    global _QUESTION_INDEX
    # The caller's dict object is stored as-is (no copy) to keep prior aliasing behavior.
    _QUESTION_INDEX = idx if idx is not None else {}


def set_current_task(question: str):
    """Record the current problem's task_id and question text in _CURRENT_TASK.

    Called on entry to BasicAgent.__call__. The task_id is looked up in
    _QUESTION_INDEX by the stripped question text; the question itself is
    stored unmodified and later serves as prompt context for image analysis.

    Returns:
        The matched task_id, or None when the question is not in the index
        (the question text is still stored in that case).
    """
    matched_id = _QUESTION_INDEX.get(question.strip())
    _CURRENT_TASK.update(id=matched_id, question=question)
    return matched_id


# --- File-type dispatch helpers ---

def _extract_filename(headers, url: str) -> str:
    """Return the attachment's filename from Content-Disposition, else from the URL.

    The name is used to recover a file extension when the server reports a
    generic Content-Type such as octet-stream.

    Resolution order:
      1. ``filename*`` (RFC 5987 extended value, ``charset'lang'pct-encoded``),
         percent-decoded with the declared charset. RFC 6266 says this form
         takes precedence over plain ``filename`` when both are present.
      2. Plain ``filename``, quoted or bare.
      3. The last path segment of ``url``, with query string and fragment removed.

    Args:
        headers: Mapping-like response headers (anything with ``.get``).
        url: The URL the file was downloaded from.

    Returns:
        The best-effort filename; may be an empty string if the URL path ends with "/".
    """
    from urllib.parse import unquote, urlsplit

    cd = headers.get("Content-Disposition") or ""

    # Parameter names are case-insensitive, hence re.IGNORECASE on both patterns.
    extended = re.search(r"(?<!\w)filename\*\s*=\s*([^;\r\n]+)", cd, re.IGNORECASE)
    if extended:
        value = extended.group(1).strip().strip('"')
        charset, encoded = "utf-8", value
        # maxsplit=2 so apostrophes inside the actual name are preserved.
        pieces = value.split("'", 2)
        if len(pieces) == 3:
            charset, encoded = pieces[0] or "utf-8", pieces[2]
        try:
            name = unquote(encoded, encoding=charset, errors="replace")
        except LookupError:
            # Unknown charset label: fall back to UTF-8 rather than giving up.
            name = unquote(encoded, encoding="utf-8", errors="replace")
        name = name.strip()
        if name:
            return name

    plain = re.search(
        r'(?<!\w)filename\s*=\s*(?:"([^"\r\n]*)"|([^;\r\n]+))', cd, re.IGNORECASE
    )
    if plain:
        quoted, bare = plain.groups()
        name = (quoted if quoted is not None else bare).strip().strip('"')
        if name:
            return name

    try:
        path = urlsplit(url).path
    except ValueError:
        # Malformed URL: keep the simple split-on-slash behavior.
        path = url
    return path.rsplit("/", 1)[-1]


def _is_excel(content_type: str, ext: str) -> bool:
    """Return True when the extension or the Content-Type indicates an Excel workbook.

    The extension check is exact and case-sensitive ("xlsx"/"xls"); the
    Content-Type check is case-insensitive and matches on the substrings
    "spreadsheet" / "excel" or on an "xlsx"/"xls" suffix.
    """
    if ext == "xlsx" or ext == "xls":
        return True
    lowered = content_type.lower()
    if lowered.endswith(("xlsx", "xls")):
        return True
    return any(marker in lowered for marker in ("spreadsheet", "excel"))


def _is_pdf(content_type: str, ext: str) -> bool:
    """Return True when the extension is exactly "pdf" or the Content-Type mentions "pdf" (any case)."""
    if ext == "pdf":
        return True
    return content_type.lower().find("pdf") != -1


def _is_image(content_type: str, ext: str) -> bool:
    """Return True for an image/* Content-Type (any case) or a known raster image extension."""
    if content_type.lower().startswith("image/"):
        return True
    return ext in ("png", "jpg", "jpeg", "webp", "gif", "bmp")


def _is_audio(content_type: str, ext: str) -> bool:
    """Return True for an audio/* Content-Type (any case) or a known audio extension."""
    if content_type.lower().startswith("audio/"):
        return True
    return ext in ("mp3", "wav", "m4a", "ogg", "flac")


# --- Per-type handlers ---

def _handle_excel(content: bytes, content_type: str) -> str:
    """Serialize an Excel workbook as one CSV section per sheet.

    Args:
        content: Raw bytes of the workbook.
        content_type: Content-Type reported by the server; echoed in the output header.

    Returns:
        "[Content-Type: ...]" followed by "--- Sheet: <name> ---" sections,
        cut to 12000 characters with a "...[truncated]" marker when longer.
        On any failure (including a missing pandas install) an
        "Excel parse error: ..." string is returned instead of raising.
    """
    try:
        import pandas as _pd

        # sheet_name=None asks pandas for every sheet, keyed by sheet name.
        workbook = _pd.read_excel(io.BytesIO(content), sheet_name=None)
        rendered = "\n\n".join(
            f"--- Sheet: {sheet_name} ---\n{frame.to_csv(index=False)}"
            for sheet_name, frame in workbook.items()
        )
        if len(rendered) > 12000:
            rendered = rendered[:12000] + "\n...[truncated]"
        return f"[Content-Type: {content_type}]\n{rendered}"
    except Exception as e:
        return f"Excel parse error: {e}"


def _handle_pdf(content: bytes, content_type: str) -> str:
    """Extract PDF body text with pypdf and return it page by page.

    Scanned (image-only) PDFs may yield empty or garbled text; no OCR is
    attempted here.

    Args:
        content: Raw bytes of the PDF.
        content_type: Content-Type reported by the server; echoed in the output header.

    Returns:
        "[PDF, N pages, Content-Type: ...]" followed by "--- Page i ---"
        sections, cut to 12000 characters with a "...[truncated]" marker when
        longer. A page whose extraction fails contributes an
        "(extraction failed: ...)" placeholder. If the document cannot be
        opened at all, a "PDF parse error: ..." string is returned.
    """

    def _page_text(page) -> str:
        # One bad page must not abort the whole document.
        try:
            return page.extract_text() or ""
        except Exception as pe:
            return f"(extraction failed: {pe})"

    try:
        from pypdf import PdfReader

        reader = PdfReader(io.BytesIO(content))
        sections = [
            f"--- Page {number} ---\n{_page_text(page)}"
            for number, page in enumerate(reader.pages, start=1)
        ]
        body = "\n\n".join(sections)
        if len(body) > 12000:
            body = body[:12000] + "\n...[truncated]"
        return f"[PDF, {len(reader.pages)} pages, Content-Type: {content_type}]\n{body}"
    except Exception as e:
        return f"PDF parse error: {e}"


def _handle_image(content: bytes, content_type: str) -> str:
    """Analyze an image with a VLM (Qwen2.5-VL-7B), guided by the current question.

    The image is sent as a base64 data URL through the Hugging Face
    InferenceClient chat-completion call. When _CURRENT_TASK holds a question,
    it is embedded in the prompt so the model focuses on what the answer needs
    rather than producing a generic caption; otherwise a generic description
    is requested.

    The original notes say the HF_TOKEN environment variable is required
    (registered as a Space secret when deployed).

    Args:
        content: Raw image bytes.
        content_type: Content-Type reported by the server.

    Returns:
        "[Image analysis (...)]" followed by the model output, or an
        "Image attached (...). VLM analysis failed: ..." string on any error so
        the agent can fall back to another strategy.
    """
    try:
        import base64
        from huggingface_hub import InferenceClient

        question = (_CURRENT_TASK.get("question") or "").strip()

        # Build the data URL; Content-Type may not be image/*, so default to PNG.
        mime = content_type.split(";")[0].strip()
        if not mime.startswith("image/"):
            mime = "image/png"
        encoded = base64.b64encode(content).decode("utf-8")
        data_url = f"data:{mime};base64,{encoded}"

        prompt = (
            "Describe the attached image in detail, including any visible text, "
            "numbers, or labels."
        )
        if question:
            prompt = (
                "Analyze the attached image and answer the following question. "
                "Read any text, numbers, or labels visible in the image. "
                "If it is a chart or table, extract the relevant data values precisely.\n\n"
                f"Question: {question}"
            )

        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        }
        client = InferenceClient(provider="auto")  # uses the HF_TOKEN environment variable
        resp = client.chat_completion(
            model="Qwen/Qwen2.5-VL-7B-Instruct",
            messages=[user_message],
            max_tokens=1024,
        )
        analysis = resp.choices[0].message.content
        header = f"[Image analysis (Content-Type: {content_type}, {len(content)} bytes)]\n"
        return header + f"{analysis}"
    except Exception as e:
        return (
            f"Image attached (Content-Type: {content_type}, {len(content)} bytes). "
            f"VLM analysis failed: {e}"
        )


def _handle_audio(content: bytes, content_type: str) -> str:
    """Transcribe audio with Whisper (large-v3) in a single inference call.

    The original notes say the HF_TOKEN environment variable is required
    (registered as a Space secret when deployed).

    Args:
        content: Raw audio bytes.
        content_type: Content-Type reported by the server.

    Returns:
        "[Audio transcription (...)]" followed by the transcript, or an
        "Audio attached (...). Transcription failed: ..." string on any error.
    """
    try:
        from huggingface_hub import InferenceClient

        asr_output = InferenceClient(provider="auto").automatic_speech_recognition(
            audio=content,
            model="openai/whisper-large-v3",
        )
        # Depending on the huggingface_hub version the result is an object with
        # a .text attribute or a plain dict; the attribute check must come first.
        if hasattr(asr_output, "text"):
            transcript = asr_output.text
        elif isinstance(asr_output, dict):
            transcript = asr_output.get("text", str(asr_output))
        else:
            transcript = str(asr_output)
        header = f"[Audio transcription (Content-Type: {content_type}, {len(content)} bytes)]\n"
        return header + f"{transcript}"
    except Exception as e:
        return (
            f"Audio attached (Content-Type: {content_type}, {len(content)} bytes). "
            f"Transcription failed: {e}"
        )


@tool
def get_attached_file() -> str:
    """Download the file attached to the CURRENT GAIA task and return its content.
    Takes no arguments — the current task_id is auto-resolved from the question.

    Use this whenever the question references a file, spreadsheet, image, audio, PDF, code listing,
    CSV, or any external resource. Returns:
      - Text/CSV/JSON/code: the decoded text (truncated to ~12k chars).
      - Excel (.xlsx): each sheet rendered as CSV (truncated).
      - PDF: extracted text per page (truncated).
      - Image (PNG/JPEG/WEBP/GIF/BMP): a vision-language model analysis focused on the current question.
      - Audio (MP3/WAV/M4A/OGG/FLAC): a Whisper transcription.
      - Other binary: a metadata description (size + content-type).
    """
    # The tool takes no task_id argument (signature constraint); the id is read
    # from the module-level _CURRENT_TASK, which set_current_task() fills in.
    current_id = _CURRENT_TASK.get("id")
    if not current_id:
        return "No task context available — likely no file attached for this question."
    try:
        url = f"{_DEFAULT_API_URL}/files/{current_id}"
        response = requests.get(url, timeout=30)
        if response.status_code == 404:
            return "No file attached to this task."
        response.raise_for_status()

        content_type = response.headers.get("Content-Type", "")
        filename = _extract_filename(response.headers, url)
        _, dot, suffix = filename.rpartition(".")
        ext = suffix.lower() if dot else ""
        payload = response.content

        # 1) Recognized binary types go first: some PDFs/SVGs decode as UTF-8,
        #    but returning them as raw text gives much poorer results.
        dispatch = (
            (_is_excel, _handle_excel),
            (_is_pdf, _handle_pdf),
            (_is_image, _handle_image),
            (_is_audio, _handle_audio),
        )
        for matches, handler in dispatch:
            if matches(content_type, ext):
                return handler(payload, content_type)

        # 2) Anything that decodes as UTF-8 is returned as text.
        try:
            text = payload.decode("utf-8")
        except UnicodeDecodeError:
            # 3) Unknown binary: return metadata only.
            return (
                f"Binary file (Content-Type: {content_type}, "
                f"size: {len(payload)} bytes). Cannot display as text. "
                f"URL: {url}"
            )
        if len(text) > 12000:
            text = text[:12000] + "\n...[truncated]"
        return f"[Content-Type: {content_type}]\n{text}"
    except Exception as e:
        return f"get_attached_file error: {e}"