Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / tools /attachments.py

jwlee-ai

Upload folder using huggingface_hub

4a5f5e9 verified 10 days ago

raw

history blame contribute delete

12.7 kB

	"""GAIA 첨부 파일 처리 + 질문↔task_id 인덱스.

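	Because of CodeAgent's signature constraint (__call__ receives only the question),
	the task_id cannot be injected directly, so this module works around it with a
	module-level mutable container plus a prefetched index.

	Flow:
	1) During BasicAgent.__init__, prefetch_question_index() calls /questions once to
	build a {question text: task_id} dict, which is registered via set_question_index().
	2) On entry to BasicAgent.__call__, set_current_task(question) stores the current
	task's task_id and question text in _CURRENT_TASK.
	3) When the agent calls get_attached_file() with no arguments, the file is fetched
	from the scoring server using the task_id in _CURRENT_TASK and handled by type:
	- Text/CSV/JSON/code: UTF-8 decoding
	- Excel (.xlsx): one CSV per sheet
	- PDF: per-page text extraction (pypdf)
	- Image: analysed by a VLM (Qwen2.5-VL-7B) in the context of the current question
	- Audio: Whisper (large-v3) transcription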
	"""
	import io
	import re
	import requests
	from smolagents import tool

	# The scoring-server URL is defined here as well (same value as in app.py),
	# kept separate so the tools module still makes sense when used on its own.
	_DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

	# Mutable container refreshed on entry to BasicAgent.__call__.
	# The question is used as prompt context when the image VLM is called.
	_CURRENT_TASK = {"id": None, "question": None}
	# Maps question.strip() -> task_id.
	_QUESTION_INDEX: dict = {}


	def prefetch_question_index() -> dict:
	    """Build a ``{question text: task_id}`` dict from a single ``/questions`` call.

	    Question texts are stripped before being used as keys. Entries whose
	    question text is empty or whose ``task_id`` is falsy are skipped. When two
	    entries share the same text but carry different task ids, a warning is
	    printed and the later entry wins.

	    Returns:
	        The populated index, or an empty dict if anything goes wrong (a
	        warning is printed), so the agent can still attempt questions that
	        have no attachment.
	    """
	    try:
	        response = requests.get(f"{_DEFAULT_API_URL}/questions", timeout=15)
	        response.raise_for_status()
	        index = {}
	        for entry in response.json():
	            text = (entry.get("question") or "").strip()
	            task_id = entry.get("task_id")
	            if not (text and task_id):
	                continue
	            if text in index and index[text] != task_id:
	                print(
	                    "Warning: duplicate question text in prefetch index — "
	                    f"task_id {index[text]!r} will be overwritten by {task_id!r}"
	                )
	            index[text] = task_id
	        return index
	    except Exception as exc:
	        # Deliberately best-effort: never let index building break agent start-up.
	        print(f"Warning: could not prefetch question index: {exc}")
	        return {}


	def set_question_index(idx: dict) -> None:
	    """Install ``idx`` as the module-wide question -> task_id index.

	    The module global is rebound to the very object that was passed in (no
	    copy is made), so later lookups in this module see that dict.
	    """
	    global _QUESTION_INDEX
	    _QUESTION_INDEX = idx


	def set_current_task(question: str):
	    """Record the task that is about to be solved in the module-level state.

	    The stripped question is looked up in ``_QUESTION_INDEX``; the resulting
	    task id (``None`` when there is no match) and the question exactly as
	    given are written into ``_CURRENT_TASK``. The stored question is later
	    used as prompt context for image attachments.

	    Returns:
	        The resolved task id, or ``None`` if the question is not in the index.
	    """
	    task_id = _QUESTION_INDEX.get(question.strip())
	    _CURRENT_TASK.update(id=task_id, question=question)
	    return task_id


	# --- 파일 타입 분기 헬퍼 ---

	def _extract_filename(headers, url: str) -> str:
	    """Return the attachment's filename from the response headers, or from the URL.

	    The name is used to derive a file extension when the server only reports
	    a generic Content-Type such as ``application/octet-stream``.

	    Lookup order:
	      1. ``filename*=charset'lang'value`` (RFC 5987 extended form). The value
	         is percent-decoded with the declared charset.
	      2. Plain ``filename=value`` (quoted or unquoted).
	      3. The last path segment of ``url``, ignoring any query string or
	         fragment.

	    Args:
	        headers: Mapping of response headers; only ``Content-Disposition`` is
	            read. A missing or ``None`` value is treated as empty.
	        url: URL the file was downloaded from, used as the fallback.

	    Returns:
	        The best filename found; may be an empty string if the URL path ends
	        with ``/``.
	    """
	    from urllib.parse import unquote, urlsplit

	    disposition = headers.get("Content-Disposition") or ""

	    # Extended form first: when both forms are present it is the more precise one.
	    extended = re.search(
	        r"filename\*\s*=\s*([^'\s;]*)'[^']*'([^;\r\n]+)",
	        disposition,
	        re.IGNORECASE,
	    )
	    if extended:
	        charset = extended.group(1) or "utf-8"
	        encoded = extended.group(2).strip().strip('"')
	        try:
	            name = unquote(encoded, encoding=charset, errors="replace")
	        except LookupError:
	            # Unknown charset label: fall back to the UTF-8 default.
	            name = unquote(encoded)
	        if name:
	            return name

	    # Plain form; also tolerates a malformed "filename*=" without a charset part.
	    plain = re.search(
	        r'filename\*?\s*=\s*"?([^";\r\n]+)"?',
	        disposition,
	        re.IGNORECASE,
	    )
	    if plain:
	        name = plain.group(1).strip().strip('"')
	        if name:
	            return name

	    # Use only the path so "?query" / "#fragment" cannot corrupt the extension.
	    return urlsplit(url).path.rsplit("/", 1)[-1]


	def _is_excel(content_type: str, ext: str) -> bool:
	    """Tell whether the attachment should be handled as an Excel workbook.

	    True when the extension is ``xlsx``/``xls``, or when the lower-cased
	    Content-Type mentions "spreadsheet" or "excel" or ends with ``xlsx``/``xls``.
	    """
	    if ext == "xlsx" or ext == "xls":
	        return True
	    lowered = content_type.lower()
	    if "spreadsheet" in lowered or "excel" in lowered:
	        return True
	    return lowered.endswith(("xlsx", "xls"))


	def _is_pdf(content_type: str, ext: str) -> bool:
	    """Tell whether the attachment is a PDF, by extension or by Content-Type."""
	    if ext == "pdf":
	        return True
	    return "pdf" in content_type.lower()


	def _is_image(content_type: str, ext: str) -> bool:
	    """Tell whether the attachment is an image.

	    True for the extensions png/jpg/jpeg/webp/gif/bmp, or for any
	    Content-Type that starts with ``image/`` (case-insensitive).
	    """
	    if ext in ("png", "jpg", "jpeg", "webp", "gif", "bmp"):
	        return True
	    return content_type.lower().startswith("image/")


	def _is_audio(content_type: str, ext: str) -> bool:
	    """Tell whether the attachment is audio.

	    True for the extensions mp3/wav/m4a/ogg/flac, or for any Content-Type
	    that starts with ``audio/`` (case-insensitive).
	    """
	    if ext in ("mp3", "wav", "m4a", "ogg", "flac"):
	        return True
	    return content_type.lower().startswith("audio/")


	# --- 타입별 핸들러 ---

	def _handle_excel(content: bytes, content_type: str) -> str:
	    """Render every sheet of a workbook as CSV text.

	    Each sheet is emitted under a ``--- Sheet: <name> ---`` header and the
	    sheets are separated by blank lines. The combined text is cut at 12000
	    characters, with a truncation marker appended.

	    Returns:
	        The rendered text prefixed with the Content-Type, or an
	        ``Excel parse error: ...`` message if reading fails for any reason
	        (including pandas being unavailable).
	    """
	    try:
	        import pandas as _pd

	        # sheet_name=None makes pandas return {sheet name: DataFrame} for all sheets.
	        workbook = _pd.read_excel(io.BytesIO(content), sheet_name=None)
	        rendered = "\n\n".join(
	            f"--- Sheet: {sheet_name} ---\n{frame.to_csv(index=False)}"
	            for sheet_name, frame in workbook.items()
	        )
	        if len(rendered) > 12000:
	            rendered = rendered[:12000] + "\n...[truncated]"
	        return f"[Content-Type: {content_type}]\n{rendered}"
	    except Exception as exc:
	        return f"Excel parse error: {exc}"


	def _handle_pdf(content: bytes, content_type: str) -> str:
	    """Extract the text of a PDF with pypdf, page by page.

	    Each page is emitted under a ``--- Page N ---`` header (1-based). A page
	    whose extraction raises is replaced by an ``(extraction failed: ...)``
	    note rather than aborting the whole document. The combined text is cut at
	    12000 characters. Scanned (image-only) PDFs may yield empty or garbled
	    text; recovering from that is left to the caller.

	    Returns:
	        The extracted text prefixed with a summary line, or a
	        ``PDF parse error: ...`` message if the document cannot be read.
	    """
	    try:
	        from pypdf import PdfReader

	        reader = PdfReader(io.BytesIO(content))
	        sections = []
	        for number, page in enumerate(reader.pages, start=1):
	            try:
	                body = page.extract_text() or ""
	            except Exception as page_error:
	                body = f"(extraction failed: {page_error})"
	            sections.append(f"--- Page {number} ---\n{body}")
	        document = "\n\n".join(sections)
	        if len(document) > 12000:
	            document = document[:12000] + "\n...[truncated]"
	        page_count = len(reader.pages)
	        return f"[PDF, {page_count} pages, Content-Type: {content_type}]\n{document}"
	    except Exception as exc:
	        return f"PDF parse error: {exc}"


	def _handle_image(content: bytes, content_type: str) -> str:
	    """Analyse an image with a vision-language model (Qwen2.5-VL-7B-Instruct).

	    The image is sent as a base64 data URL through the Hugging Face
	    ``InferenceClient`` chat-completion API. If a current question is stored
	    in ``_CURRENT_TASK`` it is embedded in the prompt so the model focuses on
	    what the answer needs; otherwise a generic description is requested.

	    The client is created without an explicit token, so credentials (e.g. the
	    HF_TOKEN environment variable / Space secret) must be available.

	    Returns:
	        The model's analysis prefixed with a summary line, or a message
	        describing the failure so the agent can try another strategy.
	    """
	    try:
	        import base64
	        from huggingface_hub import InferenceClient

	        question = (_CURRENT_TASK.get("question") or "").strip()

	        # Build the data URL; fall back to image/png when the reported type is
	        # not an image/* type.
	        mime = content_type.split(";")[0].strip()
	        if not mime.startswith("image/"):
	            mime = "image/png"
	        encoded = base64.b64encode(content).decode("utf-8")
	        data_url = f"data:{mime};base64,{encoded}"

	        generic_prompt = (
	            "Describe the attached image in detail, including any visible text, "
	            "numbers, or labels."
	        )
	        focused_prompt = (
	            "Analyze the attached image and answer the following question. "
	            "Read any text, numbers, or labels visible in the image. "
	            "If it is a chart or table, extract the relevant data values precisely.\n\n"
	            f"Question: {question}"
	        )
	        prompt = focused_prompt if question else generic_prompt

	        user_message = {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": prompt},
	                {"type": "image_url", "image_url": {"url": data_url}},
	            ],
	        }
	        client = InferenceClient(provider="auto")
	        response = client.chat_completion(
	            model="Qwen/Qwen2.5-VL-7B-Instruct",
	            messages=[user_message],
	            max_tokens=1024,
	        )
	        analysis = response.choices[0].message.content
	        header = f"[Image analysis (Content-Type: {content_type}, {len(content)} bytes)]\n"
	        return header + f"{analysis}"
	    except Exception as exc:
	        return (
	            f"Image attached (Content-Type: {content_type}, {len(content)} bytes). "
	            f"VLM analysis failed: {exc}"
	        )


	def _handle_audio(content: bytes, content_type: str) -> str:
	    """Transcribe audio with Whisper (openai/whisper-large-v3) in a single call.

	    The client is created without an explicit token, so credentials (e.g. the
	    HF_TOKEN environment variable / Space secret) must be available.

	    Returns:
	        The transcription prefixed with a summary line, or a message
	        describing the failure.
	    """
	    try:
	        from huggingface_hub import InferenceClient

	        outcome = InferenceClient(provider="auto").automatic_speech_recognition(
	            audio=content,
	            model="openai/whisper-large-v3",
	        )
	        # Depending on the huggingface_hub version the result is either an
	        # object exposing ``.text`` or a plain dict; support both.
	        if hasattr(outcome, "text"):
	            transcript = outcome.text
	        elif isinstance(outcome, dict):
	            transcript = outcome.get("text", str(outcome))
	        else:
	            transcript = str(outcome)
	        header = f"[Audio transcription (Content-Type: {content_type}, {len(content)} bytes)]\n"
	        return header + f"{transcript}"
	    except Exception as exc:
	        return (
	            f"Audio attached (Content-Type: {content_type}, {len(content)} bytes). "
	            f"Transcription failed: {exc}"
	        )


	@tool
	def get_attached_file() -> str:
	    """Download the file attached to the CURRENT GAIA task and return its content.
	    Takes no arguments — the current task_id is auto-resolved from the question.

	    Use this whenever the question references a file, spreadsheet, image, audio, PDF, code listing,
	    CSV, or any external resource. Returns:
	    - Text/CSV/JSON/code: the decoded text (truncated to ~12k chars).
	    - Excel (.xlsx): each sheet rendered as CSV (truncated).
	    - PDF: extracted text per page (truncated).
	    - Image (PNG/JPEG/WEBP/GIF/BMP): a vision-language model analysis focused on the current question.
	    - Audio (MP3/WAV/M4A/OGG/FLAC): a Whisper transcription.
	    - Other binary: a metadata description (size + content-type).
	    """
	    # The tool signature cannot take a task_id, so it is read from the
	    # module-level _CURRENT_TASK, which set_current_task() fills in.
	    task_id = _CURRENT_TASK.get("id")
	    if not task_id:
	        return "No task context available — likely no file attached for this question."
	    try:
	        url = f"{_DEFAULT_API_URL}/files/{task_id}"
	        response = requests.get(url, timeout=30)
	        if response.status_code == 404:
	            return "No file attached to this task."
	        response.raise_for_status()
	        content_type = response.headers.get("Content-Type", "")
	        filename = _extract_filename(response.headers, url)
	        ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
	        payload = response.content

	        # 1) Recognised binary formats first, in a fixed priority order. Some
	        #    PDF/SVG payloads decode as UTF-8, but returning them as raw text
	        #    would be far less useful than the dedicated handler's output.
	        handlers = (
	            (_is_excel, _handle_excel),
	            (_is_pdf, _handle_pdf),
	            (_is_image, _handle_image),
	            (_is_audio, _handle_audio),
	        )
	        for matches, handle in handlers:
	            if matches(content_type, ext):
	                return handle(payload, content_type)

	        # 2) Anything that decodes as UTF-8 is returned as text. "utf-8-sig"
	        #    behaves like "utf-8" but drops a leading byte-order mark, which
	        #    would otherwise leak into the text as U+FEFF (common in
	        #    Excel-exported CSV files).
	        try:
	            text = payload.decode("utf-8-sig")
	        except UnicodeDecodeError:
	            text = None
	        if text is not None:
	            if len(text) > 12000:
	                text = text[:12000] + "\n...[truncated]"
	            return f"[Content-Type: {content_type}]\n{text}"

	        # 3) Unknown binary: report metadata only.
	        return (
	            f"Binary file (Content-Type: {content_type}, "
	            f"size: {len(payload)} bytes). Cannot display as text. "
	            f"URL: {url}"
	        )
	    except Exception as exc:
	        # Tool boundary: always hand the agent a string instead of raising.
	        return f"get_attached_file error: {exc}"