# Final_Assignment_Template
#
# Sleeping
#
# File size: 4,257 Bytes

import argparse
import os
import time
from datetime import datetime

import requests
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langfuse import get_client
from langfuse.langchain import CallbackHandler

from agent import build_graph
from regexs import extract_last_ai_text, normalize_gaia_answer, strip_final_answer_prefix

# Root URL of the scoring API; overridable through SCORING_API_URL.
# Trailing slashes are removed so endpoint paths can be appended with "/".
BASE = os.environ.get(
    "SCORING_API_URL",
    "https://agents-course-unit4-scoring.hf.space",
).rstrip("/")


def file_name(item: dict) -> str:
    """Return the whitespace-stripped ``file_name`` of *item*.

    A missing, ``None`` or otherwise falsy ``file_name`` entry yields ``""``.
    """
    raw_name = item.get("file_name")
    if not raw_name:
        return ""
    return raw_name.strip()


def has_file(item: dict) -> bool:
    """Tell whether *item* carries a non-empty file name (see ``file_name``)."""
    attached = file_name(item)
    return len(attached) > 0


# Substring looked for in an exception's text to recognise a tool-calling
# failure; overridable through the TOOL_USE_FAILED environment variable.
TOOL_USE_FAILED = os.environ.get("TOOL_USE_FAILED", "tool_use_failed")

# Number of extra attempts per question, read from AGENT_MAX_RETRIES.
MAX_RETRIES = int(os.environ.get("AGENT_MAX_RETRIES", "2"))


# "gaia_files" directory located next to this source file.
_GAIA_FILES_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "gaia_files",
)


def answer(graph, item: dict, cfg: dict) -> str:
    """Ask the agent graph one question and return its cleaned-up answer.

    The prompt is a ``question: ...`` line, followed by a ``file_path: ...``
    line pointing into ``_GAIA_FILES_DIR`` when the item names a file.

    Args:
        graph: Object exposing ``invoke(state, config=...)``.
        item: Question record; must contain ``"question"``.
        cfg: Configuration passed to ``graph.invoke`` as ``config``.

    Returns:
        The last AI message text after ``normalize_gaia_answer`` and
        ``strip_final_answer_prefix`` have been applied, in that order.
    """
    prompt_lines = [f"question: {item['question']}"]
    attachment = file_name(item)
    if attachment:
        attachment_path = os.path.join(_GAIA_FILES_DIR, attachment)
        prompt_lines.append(f"file_path: {attachment_path}")
    prompt = "\n".join(prompt_lines)

    state = graph.invoke({"messages": [HumanMessage(content=prompt)]}, config=cfg)
    last_ai_text = extract_last_ai_text(state["messages"])
    normalized = normalize_gaia_answer(last_ai_text)
    return strip_final_answer_prefix(normalized)


# File suffixes whose attachments are handed to the agent; a question with
# any other attached file is answered with a placeholder instead.
_SUPPORTED_FILE_SUFFIXES = (".py", ".xlsx", ".mp3")


def _is_unsupported_attachment(item: dict) -> bool:
    """Return True when *item* has a file whose suffix is not supported."""
    name = file_name(item)
    # Suffix test, not substring test: "notes.pyramid.png" is not a ".py" file.
    return bool(name) and not name.endswith(_SUPPORTED_FILE_SUFFIXES)


def _answer_with_retries(graph, item: dict, cfg: dict, pause: float) -> str:
    """Call ``answer`` up to ``MAX_RETRIES + 1`` times and return the result.

    Every exception is reported and retried; *pause* seconds are slept
    between attempts (not after the last one). When all attempts fail the
    placeholder ``"error when calling the agent"`` is returned.
    """
    total_attempts = MAX_RETRIES + 1
    for attempt in range(1, total_attempts + 1):
        try:
            return answer(graph, item, cfg)
        except Exception as e:  # boundary: one bad question must not abort the run
            can_retry = attempt < total_attempts
            if TOOL_USE_FAILED in str(e) and can_retry:
                print(f"    tentative {attempt}/{total_attempts} : erreur tool calling, retry…")
            else:
                print(f"    tentative {attempt}/{total_attempts} : erreur non-tool-calling : {e}")
            if can_retry:
                time.sleep(pause)
    return "error when calling the agent"


def _submit_answers(answers: list) -> None:
    """POST *answers* to ``{BASE}/submit`` and print the score or the failure.

    Reads ``HF_USERNAME`` and ``AGENT_CODE_URL`` from the environment. Any
    failure (missing variable, network error, non-JSON body, error status)
    is printed and the function returns without raising.
    """
    missing = [v for v in ("HF_USERNAME", "AGENT_CODE_URL") if v not in os.environ]
    if missing:
        print(f"Submit skipped: missing environment variable(s): {', '.join(missing)}")
        return

    try:
        resp = requests.post(
            f"{BASE}/submit",
            json={
                "username": os.environ["HF_USERNAME"],
                "agent_code": os.environ["AGENT_CODE_URL"],
                "answers": answers,
            },
            timeout=120,
        )
    except requests.RequestException as e:
        # Connection errors and timeouts: report instead of crashing.
        print(f"Submit failed: {e}")
        return

    try:
        r = resp.json()
    except requests.JSONDecodeError:
        print(f"Submit failed: HTTP {resp.status_code}\n{resp.text[:500]}")
        return
    if resp.status_code != 200 or not isinstance(r, dict) or "score" not in r:
        print(f"Submit error (HTTP {resp.status_code}): {r}")
        return

    correct = r.get("correct_count", "?")
    attempted = r.get("total_attempted", "?")
    print(f"SCORE: {r['score']}% ({correct}/{attempted})")
    if msg := r.get("message"):
        print(msg)


def main() -> None:
    """Answer the scoring API's questions with the agent and submit them.

    Command-line options:
        --limit N   Keep only the first N questions (0 = all).
        --sleep S   Seconds to sleep between questions and between retries
                    (default from ``GROQ_EVAL_SLEEP_SECONDS``, else 2).

    Questions are fetched from ``{BASE}/questions``. Questions with an
    unsupported attached file get a placeholder answer. When
    ``TRACE_WITH_LANGFUSE`` is set, a Langfuse callback is attached to each
    run and the Langfuse client is flushed before returning, even if the
    run or the submission fails.

    Raises:
        requests.RequestException: if the question list cannot be fetched.
    """
    load_dotenv()
    p = argparse.ArgumentParser()
    p.add_argument("--limit", type=int, default=0)
    p.add_argument("--sleep", type=float, default=float(os.getenv("GROQ_EVAL_SLEEP_SECONDS", "2")))
    args = p.parse_args()

    questions_resp = requests.get(f"{BASE}/questions", timeout=30)
    # Fail with a clear HTTP error rather than on a malformed payload later.
    questions_resp.raise_for_status()
    questions = questions_resp.json()
    if args.limit:
        questions = questions[: args.limit]

    graph = build_graph()
    base_cfg = {"recursion_limit": int(os.getenv("LANGGRAPH_RECURSION_LIMIT", "80"))}
    answers = []
    run_id = datetime.now().strftime("run_%Y-%m-%d_%H-%M")
    lf = CallbackHandler() if os.getenv("TRACE_WITH_LANGFUSE") else None

    try:
        total = len(questions)
        for i, item in enumerate(questions, 1):
            print(f"[{i}/{total}] {item['task_id'][:8]}…")

            if _is_unsupported_attachment(item):
                answers.append({"task_id": item["task_id"], "submitted_answer": "has file, not processed yet"})
                print(f"la question {i} a un fichier ({file_name(item)!r}), donc non traitée.")
                continue

            # The config is the same for every attempt on this question.
            cfg = base_cfg if not lf else {
                **base_cfg,
                "callbacks": [lf],
                "run_name": f"{run_id} question {i:02d}",
                "metadata": {"langfuse_session_id": run_id},
            }
            result = _answer_with_retries(graph, item, cfg, args.sleep)
            answers.append({"task_id": item["task_id"], "submitted_answer": result})

            if args.sleep and i < total:
                time.sleep(args.sleep)

        for i, ans in enumerate(answers, 1):
            print(f"[{i}/{len(answers)}], {ans['submitted_answer']} \n")

        _submit_answers(answers)
    finally:
        # Flush on every exit path so traces are not lost when submission fails.
        if lf:
            get_client().flush()


# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()