| """ |
| LLM Client Module - Multi-Provider LLM Integration |
| Author: @mangobee |
| Date: 2026-01-02 |
| |
| Handles all LLM calls for: |
| - Planning (question analysis and execution plan generation) |
| - Tool selection (function calling) |
| - Answer synthesis (factoid answer generation from evidence) |
| - Conflict resolution (evaluating contradictory information) |
| |
| Based on Level 5 decision: Gemini 2.0 Flash (primary/free) + Claude Sonnet 4.5 (fallback/paid) |
| Based on Level 6 decision: LLM function calling for tool selection |
| Pattern: Matches Stage 2 tools (Gemini primary, Claude fallback) |
| """ |
|
|
| import os |
| import logging |
| import time |
| from typing import List, Dict, Optional, Any, Callable |
| from anthropic import Anthropic |
| import google.generativeai as genai |
| from huggingface_hub import InferenceClient |
| from groq import Groq |
|
|
| |
| |
| |
|
|
| |
# Anthropic model used as the paid fallback (Level 5 decision in module docstring).
CLAUDE_MODEL = "claude-sonnet-4-5-20250929"

# Google Gemini model used as the free-tier primary.
GEMINI_MODEL = "gemini-2.0-flash-exp"

# HuggingFace Inference API model — first free-tier fallback in the chain.
HF_MODEL = "Qwen/Qwen2.5-72B-Instruct"

# Groq-hosted model — second free-tier fallback in the chain.
GROQ_MODEL = "qwen/qwen3-32b"

# Temperature 0 for deterministic, reproducible outputs across providers.
TEMPERATURE = 0
# Token budget for planning / tool-selection calls (synthesis uses a smaller cap).
MAX_TOKENS = 4096

# Module-level logger; handlers/level are configured by the application.
logger = logging.getLogger(__name__)
|
|
| |
| |
| |
|
|
|
|
def retry_with_backoff(func: Callable, max_retries: int = 3) -> Any:
    """
    Retry function with exponential backoff on quota errors.

    Handles:
    - 429 rate limit errors
    - Quota exceeded / "too many requests" errors

    Backoff is 2**attempt seconds (1s, 2s, 4s, ...).

    Args:
        func: Function to retry (should be a lambda or callable with no args)
        max_retries: Maximum number of retry attempts (default: 3)

    Returns:
        Result of successful function call

    Raises:
        ValueError: If max_retries is less than 1.
        Exception: If all retries exhausted or non-quota error encountered
    """
    # Guard against max_retries <= 0, which previously returned None silently.
    if max_retries < 1:
        raise ValueError("max_retries must be >= 1")

    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            error_str = str(e).lower()

            # Heuristic text match covering the quota error formats of all
            # four providers (no shared exception type is available).
            is_quota_error = (
                "429" in error_str
                or "quota" in error_str
                or "rate limit" in error_str
                or "too many requests" in error_str
            )

            if is_quota_error and attempt < max_retries - 1:
                wait_time = 2**attempt  # exponential backoff: 1s, 2s, 4s, ...
                logger.warning(
                    f"Quota/rate limit error (attempt {attempt + 1}/{max_retries}): {e}. "
                    f"Retrying in {wait_time}s..."
                )
                time.sleep(wait_time)
                continue

            # Non-quota error, or final attempt: propagate to the caller.
            raise
|
|
|
|
| |
| |
| |
|
|
|
|
def create_claude_client() -> Anthropic:
    """Construct an Anthropic API client.

    Reads the key from the ANTHROPIC_API_KEY environment variable.

    Raises:
        ValueError: If the environment variable is missing or empty.
    """
    key = os.getenv("ANTHROPIC_API_KEY")
    if not key:
        raise ValueError("ANTHROPIC_API_KEY environment variable not set")

    logger.info(f"Initializing Anthropic client with model: {CLAUDE_MODEL}")
    return Anthropic(api_key=key)
|
|
|
|
def create_gemini_client():
    """Configure the google-generativeai SDK and return a GenerativeModel.

    Reads the key from the GOOGLE_API_KEY environment variable.

    Raises:
        ValueError: If the environment variable is missing or empty.
    """
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")

    genai.configure(api_key=key)
    logger.info(f"Initializing Gemini client with model: {GEMINI_MODEL}")
    return genai.GenerativeModel(GEMINI_MODEL)
|
|
|
|
def create_hf_client() -> InferenceClient:
    """Construct a HuggingFace Inference API client.

    Reads the token from the HF_TOKEN environment variable.

    Raises:
        ValueError: If the environment variable is missing or empty.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        raise ValueError("HF_TOKEN environment variable not set")

    logger.info(f"Initializing HuggingFace Inference client with model: {HF_MODEL}")
    return InferenceClient(model=HF_MODEL, token=token)
|
|
|
|
def create_groq_client() -> Groq:
    """Construct a Groq API client.

    Reads the key from the GROQ_API_KEY environment variable.

    Raises:
        ValueError: If the environment variable is missing or empty.
    """
    key = os.getenv("GROQ_API_KEY")
    if not key:
        raise ValueError("GROQ_API_KEY environment variable not set")

    logger.info(f"Initializing Groq client with model: {GROQ_MODEL}")
    return Groq(api_key=key)
|
|
|
|
| |
| |
| |
|
|
|
|
def plan_question_claude(
    question: str,
    available_tools: Dict[str, Dict],
    file_paths: Optional[List[str]] = None,
) -> str:
    """Produce a numbered execution plan for *question* using Claude.

    Args:
        question: The question to plan for.
        available_tools: Tool registry mapping name -> metadata dict with
            at least "description" and "category" keys.
        file_paths: Optional file paths to surface in the prompt.

    Returns:
        The plan text generated by Claude.
    """
    anthropic_client = create_claude_client()

    # One bullet line per registered tool.
    tools_text = "\n".join(
        f"- {tool_name}: {meta['description']} (Category: {meta['category']})"
        for tool_name, meta in available_tools.items()
    )

    # Mention uploaded files, if any, so the plan can reference them.
    if file_paths:
        file_context = "\n\nAvailable files:\n" + "\n".join(
            f"- {path}" for path in file_paths
        )
    else:
        file_context = ""

    system_prompt = """You are a planning agent for answering complex questions.

Your task is to analyze the question and create a step-by-step execution plan.

Consider:
1. What information is needed to answer the question?
2. Which tools can provide that information?
3. In what order should tools be executed?
4. What parameters need to be extracted from the question?

Generate a concise plan with numbered steps."""

    user_prompt = f"""Question: {question}{file_context}

Available tools:
{tools_text}

Create an execution plan to answer this question. Format as numbered steps."""

    logger.info(f"[plan_question_claude] Calling Claude for planning")

    response = anthropic_client.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
    )

    plan = response.content[0].text
    logger.info(f"[plan_question_claude] Generated plan ({len(plan)} chars)")

    return plan
|
|
|
|
| |
| |
| |
|
|
|
|
def plan_question_gemini(
    question: str,
    available_tools: Dict[str, Dict],
    file_paths: Optional[List[str]] = None,
) -> str:
    """Produce a numbered execution plan for *question* using Gemini.

    Args:
        question: The question to plan for.
        available_tools: Tool registry mapping name -> metadata dict.
        file_paths: Optional file paths to surface in the prompt.

    Returns:
        The plan text generated by Gemini.
    """
    gemini_model = create_gemini_client()

    # One bullet line per registered tool.
    tools_text = "\n".join(
        f"- {tool_name}: {meta['description']} (Category: {meta['category']})"
        for tool_name, meta in available_tools.items()
    )

    # Mention uploaded files, if any, so the plan can reference them.
    if file_paths:
        file_context = "\n\nAvailable files:\n" + "\n".join(
            f"- {path}" for path in file_paths
        )
    else:
        file_context = ""

    # Gemini takes a single combined prompt (no separate system role here).
    prompt = f"""You are a planning agent for answering complex questions.

Your task is to analyze the question and create a step-by-step execution plan.

Consider:
1. What information is needed to answer the question?
2. Which tools can provide that information?
3. In what order should tools be executed?
4. What parameters need to be extracted from the question?

Generate a concise plan with numbered steps.

Question: {question}{file_context}

Available tools:
{tools_text}

Create an execution plan to answer this question. Format as numbered steps."""

    logger.info(f"[plan_question_gemini] Calling Gemini for planning")

    response = gemini_model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            temperature=TEMPERATURE, max_output_tokens=MAX_TOKENS
        ),
    )

    plan = response.text
    logger.info(f"[plan_question_gemini] Generated plan ({len(plan)} chars)")

    return plan
|
|
|
|
| |
| |
| |
|
|
|
|
def plan_question_hf(
    question: str,
    available_tools: Dict[str, Dict],
    file_paths: Optional[List[str]] = None,
) -> str:
    """Produce a numbered execution plan for *question* via the HuggingFace Inference API.

    Args:
        question: The question to plan for.
        available_tools: Tool registry mapping name -> metadata dict.
        file_paths: Optional file paths to surface in the prompt.

    Returns:
        The plan text generated by the HF-hosted model.
    """
    hf_client = create_hf_client()

    # One bullet line per registered tool.
    tools_text = "\n".join(
        f"- {tool_name}: {meta['description']} (Category: {meta['category']})"
        for tool_name, meta in available_tools.items()
    )

    # Mention uploaded files, if any, so the plan can reference them.
    if file_paths:
        file_context = "\n\nAvailable files:\n" + "\n".join(
            f"- {path}" for path in file_paths
        )
    else:
        file_context = ""

    system_prompt = """You are a planning agent for answering complex questions.

Your task is to analyze the question and create a step-by-step execution plan.

Consider:
1. What information is needed to answer the question?
2. Which tools can provide that information?
3. In what order should tools be executed?
4. What parameters need to be extracted from the question?

Generate a concise plan with numbered steps."""

    user_prompt = f"""Question: {question}{file_context}

Available tools:
{tools_text}

Create an execution plan to answer this question. Format as numbered steps."""

    logger.info(f"[plan_question_hf] Calling HuggingFace ({HF_MODEL}) for planning")

    response = hf_client.chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
    )

    plan = response.choices[0].message.content
    logger.info(f"[plan_question_hf] Generated plan ({len(plan)} chars)")

    return plan
|
|
|
|
| |
| |
| |
|
|
|
|
def plan_question_groq(
    question: str,
    available_tools: Dict[str, Dict],
    file_paths: Optional[List[str]] = None,
) -> str:
    """Produce a numbered execution plan for *question* using Groq.

    Args:
        question: The question to plan for.
        available_tools: Tool registry mapping name -> metadata dict.
        file_paths: Optional file paths to surface in the prompt.

    Returns:
        The plan text generated by the Groq-hosted model.
    """
    groq_client = create_groq_client()

    # One bullet line per registered tool.
    tools_text = "\n".join(
        f"- {tool_name}: {meta['description']} (Category: {meta['category']})"
        for tool_name, meta in available_tools.items()
    )

    # Mention uploaded files, if any, so the plan can reference them.
    if file_paths:
        file_context = "\n\nAvailable files:\n" + "\n".join(
            f"- {path}" for path in file_paths
        )
    else:
        file_context = ""

    system_prompt = """You are a planning agent for answering complex questions.

Your task is to analyze the question and create a step-by-step execution plan.

Consider:
1. What information is needed to answer the question?
2. Which tools can provide that information?
3. In what order should tools be executed?
4. What parameters need to be extracted from the question?

Generate a concise plan with numbered steps."""

    user_prompt = f"""Question: {question}{file_context}

Available tools:
{tools_text}

Create an execution plan to answer this question. Format as numbered steps."""

    logger.info(f"[plan_question_groq] Calling Groq ({GROQ_MODEL}) for planning")

    response = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
    )

    plan = response.choices[0].message.content
    logger.info(f"[plan_question_groq] Generated plan ({len(plan)} chars)")

    return plan
|
|
|
|
| |
| |
| |
|
|
|
|
def plan_question(
    question: str,
    available_tools: Dict[str, Dict],
    file_paths: Optional[List[str]] = None,
) -> str:
    """
    Analyze question and generate execution plan using LLM.

    Pattern: Try Gemini first (free tier), HuggingFace (free tier), Groq (free tier), then Claude (paid) if all fail.
    4-tier fallback ensures availability even with quota limits.
    Each provider call wrapped with retry logic (3 attempts with exponential backoff).

    Args:
        question: GAIA question text
        available_tools: Tool registry (name -> {description, category, parameters})
        file_paths: Optional list of file paths for file-based questions

    Returns:
        Execution plan as structured text

    Raises:
        Exception: If every provider in the fallback chain fails.
    """
    # Ordered fallback chain, flattened from a 4-deep try/except pyramid:
    # (log name, summary label, planner function, hint for the warning log).
    chain = [
        ("Gemini", "Gemini", plan_question_gemini, "trying HuggingFace fallback"),
        ("HuggingFace", "HF", plan_question_hf, "trying Groq fallback"),
        ("Groq", "Groq", plan_question_groq, "trying Claude fallback"),
        ("Claude", "Claude", plan_question_claude, None),
    ]

    failures = []
    for log_name, label, planner, next_hint in chain:
        try:
            # Bind `planner` as a default argument so the lambda captures
            # the current provider, not the loop variable.
            return retry_with_backoff(
                lambda planner=planner: planner(question, available_tools, file_paths)
            )
        except Exception as err:
            failures.append(f"{label}: {err}")
            if next_hint is not None:
                logger.warning(f"[plan_question] {log_name} failed: {err}, {next_hint}")

    # Every provider failed; log and raise with the full failure summary.
    summary = ", ".join(failures)
    logger.error(f"[plan_question] All LLMs failed. {summary}")
    raise Exception(f"Planning failed with all LLMs. {summary}")
|
|
|
|
| |
| |
| |
|
|
|
|
def select_tools_claude(
    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Select tools and parameters for *question* via Claude function calling.

    Args:
        question: GAIA question text.
        plan: Execution plan from the planning phase.
        available_tools: Tool registry mapping name -> metadata dict.

    Returns:
        List of dicts with "tool", "params" and "id" keys, one per
        tool_use content block Claude emitted.
    """
    anthropic_client = create_claude_client()

    # Anthropic tool schema: name/description plus a JSON-schema input spec.
    tool_schemas = [
        {
            "name": tool_name,
            "description": meta["description"],
            "input_schema": {
                "type": "object",
                "properties": meta.get("parameters", {}),
                "required": meta.get("required_params", []),
            },
        }
        for tool_name, meta in available_tools.items()
    ]

    system_prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.

Few-shot examples:
- "How many albums did The Beatles release?" → web_search(query="Beatles discography number of albums")
- "What is 25 * 37 + 100?" → calculator(expression="25 * 37 + 100")
- "Analyze the image at example.com/pic.jpg" → vision(image_url="example.com/pic.jpg")
- "What's in the uploaded Excel file?" → parse_file(file_path="<provided_path>")

Execute the plan step by step. Extract correct parameters from the question.

Plan:
{plan}"""

    user_prompt = f"""Question: {question}

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

    logger.info(
        f"[select_tools_claude] Calling Claude with function calling for {len(tool_schemas)} tools"
    )

    response = anthropic_client.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        tools=tool_schemas,
    )

    # Keep only tool_use content blocks; plain text blocks are ignored.
    tool_calls = [
        {"tool": block.name, "params": block.input, "id": block.id}
        for block in response.content
        if block.type == "tool_use"
    ]

    logger.info(f"[select_tools_claude] Claude selected {len(tool_calls)} tool(s)")

    return tool_calls
|
|
|
|
| |
| |
| |
|
|
|
|
def select_tools_gemini(
    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Select tools and parameters for *question* via Gemini function calling.

    Args:
        question: GAIA question text.
        plan: Execution plan from the planning phase.
        available_tools: Tool registry mapping name -> metadata dict.

    Returns:
        List of dicts with "tool", "params" and "id" keys, one per
        function call Gemini emitted. Ids are synthetic ("gemini_<n>").
    """
    gemini_model = create_gemini_client()

    def _param_schema(meta: Dict) -> genai.protos.Schema:
        # Gemini protos need explicit Schema objects; every parameter is
        # declared as a STRING carrying the registry's description text.
        properties = {
            pname: genai.protos.Schema(
                type=genai.protos.Type.STRING,
                description=pinfo.get("description", ""),
            )
            for pname, pinfo in meta.get("parameters", {}).items()
        }
        return genai.protos.Schema(
            type=genai.protos.Type.OBJECT,
            properties=properties,
            required=meta.get("required_params", []),
        )

    # One Tool wrapper per registry entry, each with a single declaration.
    tools = [
        genai.protos.Tool(
            function_declarations=[
                genai.protos.FunctionDeclaration(
                    name=tool_name,
                    description=meta["description"],
                    parameters=_param_schema(meta),
                )
            ]
        )
        for tool_name, meta in available_tools.items()
    ]

    prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.

Few-shot examples:
- "How many albums did The Beatles release?" → web_search(query="Beatles discography number of albums")
- "What is 25 * 37 + 100?" → calculator(expression="25 * 37 + 100")
- "Analyze the image at example.com/pic.jpg" → vision(image_url="example.com/pic.jpg")
- "What's in the uploaded Excel file?" → parse_file(file_path="<provided_path>")

Execute the plan step by step. Extract correct parameters from the question.

Plan:
{plan}

Question: {question}

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

    logger.info(
        f"[select_tools_gemini] Calling Gemini with function calling for {len(available_tools)} tools"
    )

    response = gemini_model.generate_content(
        prompt,
        tools=tools,
        generation_config=genai.types.GenerationConfig(
            temperature=TEMPERATURE, max_output_tokens=MAX_TOKENS
        ),
    )

    # Collect the function_call parts, then number them for synthetic ids.
    function_calls = [
        part.function_call
        for part in response.parts
        if hasattr(part, "function_call") and part.function_call
    ]
    tool_calls = [
        {"tool": fc.name, "params": dict(fc.args), "id": f"gemini_{idx}"}
        for idx, fc in enumerate(function_calls)
    ]

    logger.info(f"[select_tools_gemini] Gemini selected {len(tool_calls)} tool(s)")

    return tool_calls
|
|
|
|
| |
| |
| |
|
|
|
|
def select_tools_hf(
    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Use HuggingFace Inference API with function calling to select tools and extract parameters.

    Args:
        question: GAIA question text.
        plan: Execution plan from the planning phase.
        available_tools: Tool registry mapping name -> metadata dict.

    Returns:
        List of dicts with "tool", "params" and "id" keys, one per call
        the model requested (empty if it requested none).
    """
    import json  # hoisted: was re-imported inside the response loop

    client = create_hf_client()

    # Translate the registry into OpenAI-style function schemas.
    tools = []
    for name, info in available_tools.items():
        tool_schema = {
            "type": "function",
            "function": {
                "name": name,
                "description": info["description"],
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": info.get("required_params", []),
                },
            },
        }

        for param_name, param_info in info.get("parameters", {}).items():
            tool_schema["function"]["parameters"]["properties"][param_name] = {
                "type": param_info.get("type", "string"),
                "description": param_info.get("description", ""),
            }

        tools.append(tool_schema)

    system_prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.

Few-shot examples:
- "How many albums did The Beatles release?" → web_search(query="Beatles discography number of albums")
- "What is 25 * 37 + 100?" → calculator(expression="25 * 37 + 100")
- "Analyze the image at example.com/pic.jpg" → vision(image_url="example.com/pic.jpg")
- "What's in the uploaded Excel file?" → parse_file(file_path="<provided_path>")

Execute the plan step by step. Extract correct parameters from the question.

Plan:
{plan}"""

    user_prompt = f"""Question: {question}

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

    logger.info(
        f"[select_tools_hf] Calling HuggingFace with function calling for {len(tools)} tools"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = client.chat_completion(
        messages=messages, tools=tools, max_tokens=MAX_TOKENS, temperature=TEMPERATURE
    )

    # Hoist the repeated response.choices[0].message lookup, then collect
    # any tool calls the model requested (arguments arrive as a JSON string).
    message = response.choices[0].message
    tool_calls = []
    if getattr(message, "tool_calls", None):
        for tool_call in message.tool_calls:
            tool_calls.append(
                {
                    "tool": tool_call.function.name,
                    "params": json.loads(tool_call.function.arguments),
                    "id": tool_call.id,
                }
            )

    logger.info(f"[select_tools_hf] HuggingFace selected {len(tool_calls)} tool(s)")

    return tool_calls
|
|
|
|
| |
| |
| |
|
|
|
|
def select_tools_groq(
    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Use Groq with function calling to select tools and extract parameters.

    Args:
        question: GAIA question text.
        plan: Execution plan from the planning phase.
        available_tools: Tool registry mapping name -> metadata dict.

    Returns:
        List of dicts with "tool", "params" and "id" keys, one per call
        the model requested (empty if it requested none).
    """
    import json  # hoisted: was re-imported inside the response loop

    client = create_groq_client()

    # Translate the registry into OpenAI-style function schemas.
    tools = []
    for name, info in available_tools.items():
        tool_schema = {
            "type": "function",
            "function": {
                "name": name,
                "description": info["description"],
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": info.get("required_params", []),
                },
            },
        }

        for param_name, param_info in info.get("parameters", {}).items():
            tool_schema["function"]["parameters"]["properties"][param_name] = {
                "type": param_info.get("type", "string"),
                "description": param_info.get("description", ""),
            }

        tools.append(tool_schema)

    system_prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.

Few-shot examples:
- "How many albums did The Beatles release?" → web_search(query="Beatles discography number of albums")
- "What is 25 * 37 + 100?" → calculator(expression="25 * 37 + 100")
- "Analyze the image at example.com/pic.jpg" → vision(image_url="example.com/pic.jpg")
- "What's in the uploaded Excel file?" → parse_file(file_path="<provided_path>")

Execute the plan step by step. Extract correct parameters from the question.

Plan:
{plan}"""

    user_prompt = f"""Question: {question}

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

    logger.info(
        f"[select_tools_groq] Calling Groq with function calling for {len(tools)} tools"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=messages,
        tools=tools,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
    )

    # Hoist the repeated response.choices[0].message lookup, then collect
    # any tool calls the model requested (arguments arrive as a JSON string).
    message = response.choices[0].message
    tool_calls = []
    if getattr(message, "tool_calls", None):
        for tool_call in message.tool_calls:
            tool_calls.append(
                {
                    "tool": tool_call.function.name,
                    "params": json.loads(tool_call.function.arguments),
                    "id": tool_call.id,
                }
            )

    logger.info(f"[select_tools_groq] Groq selected {len(tool_calls)} tool(s)")

    return tool_calls
|
|
|
|
| |
| |
| |
|
|
|
|
def select_tools_with_function_calling(
    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """
    Use LLM function calling to dynamically select tools and extract parameters.

    Pattern: Try Gemini first (free tier), HuggingFace (free tier), Groq (free tier), then Claude (paid) if all fail.
    4-tier fallback ensures availability even with quota limits.
    Each provider call wrapped with retry logic (3 attempts with exponential backoff).

    Args:
        question: GAIA question text
        plan: Execution plan from planning phase
        available_tools: Tool registry

    Returns:
        List of tool calls with extracted parameters

    Raises:
        Exception: If every provider in the fallback chain fails.
    """
    # Ordered fallback chain, flattened from a 4-deep try/except pyramid:
    # (log name, summary label, selector function, hint for the warning log).
    chain = [
        ("Gemini", "Gemini", select_tools_gemini, "trying HuggingFace fallback"),
        ("HuggingFace", "HF", select_tools_hf, "trying Groq fallback"),
        ("Groq", "Groq", select_tools_groq, "trying Claude fallback"),
        ("Claude", "Claude", select_tools_claude, None),
    ]

    failures = []
    for log_name, label, selector, next_hint in chain:
        try:
            # Bind `selector` as a default argument so the lambda captures
            # the current provider, not the loop variable.
            return retry_with_backoff(
                lambda selector=selector: selector(question, plan, available_tools)
            )
        except Exception as err:
            failures.append(f"{label}: {err}")
            if next_hint is not None:
                logger.warning(f"[select_tools] {log_name} failed: {err}, {next_hint}")

    # Every provider failed; log and raise with the full failure summary.
    summary = ", ".join(failures)
    logger.error(f"[select_tools] All LLMs failed. {summary}")
    raise Exception(f"Tool selection failed with all LLMs. {summary}")
|
|
|
|
| |
| |
| |
|
|
|
|
def synthesize_answer_claude(question: str, evidence: List[str]) -> str:
    """Extract a concise factoid answer from *evidence* using Claude.

    Args:
        question: Original GAIA question.
        evidence: Evidence strings gathered from tool executions.

    Returns:
        The factoid answer text (whitespace-stripped).
    """
    anthropic_client = create_claude_client()

    # Number each evidence snippet so the model can reference them.
    evidence_text = "\n\n".join(
        f"Evidence {idx + 1}:\n{item}" for idx, item in enumerate(evidence)
    )

    system_prompt = """You are an answer synthesis agent for the GAIA benchmark.

Your task is to extract a factoid answer from the provided evidence.

CRITICAL - Answer format requirements:
1. Answers must be factoids: a number, a few words, or a comma-separated list
2. Be concise - no explanations, just the answer
3. If evidence conflicts, evaluate source credibility and recency
4. If evidence is insufficient, state "Unable to answer"

Examples of good factoid answers:
- "42"
- "Paris"
- "Albert Einstein"
- "red, blue, green"
- "1969-07-20"

Examples of bad answers (too verbose):
- "The answer is 42 because..."
- "Based on the evidence, it appears that..."
"""

    user_prompt = f"""Question: {question}

{evidence_text}

Extract the factoid answer from the evidence above. Return only the factoid, nothing else."""

    logger.info(f"[synthesize_answer_claude] Calling Claude for answer synthesis")

    # A factoid needs far fewer tokens than planning, hence the 256 cap.
    response = anthropic_client.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=256,
        temperature=TEMPERATURE,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
    )

    answer = response.content[0].text.strip()
    logger.info(f"[synthesize_answer_claude] Generated answer: {answer}")

    return answer
|
|
|
|
| |
| |
| |
|
|
|
|
def synthesize_answer_gemini(question: str, evidence: List[str]) -> str:
    """Extract a concise factoid answer from *evidence* using Gemini.

    Args:
        question: Original GAIA question.
        evidence: Evidence strings gathered from tool executions.

    Returns:
        The factoid answer text (whitespace-stripped).
    """
    gemini_model = create_gemini_client()

    # Number each evidence snippet so the model can reference them.
    evidence_text = "\n\n".join(
        f"Evidence {idx + 1}:\n{item}" for idx, item in enumerate(evidence)
    )

    # Gemini takes a single combined prompt (no separate system role here).
    prompt = f"""You are an answer synthesis agent for the GAIA benchmark.

Your task is to extract a factoid answer from the provided evidence.

CRITICAL - Answer format requirements:
1. Answers must be factoids: a number, a few words, or a comma-separated list
2. Be concise - no explanations, just the answer
3. If evidence conflicts, evaluate source credibility and recency
4. If evidence is insufficient, state "Unable to answer"

Examples of good factoid answers:
- "42"
- "Paris"
- "Albert Einstein"
- "red, blue, green"
- "1969-07-20"

Examples of bad answers (too verbose):
- "The answer is 42 because..."
- "Based on the evidence, it appears that..."

Question: {question}

{evidence_text}

Extract the factoid answer from the evidence above. Return only the factoid, nothing else."""

    logger.info(f"[synthesize_answer_gemini] Calling Gemini for answer synthesis")

    # A factoid needs far fewer tokens than planning, hence the 256 cap.
    response = gemini_model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            temperature=TEMPERATURE,
            max_output_tokens=256,
        ),
    )

    answer = response.text.strip()
    logger.info(f"[synthesize_answer_gemini] Generated answer: {answer}")

    return answer
|
|
|
|
| |
| |
| |
|
|
|
|
def synthesize_answer_hf(question: str, evidence: List[str]) -> str:
    """Extract a concise factoid answer from *evidence* via the HuggingFace Inference API.

    Args:
        question: Original GAIA question.
        evidence: Evidence strings gathered from tool executions.

    Returns:
        The factoid answer text (whitespace-stripped).
    """
    hf_client = create_hf_client()

    # Number each evidence snippet so the model can reference them.
    evidence_text = "\n\n".join(
        f"Evidence {idx + 1}:\n{item}" for idx, item in enumerate(evidence)
    )

    system_prompt = """You are an answer synthesis agent for the GAIA benchmark.

Your task is to extract a factoid answer from the provided evidence.

CRITICAL - Answer format requirements:
1. Answers must be factoids: a number, a few words, or a comma-separated list
2. Be concise - no explanations, just the answer
3. If evidence conflicts, evaluate source credibility and recency
4. If evidence is insufficient, state "Unable to answer"

Examples of good factoid answers:
- "42"
- "Paris"
- "Albert Einstein"
- "red, blue, green"
- "1969-07-20"

Examples of bad answers (too verbose):
- "The answer is 42 because..."
- "Based on the evidence, it appears that..."
"""

    user_prompt = f"""Question: {question}

{evidence_text}

Extract the factoid answer from the evidence above. Return only the factoid, nothing else."""

    logger.info(f"[synthesize_answer_hf] Calling HuggingFace for answer synthesis")

    # A factoid needs far fewer tokens than planning, hence the 256 cap.
    response = hf_client.chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=256,
        temperature=TEMPERATURE,
    )

    answer = response.choices[0].message.content.strip()
    logger.info(f"[synthesize_answer_hf] Generated answer: {answer}")

    return answer
|
|
|
|
| |
| |
| |
|
|
|
|
def synthesize_answer_groq(question: str, evidence: List[str]) -> str:
    """Extract a concise factoid answer from *evidence* using Groq.

    Args:
        question: Original GAIA question.
        evidence: Evidence strings gathered from tool executions.

    Returns:
        The factoid answer text (whitespace-stripped).
    """
    groq_client = create_groq_client()

    # Number each evidence snippet so the model can reference them.
    evidence_text = "\n\n".join(
        f"Evidence {idx + 1}:\n{item}" for idx, item in enumerate(evidence)
    )

    system_prompt = """You are an answer synthesis agent for the GAIA benchmark.

Your task is to extract a factoid answer from the provided evidence.

CRITICAL - Answer format requirements:
1. Answers must be factoids: a number, a few words, or a comma-separated list
2. Be concise - no explanations, just the answer
3. If evidence conflicts, evaluate source credibility and recency
4. If evidence is insufficient, state "Unable to answer"

Examples of good factoid answers:
- "42"
- "Paris"
- "Albert Einstein"
- "red, blue, green"
- "1969-07-20"

Examples of bad answers (too verbose):
- "The answer is 42 because..."
- "Based on the evidence, it appears that..."
"""

    user_prompt = f"""Question: {question}

{evidence_text}

Extract the factoid answer from the evidence above. Return only the factoid, nothing else."""

    logger.info(f"[synthesize_answer_groq] Calling Groq for answer synthesis")

    # A factoid needs far fewer tokens than planning, hence the 256 cap.
    response = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=256,
        temperature=TEMPERATURE,
    )

    answer = response.choices[0].message.content.strip()
    logger.info(f"[synthesize_answer_groq] Generated answer: {answer}")

    return answer
|
|
|
|
| |
| |
| |
|
|
|
|
def synthesize_answer(question: str, evidence: List[str]) -> str:
    """
    Synthesize factoid answer from collected evidence using LLM.

    Pattern: Try Gemini first (free tier), HuggingFace (free tier), Groq (free tier), then Claude (paid) if all fail.
    4-tier fallback ensures availability even with quota limits.
    Each provider call wrapped with retry logic (3 attempts with exponential backoff).

    Args:
        question: Original GAIA question
        evidence: List of evidence strings from tool executions

    Returns:
        Factoid answer string

    Raises:
        Exception: If every provider in the fallback chain fails.
    """
    # Ordered fallback chain, flattened from a 4-deep try/except pyramid:
    # (log name, summary label, synthesizer function, hint for the warning log).
    chain = [
        ("Gemini", "Gemini", synthesize_answer_gemini, "trying HuggingFace fallback"),
        ("HuggingFace", "HF", synthesize_answer_hf, "trying Groq fallback"),
        ("Groq", "Groq", synthesize_answer_groq, "trying Claude fallback"),
        ("Claude", "Claude", synthesize_answer_claude, None),
    ]

    failures = []
    for log_name, label, synthesizer, next_hint in chain:
        try:
            # Bind `synthesizer` as a default argument so the lambda captures
            # the current provider, not the loop variable.
            return retry_with_backoff(
                lambda synthesizer=synthesizer: synthesizer(question, evidence)
            )
        except Exception as err:
            failures.append(f"{label}: {err}")
            if next_hint is not None:
                logger.warning(
                    f"[synthesize_answer] {log_name} failed: {err}, {next_hint}"
                )

    # Every provider failed; log and raise with the full failure summary.
    summary = ", ".join(failures)
    logger.error(f"[synthesize_answer] All LLMs failed. {summary}")
    raise Exception(f"Answer synthesis failed with all LLMs. {summary}")
|
|
|
|
| |
| |
| |
|
|
|
|
def resolve_conflicts(evidence: List[str]) -> Dict[str, Any]:
    """
    Detect and resolve conflicts in evidence using LLM reasoning.

    Optional function for advanced conflict handling.
    Currently integrated into synthesize_answer().
    Uses same Gemini primary, Claude fallback pattern.

    The model is asked to reply in JSON; the reply is now parsed so the
    returned "has_conflicts"/"conflicts" reflect the model's analysis
    (previously they were hardcoded to False/[] and the raw text dumped
    into "resolution"). If parsing fails, the raw reply is kept as the
    "resolution" with no conflicts flagged, matching the old behavior.

    Args:
        evidence: List of evidence strings that may conflict

    Returns:
        Dictionary with keys "has_conflicts" (bool), "conflicts"
        (list) and "resolution" (str).
    """
    import json
    import re

    def _parse_reply(raw: str) -> Dict[str, Any]:
        # Models often wrap JSON in markdown code fences; strip them first.
        text = raw.strip()
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
        try:
            parsed = json.loads(text)
            return {
                "has_conflicts": bool(parsed.get("has_conflicts", False)),
                "conflicts": list(parsed.get("conflicts", [])),
                "resolution": str(parsed.get("resolution", "")),
            }
        except (ValueError, AttributeError, TypeError):
            # Not valid JSON (or not an object): fall back to raw text.
            return {"has_conflicts": False, "conflicts": [], "resolution": raw}

    # Number each evidence snippet so the model can reference them.
    evidence_text = "\n\n".join(
        [f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)]
    )

    try:
        model = create_gemini_client()

        prompt = f"""You are a conflict detection agent.

Analyze the provided evidence and identify any contradictions or conflicts.

Evaluate:
1. Are there contradictory facts?
2. Which sources are more credible?
3. Which information is more recent?
4. How should conflicts be resolved?

Analyze this evidence for conflicts:

{evidence_text}

Respond in JSON format:
{{
    "has_conflicts": true/false,
    "conflicts": ["description of conflict 1", ...],
    "resolution": "recommended resolution strategy"
}}"""

        logger.info(f"[resolve_conflicts] Analyzing with Gemini")

        response = model.generate_content(prompt)

        return _parse_reply(response.text)

    except Exception as gemini_error:
        logger.warning(
            f"[resolve_conflicts] Gemini failed: {gemini_error}, trying Claude"
        )

        client = create_claude_client()

        system_prompt = """You are a conflict detection agent.

Analyze the provided evidence and identify any contradictions or conflicts.

Evaluate:
1. Are there contradictory facts?
2. Which sources are more credible?
3. Which information is more recent?
4. How should conflicts be resolved?"""

        user_prompt = f"""Analyze this evidence for conflicts:

{evidence_text}

Respond in JSON format:
{{
    "has_conflicts": true/false,
    "conflicts": ["description of conflict 1", ...],
    "resolution": "recommended resolution strategy"
}}"""

        response = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            system=system_prompt,
            messages=[{"role": "user", "content": user_prompt}],
        )

        return _parse_reply(response.content[0].text)
|
|