# Spaces: gimjungwook / getitdone-api (Running)
# File size: 10,757 Bytes
# 4dc70fb

#!/usr/bin/env python3
"""
Test script for Gemini Extended Thinking + Tool Calling integration.
Tests that thinking and tool calls work together correctly.
"""
import asyncio
import os
import sys

# Add src to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from dotenv import load_dotenv
load_dotenv()

from opencode_api.provider.gemini import GeminiProvider
from opencode_api.provider.provider import Message


# Tool declarations handed to the provider in the tool-calling tests.
# Every tool here takes exactly one required string argument, so each
# declaration is generated from a row of
# (tool name, tool description, argument name, argument description).
TEST_TOOLS = [
    {
        "name": tool_name,
        "description": tool_description,
        "parameters": {
            "type": "object",
            "properties": {
                argument: {
                    "type": "string",
                    "description": argument_description,
                },
            },
            "required": [argument],
        },
    }
    for tool_name, tool_description, argument, argument_description in (
        (
            "web_search",
            "Search the web for current information. Use this when you need to find up-to-date information about any topic.",
            "query",
            "The search query",
        ),
        (
            "calculator",
            "Perform mathematical calculations. Use this for any math operations.",
            "expression",
            "The mathematical expression to evaluate",
        ),
    )
]


async def test_thinking_only():
    """Check that the model emits reasoning when no tools are supplied.

    Streams a logic-puzzle prompt through ``GeminiProvider.stream`` with
    ``tools=None`` on ``gemini-2.5-pro``, echoing every chunk as it arrives
    and printing a summary afterwards.

    Returns:
        bool: True when at least one reasoning chunk was received and the
        stream produced no error chunk; False otherwise.
    """
    print("\n" + "="*60)
    print("TEST 1: Thinking Only (No Tools)")
    print("="*60)

    provider = GeminiProvider(api_key=os.getenv("GOOGLE_API_KEY"))

    messages = [
        Message(role="user", content="3개의 상자가 있어. 하나에는 금, 하나에는 은, 하나에는 돌이 있어. 상자1: '여기에 금이 없다', 상자2: '여기에 금이 있다', 상자3: '금은 상자1에 있다'. 정확히 하나만 진실이야. 금은 어디?")
    ]

    reasoning_chunks = []
    text_chunks = []
    tool_calls = []
    usage = None
    stream_failed = False

    async for chunk in provider.stream(
        model_id="gemini-2.5-pro",
        messages=messages,
        tools=None,  # No tools
        system="You are a helpful assistant. Think step by step."
    ):
        if chunk.type == "reasoning":
            reasoning_chunks.append(chunk.text)
            # Long reasoning is truncated to a 100-character preview.
            preview = f"{chunk.text[:100]}..." if len(chunk.text) > 100 else chunk.text
            print(f"[REASONING] {preview}")
        elif chunk.type == "text":
            text_chunks.append(chunk.text)
            print(f"[TEXT] {chunk.text}", end="", flush=True)
        elif chunk.type == "tool_call":
            tool_calls.append(chunk.tool_call)
            print(f"\n[TOOL_CALL] {chunk.tool_call.name}({chunk.tool_call.arguments})")
        elif chunk.type == "done":
            usage = chunk.usage
            print(f"\n[DONE] stop_reason={chunk.stop_reason}")
        elif chunk.type == "error":
            # Remember the failure so reasoning received before the error
            # cannot turn a broken stream into a false PASS.
            stream_failed = True
            print(f"\n[ERROR] {chunk.error}")

    print("\n--- Summary ---")
    print(f"Reasoning chunks: {len(reasoning_chunks)}")
    print(f"Text chunks: {len(text_chunks)}")
    print(f"Tool calls: {len(tool_calls)}")
    if usage:
        print(f"Usage: input={usage.get('input_tokens', 0)}, output={usage.get('output_tokens', 0)}, thinking={usage.get('thinking_tokens', 0)}")

    return bool(reasoning_chunks) and not stream_failed


async def test_tool_calling():
    """Check that the model issues a tool call when asked to search the web.

    Streams a prompt that explicitly requests a web search through
    ``GeminiProvider.stream`` on ``gemini-2.5-pro`` with ``TEST_TOOLS``
    available, echoing every chunk and printing a summary afterwards.

    Returns:
        bool: True when at least one tool call was received and the stream
        produced no error chunk; False otherwise.
    """
    print("\n" + "="*60)
    print("TEST 2: Tool Calling (Should trigger web_search)")
    print("="*60)

    provider = GeminiProvider(api_key=os.getenv("GOOGLE_API_KEY"))

    messages = [
        Message(role="user", content="2024년 노벨 물리학상 수상자가 누구야? 웹 검색해서 알려줘.")
    ]

    reasoning_chunks = []
    text_chunks = []
    tool_calls = []
    usage = None
    stream_failed = False

    async for chunk in provider.stream(
        model_id="gemini-2.5-pro",
        messages=messages,
        tools=TEST_TOOLS,
        system="You are a helpful assistant. Use tools when needed."
    ):
        if chunk.type == "reasoning":
            reasoning_chunks.append(chunk.text)
            # Long reasoning is truncated to a 100-character preview.
            preview = f"{chunk.text[:100]}..." if len(chunk.text) > 100 else chunk.text
            print(f"[REASONING] {preview}")
        elif chunk.type == "text":
            text_chunks.append(chunk.text)
            print(f"[TEXT] {chunk.text}", end="", flush=True)
        elif chunk.type == "tool_call":
            tool_calls.append(chunk.tool_call)
            print(f"\n[TOOL_CALL] {chunk.tool_call.name}({chunk.tool_call.arguments})")
        elif chunk.type == "done":
            usage = chunk.usage
            print(f"\n[DONE] stop_reason={chunk.stop_reason}")
        elif chunk.type == "error":
            # Remember the failure so a tool call received before the error
            # cannot turn a broken stream into a false PASS.
            stream_failed = True
            print(f"\n[ERROR] {chunk.error}")

    print("\n--- Summary ---")
    print(f"Reasoning chunks: {len(reasoning_chunks)}")
    print(f"Text chunks: {len(text_chunks)}")
    print(f"Tool calls: {len(tool_calls)}")
    for tc in tool_calls:
        print(f"  - {tc.name}: {tc.arguments}")
    if usage:
        print(f"Usage: input={usage.get('input_tokens', 0)}, output={usage.get('output_tokens', 0)}, thinking={usage.get('thinking_tokens', 0)}")

    return bool(tool_calls) and not stream_failed


async def test_thinking_with_tools():
    """Check that tool calling still works when the model is asked to reason.

    Streams an arithmetic prompt that asks for the calculator tool through
    ``GeminiProvider.stream`` on ``gemini-2.5-pro`` with ``TEST_TOOLS``
    available, echoing every chunk and printing a summary afterwards.

    Returns:
        bool: True when at least one tool call was received and the stream
        produced no error chunk; False otherwise. Reasoning chunks are
        reported but not required, because the model might not always think.
    """
    print("\n" + "="*60)
    print("TEST 3: Thinking + Tool Calling Together")
    print("="*60)

    provider = GeminiProvider(api_key=os.getenv("GOOGLE_API_KEY"))

    # Complex question that requires both thinking and tool use
    messages = [
        Message(role="user", content="복잡한 수학 문제야: (17 * 23) + (45 / 9) - 12^2 를 계산해줘. 계산기 도구를 사용해서 정확한 답을 구해줘.")
    ]

    reasoning_chunks = []
    text_chunks = []
    tool_calls = []
    usage = None
    stream_failed = False

    async for chunk in provider.stream(
        model_id="gemini-2.5-pro",
        messages=messages,
        tools=TEST_TOOLS,
        system="You are a helpful assistant. Use the calculator tool for mathematical operations. Think through the problem step by step."
    ):
        if chunk.type == "reasoning":
            reasoning_chunks.append(chunk.text)
            # Long reasoning is truncated to a 100-character preview.
            preview = f"{chunk.text[:100]}..." if len(chunk.text) > 100 else chunk.text
            print(f"[REASONING] {preview}")
        elif chunk.type == "text":
            text_chunks.append(chunk.text)
            print(f"[TEXT] {chunk.text}", end="", flush=True)
        elif chunk.type == "tool_call":
            tool_calls.append(chunk.tool_call)
            print(f"\n[TOOL_CALL] {chunk.tool_call.name}({chunk.tool_call.arguments})")
        elif chunk.type == "done":
            usage = chunk.usage
            print(f"\n[DONE] stop_reason={chunk.stop_reason}")
        elif chunk.type == "error":
            # Remember the failure so a tool call received before the error
            # cannot turn a broken stream into a false PASS.
            stream_failed = True
            print(f"\n[ERROR] {chunk.error}")

    print("\n--- Summary ---")
    print(f"Reasoning chunks: {len(reasoning_chunks)}")
    print(f"Text chunks: {len(text_chunks)}")
    print(f"Tool calls: {len(tool_calls)}")
    for tc in tool_calls:
        print(f"  - {tc.name}: {tc.arguments}")
    if usage:
        print(f"Usage: input={usage.get('input_tokens', 0)}, output={usage.get('output_tokens', 0)}, thinking={usage.get('thinking_tokens', 0)}")

    # Success if we got both reasoning and tool calls, OR just tool calls
    # (model might not always think) -- but never if the stream errored.
    return bool(tool_calls) and not stream_failed


async def test_flash_model():
    """Check that the Flash model completes a request with tools available.

    Streams a prompt that asks for a web search through
    ``GeminiProvider.stream`` on ``gemini-2.5-flash`` with ``TEST_TOOLS``
    available, echoing every chunk and printing a summary afterwards.

    Returns:
        bool: True when the stream finished without an error chunk; False
        otherwise. Neither reasoning nor tool calls are required, because
        Flash might not always use tools.
    """
    print("\n" + "="*60)
    print("TEST 4: Gemini 2.5 Flash with Thinking + Tools")
    print("="*60)

    provider = GeminiProvider(api_key=os.getenv("GOOGLE_API_KEY"))

    messages = [
        Message(role="user", content="오늘 서울 날씨 어때? 웹에서 검색해줘.")
    ]

    reasoning_chunks = []
    text_chunks = []
    tool_calls = []
    usage = None
    stream_failed = False

    async for chunk in provider.stream(
        model_id="gemini-2.5-flash",
        messages=messages,
        tools=TEST_TOOLS,
        system="You are a helpful assistant. Use tools when needed."
    ):
        if chunk.type == "reasoning":
            reasoning_chunks.append(chunk.text)
            # Long reasoning is truncated to a 100-character preview.
            preview = f"{chunk.text[:100]}..." if len(chunk.text) > 100 else chunk.text
            print(f"[REASONING] {preview}")
        elif chunk.type == "text":
            text_chunks.append(chunk.text)
            print(f"[TEXT] {chunk.text}", end="", flush=True)
        elif chunk.type == "tool_call":
            tool_calls.append(chunk.tool_call)
            print(f"\n[TOOL_CALL] {chunk.tool_call.name}({chunk.tool_call.arguments})")
        elif chunk.type == "done":
            usage = chunk.usage
            print(f"\n[DONE] stop_reason={chunk.stop_reason}")
        elif chunk.type == "error":
            # Without this flag the test could never fail on a reported
            # stream error, since no particular output is required.
            stream_failed = True
            print(f"\n[ERROR] {chunk.error}")

    print("\n--- Summary ---")
    print(f"Reasoning chunks: {len(reasoning_chunks)}")
    print(f"Text chunks: {len(text_chunks)}")
    print(f"Tool calls: {len(tool_calls)}")
    if usage:
        print(f"Usage: input={usage.get('input_tokens', 0)}, output={usage.get('output_tokens', 0)}, thinking={usage.get('thinking_tokens', 0)}")

    # Flash might not always use tools, so only a stream error fails the test.
    return not stream_failed


async def main():
    """Run every integration test in order and print a PASS/FAIL summary.

    Each test runs inside its own ``try`` so that one test raising does not
    prevent the remaining tests from running; a raising test is recorded as
    failed.

    Returns:
        bool: True when every test passed; False when any test failed or
        raised, or when ``GOOGLE_API_KEY`` is not set.
    """
    print("="*60)
    print("Gemini Extended Thinking + Tool Calling Integration Test")
    print("="*60)

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        print("ERROR: GOOGLE_API_KEY not set in environment")
        return False

    # NOTE(review): this echoes the first 10 and last 4 characters of the
    # key; consider masking more if the output ends up in shared logs.
    print(f"API Key: {api_key[:10]}...{api_key[-4:]}")

    # Listed in execution order; the 1-based position is the test number
    # shown in failure messages.
    test_cases = (
        ("thinking_only", test_thinking_only),
        ("tool_calling", test_tool_calling),
        ("thinking_with_tools", test_thinking_with_tools),
        ("flash_model", test_flash_model),
    )

    results = {}

    # Run tests
    for number, (test_name, run_test) in enumerate(test_cases, start=1):
        try:
            results[test_name] = await run_test()
        except Exception as e:
            # Top-level boundary: report the crash and keep going.
            print(f"TEST {number} FAILED: {e}")
            results[test_name] = False

    # Final summary
    print("\n" + "="*60)
    print("FINAL RESULTS")
    print("="*60)
    for test_name, passed in results.items():
        status = "PASS" if passed else "FAIL"
        print(f"  {test_name}: {status}")

    all_passed = all(results.values())
    print(f"\nOverall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}")
    return all_passed


if __name__ == "__main__":
    # Propagate the outcome as the process exit status so shells and CI
    # can detect a failed run.
    sys.exit(0 if asyncio.run(main()) else 1)