Spaces:

AishaSurve
/

codebase-agent

Running

File size: 2,787 Bytes

8e72e1f

"""
Test-generation agent.

A minimal tool-calling loop (no LangGraph). The LLM is given two tools
(search_code, get_definition) and asked to generate pytest tests for a target
function. It decides which tools to call to gather the function's real source
and its dependencies, then writes the tests grounded in that actual code.

This is the "agent" capability: the model plans and acts via tools, rather than
answering in a single shot.
"""
import json
import os

from dotenv import load_dotenv
from openai import OpenAI

from src.agent.tools import TOOL_SCHEMAS

# Load variables from a local .env file into the process environment at import
# time; TestAgent.__init__ reads OPENAI_API_KEY from the environment.
load_dotenv()

# System message placed first in every generate_tests conversation. It names
# the two tools the model may call and asks for a single pytest code block as
# the final answer.
SYSTEM_PROMPT = """You are a senior Python engineer that writes pytest unit tests.

You have tools to explore a real codebase:
- search_code(query): find relevant code
- get_definition(name): fetch the full source of a function/class

Workflow:
1. Use get_definition (and search_code if needed) to read the ACTUAL source of
   the target function and anything it depends on. Never guess at the code.
2. Then write focused pytest tests covering the main behavior and edge cases.

Return ONLY the final pytest code in a single Python code block. Base the tests
strictly on the real code you retrieved.
"""

# Upper bound on the number of chat-completion requests generate_tests makes
# for one target before it gives up and returns its fallback message.
MAX_TOOL_ROUNDS = 5


class TestAgent:
    """Tool-calling agent that asks an LLM to write pytest tests for a target.

    The model is offered the tools described by ``TOOL_SCHEMAS``. Each tool
    call it requests is executed through ``tools.dispatch`` and the result is
    fed back, until the model replies without requesting a tool or
    ``MAX_TOOL_ROUNDS`` requests have been made.
    """

    # The class name starts with "Test", so pytest would try to collect it (and
    # warn about its __init__) whenever it is imported into a test module.
    # This is a production class, not a test case: opt out of collection.
    __test__ = False

    def __init__(self, tools, model="gpt-4.1-mini"):
        """Store the tool backend and create the OpenAI client.

        Args:
            tools: Object exposing ``dispatch(name, args)`` (a CodeTools
                instance) used to execute the model's tool calls.
            model: Name of the chat-completions model to query.
        """
        self.tools = tools  # a CodeTools instance
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.model = model

    @staticmethod
    def _parse_tool_args(raw):
        """Decode the JSON argument string of one tool call.

        The argument string is written by the model and is not guaranteed to
        be valid JSON, so failures are reported instead of raised.

        Args:
            raw: The ``function.arguments`` string of a tool call.

        Returns:
            A ``(args, error)`` pair. On success ``args`` is a dict and
            ``error`` is None; on failure ``args`` is None and ``error`` is a
            short message describing the problem.
        """
        if not raw:
            # An empty argument string means "no arguments".
            return {}, None
        try:
            args = json.loads(raw)
        except json.JSONDecodeError as exc:
            return None, f"Invalid JSON in tool arguments: {exc}"
        if not isinstance(args, dict):
            return None, "Tool arguments must be a JSON object."
        return args, None

    def generate_tests(self, target):
        """Generate pytest tests for ``target`` via a bounded tool-calling loop.

        Args:
            target: A function/class name, e.g. 'create_access_token'.

        Returns:
            The content of the first model reply that requests no tool calls
            (returned as-is), or a fixed fallback message if the model is
            still requesting tools after ``MAX_TOOL_ROUNDS`` requests.
        """
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Generate pytest tests for `{target}`."},
        ]

        for _ in range(MAX_TOOL_ROUNDS):
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                tools=TOOL_SCHEMAS,
                temperature=0,
            )
            msg = response.choices[0].message

            # No tool calls -> the model produced its final answer (the tests).
            if not msg.tool_calls:
                return msg.content

            # Otherwise, run each requested tool and feed results back.
            messages.append(msg)
            for call in msg.tool_calls:
                args, error = self._parse_tool_args(call.function.arguments)
                if error is None:
                    result = self.tools.dispatch(call.function.name, args)
                else:
                    # Every tool_call_id still needs a reply, so report the
                    # problem as the tool result and let the model retry
                    # rather than aborting the whole run.
                    result = {"error": error}
                messages.append({
                    "role": "tool",
                    "tool_call_id": call.id,
                    "content": json.dumps(result)[:6000],  # keep tool payload bounded
                })

        # Safety net if it never stopped calling tools.
        return "Could not finish generating tests within the tool-call limit."