Spaces:

Navigam
/

jira-to-code

Sleeping

File size: 14,908 Bytes

# inference.py — ReAct Agent for Jira-to-Code Environment
#
# Architecture:
#   Phase 1: Episodic Memory — persistent messages[] across the episode
#   Phase 2: ReAct Pattern — "thought" key forces reasoning before action
#   Phase 3: Robust Parsing — JSON extraction with markdown-fence stripping
#   Phase 4: Self-Correction — low rewards (<= 0.01), errors, or parse failures inject corrective prompts
#   Phase 5: Multi-Task Loop — evaluates the selected tasks in one run (default: one random easy, medium, and hard task)

import argparse
import json
import os
import re
import textwrap
import time
from typing import List, Optional

from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

# Our environment for local/direct testing
from server.env import JiraToCodeEnv
from src.jira_to_code.models import JiraCodeAction

# --- HACKATHON MANDATORY CONFIGURATION ---
# Each value is read from the environment; an unset or empty variable falls
# back to the default on the right-hand side of `or`.
API_BASE_URL = os.environ.get("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.environ.get("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
# HF_TOKEN wins over API_KEY; the result is None when neither is set.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")

BENCHMARK = "jira-to-code"
# There is no global MAX_STEPS: the per-episode step budget is derived from
# the task name inside run_agent_episode.
SUCCESS_SCORE_THRESHOLD = 0.9  # Account for step penalties
ALL_TASKS = list(JiraToCodeEnv.TASKS.keys())
MAX_HISTORY_MESSAGES = 30  # Context-window safety: trim if exceeded
MAX_RETRIES = 5            # Rate limit retry attempts
RETRY_BASE_DELAY = 2       # Base delay in seconds for exponential backoff

# --- SYSTEM PROMPT (ReAct + Reward-Aware) ---
# Used as the "system" message that opens every episode's conversation history.
# The text is dedented and stripped once at import time. It tells the model to
# answer with a single JSON object whose keys match the "JSON Schema" section;
# the "thought" key is removed again by parse_action before the action is built.
# NOTE(review): rule 6 speaks of a per-step "penalty" while the Reward Structure
# section lists a 0.01 minimum reward per step — confirm which wording matches
# the environment's actual reward function.
SYSTEM_PROMPT = textwrap.dedent("""\
You are an expert software engineer resolving Jira tickets.
You operate in a sandboxed workspace. You can read files, write code, list files, run tests, and submit your solution.

## Rules
1. ALWAYS respond with ONLY a valid JSON object. No markdown fences, no explanations outside JSON.
2. You MUST include a "thought" key FIRST to reason about your plan before acting.
3. Work step-by-step: list files, read the code, understand the bug/requirement, write a fix, run tests, then submit.
4. If tests fail, carefully read the traceback and fix your code before re-submitting.
5. Only use "submit" when you are confident all tests will pass.
6. Be efficient — each step has a small penalty. Aim to solve in the fewest steps possible.
7. Read the test file to understand exactly what is expected before writing code.

## Valid action_types
- "list_files" — List all files in the workspace (file_path and content should be null)
- "read_file" — Read a file's contents (requires file_path, content should be null)
- "write_file" — Write/overwrite a file (requires file_path and content)
- "run_tests" — Run pytest on the workspace (file_path and content should be null)
- "submit" — Final submission, runs tests and ends the episode (file_path and content should be null)

## Reward Structure
- list_files / read_file: 0.01 (initial exploration)
- write_file: +0.05 (reward for taking action)
- run_tests (all pass): +0.5 | run_tests (partial): proportional | run_tests (crash): 0.01
- submit (all pass): +1.0 | submit (partial): proportional
- Every step: 0.01 minimum reward (be efficient!)

## JSON Schema
{
  "thought": "Your reasoning about what to do next and why",
  "action_type": "one of: list_files, read_file, write_file, run_tests, submit",
  "file_path": "string or null",
  "content": "string or null"
}

## Strategy Guide
1. First, list_files to see the workspace structure.
2. Read the test file to understand the exact expected behavior.
3. Read the source file to understand the current (buggy/incomplete) code.
4. Write the fix/implementation.
5. Run tests to verify.
6. If tests pass, submit. If not, read the error, fix, and retry.
""").strip()


# --- MANDATORY LOGGING FUNCTIONS ---
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens an episode's log.

    Args:
        task: Name of the task about to be run.
        env: Benchmark / environment identifier.
        model: Name of the model used for inference.
    """
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` record, guaranteed to occupy a single stdout line.

    Line feeds inside ``action`` or ``error`` are escaped as a literal ``\\n``
    and carriage returns are dropped, so multi-line API errors or tracebacks
    cannot split the record. The escaping is idempotent: text a caller has
    already escaped is emitted unchanged.

    Args:
        step: 1-based step index within the episode.
        action: Textual description of the action taken.
        reward: Reward for this step, printed with two decimals.
        done: Whether the episode ended on this step (printed lowercase).
        error: Error text, or a falsy value which is printed as ``null``.
    """
    def _one_line(text: str) -> str:
        # Same escaping the episode loop applies to serialized actions.
        return text.replace('\n', '\\n').replace('\r', '')

    error_val = _one_line(str(error)) if error else "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={_one_line(str(action))} reward={reward:.2f} "
        f"done={done_val} error={error_val}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes an episode's log.

    Args:
        success: Whether the episode counted as solved (printed lowercase).
        steps: Number of steps taken.
        score: Final episode score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals.
    """
    reward_fields = [format(value, ".2f") for value in rewards]
    success_field = str(success).lower()
    score_field = format(score, ".3f")
    record = (
        f"[END] success={success_field} steps={steps} "
        f"score={score_field} rewards={','.join(reward_fields)}"
    )
    print(record, flush=True)


# --- PHASE 3: ROBUST JSON PARSING ---
def extract_json(raw_text: str) -> dict:
    """
    Extract a JSON object from LLM output, handling:
    - Markdown code fences (```json ... ```)
    - Leading/trailing whitespace and text
    - Nested braces and braces inside strings (delegated to the json decoder)
    - Raw newlines/tabs inside string values (decoder runs with strict=False)
    - Brace groups that are not valid JSON: scanning continues with the
      next '{' instead of giving up on the first one

    Args:
        raw_text: The model's raw reply.

    Returns:
        The first JSON object found, always a ``dict``. A top-level value of
        any other type (list, string, number) is not returned as-is; the
        text is searched for an embedded object instead.

    Raises:
        ValueError: If the text contains no decodable JSON object.
    """
    cleaned = raw_text.strip()
    cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned)
    cleaned = re.sub(r'\s*```\s*$', '', cleaned)
    cleaned = cleaned.strip()

    # strict=False accepts control characters (e.g. literal newlines) inside
    # strings, which models frequently emit when "content" holds source code.
    decoder = json.JSONDecoder(strict=False)

    # Try direct parse first
    try:
        parsed = decoder.decode(cleaned)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fallback: attempt to decode an object starting at each '{' in turn.
    start = cleaned.find('{')
    if start == -1:
        raise ValueError("No JSON object found in response")

    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = cleaned.find('{', start + 1)

    raise ValueError("No valid JSON object found in response")


def parse_action(raw_text: str) -> JiraCodeAction:
    """Turn a raw LLM reply into a JiraCodeAction.

    The JSON object is pulled out with extract_json, the reasoning-only
    "thought" entry is discarded, and the remaining keys are passed to the
    JiraCodeAction constructor as keyword arguments.
    """
    payload = extract_json(raw_text)
    # "thought" exists only to make the model reason; it is not an action field.
    payload.pop("thought", None)
    return JiraCodeAction(**payload)


# --- PHASE 1 & 2: BUILD OBSERVATION MESSAGE ---
def build_observation_message(step: int, obs, reward: float) -> str:
    """Render an environment observation as the text of a user message.

    The message always carries the step header, the ticket, the workspace
    file list, the reward and a closing instruction. File content is added
    whenever it is not None (an empty file is still shown); test output and
    error are added only when truthy.
    """
    header = f"--- Step {step} Observation ---"
    ticket_line = f"Ticket: {obs.jira_ticket}"
    listing = ', '.join(obs.file_tree) if obs.file_tree else 'None'
    lines = [header, ticket_line, f"Files in workspace: {listing}"]

    file_content = obs.current_file_content
    if file_content is not None:
        lines.append(f"File Content:\n```\n{file_content}\n```")

    test_output = obs.test_output
    if test_output:
        lines.append(f"Test Output:\n```\n{test_output}\n```")

    error_text = obs.error
    if error_text:
        lines.append(f"Error: {error_text}")

    lines += [
        f"Reward: {reward:.2f}",
        "Respond with your next action as JSON.",
    ]
    return "\n".join(lines)


def trim_history(messages: list, max_messages: int = MAX_HISTORY_MESSAGES) -> None:
    """Trim oldest non-system messages if history exceeds max to avoid context overflow.

    The list is modified in place. Index 0 (the system prompt) is always
    kept, so a ``max_messages`` below 1 is treated as 1 instead of raising
    IndexError once only the system prompt is left.

    Args:
        messages: Conversation history; element 0 is the system prompt.
        max_messages: Maximum number of messages to retain.
    """
    keep = max(max_messages, 1)
    overflow = len(messages) - keep
    if overflow > 0:
        # One slice deletion removes the oldest entries right after the
        # system prompt, instead of shifting the list once per pop(1).
        del messages[1:1 + overflow]


# --- MAIN AGENT LOOP FOR ONE TASK ---
def _single_line(text: Optional[str]) -> Optional[str]:
    """Escape line breaks so ``text`` fits on one log line; falsy values pass through."""
    if not text:
        return text
    return str(text).replace('\n', '\\n').replace('\r', '')


def run_agent_episode(client: OpenAI, task_name: str) -> tuple:
    """
    Run a full agent episode for one task.

    Sets the JIRA_TASK_LEVEL environment variable to ``task_name`` before
    constructing the environment (presumably how JiraToCodeEnv selects the
    task — confirm in server.env), then alternates LLM calls and
    environment steps until the episode is done or the step budget is used
    up (10 steps when "easy" is in the task name, 20 otherwise).

    A [START] record is logged up front, one [STEP] record per step, and an
    [END] record is always logged on the way out, even if the episode or
    ``env.close()`` raises.

    Args:
        client: OpenAI-compatible client used for chat completions.
        task_name: Name of the task to run.

    Returns: (score, steps_taken, rewards, success)
    """
    os.environ["JIRA_TASK_LEVEL"] = task_name
    env = JiraToCodeEnv()

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

    try:
        obs = env.reset()

        task_max_steps = 10 if "easy" in task_name else 20

        # Phase 1: Episodic memory — persistent conversation history
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_observation_message(0, obs, 0.0)},
        ]

        for step in range(1, task_max_steps + 1):
            trim_history(messages)

            # Call the LLM with rate-limit retry + exponential backoff
            raw_text = None
            for attempt in range(MAX_RETRIES):
                try:
                    completion = client.chat.completions.create(
                        model=MODEL_NAME,
                        messages=messages,
                        temperature=0.2,
                        max_tokens=2048,
                    )
                    raw_text = (completion.choices[0].message.content or "").strip()
                    break  # Success
                except Exception as exc:
                    exc_str = str(exc)
                    # Heuristic: any error text mentioning "429" or "rate"
                    # is treated as a rate limit and retried.
                    is_rate_limit = "429" in exc_str or "rate" in exc_str.lower()
                    if is_rate_limit and attempt < MAX_RETRIES - 1:
                        delay = RETRY_BASE_DELAY * (2 ** attempt)
                        print(f"  [RATE LIMIT] Retry {attempt + 1}/{MAX_RETRIES} in {delay}s...", flush=True)
                        time.sleep(delay)
                        continue
                    # Non-rate-limit error or final attempt — give up
                    messages.append({
                        "role": "user",
                        "content": f"API ERROR: {exc}. Please try again with a valid JSON action.",
                    })
                    # API error text may span several lines; escape it so the
                    # [STEP] record stays on one line like every other step.
                    log_step(
                        step=step,
                        action=_single_line(f"API_ERROR: {exc}"),
                        reward=0.0,
                        done=False,
                        error=_single_line(exc_str),
                    )
                    rewards.append(0.0)
                    steps_taken = step
                    break

            if raw_text is None:
                continue  # Skip to next step if all retries failed

            # Phase 1: Append assistant response to history
            messages.append({"role": "assistant", "content": raw_text})

            # Phase 3: Robust parsing with safe fallback
            try:
                action = parse_action(raw_text)
                action_log = action.model_dump_json()
            except Exception as exc:
                # Parse failure — No-Op fallback + corrective injection
                action = JiraCodeAction(action_type="list_files")
                action_log = f"PARSE_ERROR: {exc}"

                # Phase 4: Inject corrective message
                messages.append({
                    "role": "user",
                    "content": (
                        f"ERROR: Your last response was not valid JSON.\n"
                        f"Parse error: {exc}\n"
                        f"You MUST respond with ONLY a valid JSON object. "
                        f"No markdown, no explanations.\nTry again."
                    ),
                })

            # Take step in environment
            obs, reward, done, _ = env.step(action)
            error = obs.error

            # Ensure individual step rewards are strictly positive (min 0.01)
            reward = max(reward, 0.01)

            rewards.append(reward)
            steps_taken = step

            # Escape newlines for single-line logging (action and error alike)
            safe_action_str = _single_line(action_log)
            log_step(
                step=step,
                action=safe_action_str,
                reward=reward,
                done=done,
                error=_single_line(error),
            )

            if done:
                break

            # Phase 1: Append observation to conversation history
            obs_message = build_observation_message(step, obs, reward)

            # Phase 4: Self-correction prompt injection on low/negative reward or error
            # NOTE(review): the reward was clamped to >= 0.01 above, so this
            # branch fires for every step whose raw reward was <= 0.01. The
            # system prompt lists list_files/read_file at 0.01, so it looks
            # like plain exploration steps also receive this warning —
            # confirm that is intended.
            if reward <= 0.01 or obs.error:
                obs_message += (
                    f"\n\nLOW/NEGATIVE RESULT (reward={reward:.2f})."
                    f"\nCarefully analyze the error/test output above."
                    f"\nIdentify the root cause and write a fix."
                    f"\nDo NOT repeat the same action that just failed."
                )
            elif reward >= 0.4:
                obs_message += (
                    "\n\nTests are passing! If all tests pass, use 'submit' to finalize."
                )

            messages.append({"role": "user", "content": obs_message})

        # Calculate final score (clamp strictly between 0 and 1)
        score = min(max(sum(rewards), 0.01), 0.99)
        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        # Nested finally: the mandatory [END] record is emitted even when
        # env.close() itself raises; the exception still propagates after.
        try:
            env.close()
        finally:
            log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return score, steps_taken, rewards, success


# --- PHASE 5: MULTI-TASK EVALUATION ---
def main() -> None:
    """Command-line entry point: run the agent on a set of tasks and print a summary.

    With ``--tasks a,b,c`` the named tasks are run in the given order;
    unknown names abort the run with an error message. Without ``--tasks``,
    one task is sampled at random from each difficulty bucket ("easy",
    "medium", "hard" as substrings of the task name) that has any tasks.

    A 20-second pause is inserted between consecutive tasks (not after the
    last one) to respect API rate limits.
    """
    import random  # Only needed for the default task sampling below.

    parser = argparse.ArgumentParser(description="Jira-to-Code ReAct Agent")
    parser.add_argument(
        "--tasks",
        type=str,
        default=None,
        help=(
            "Comma-separated list of tasks to run. "
            f"Available: {', '.join(ALL_TASKS)}. "
            "Default: one randomly sampled easy, medium and hard task."
        ),
    )
    args = parser.parse_args()

    # Determine which tasks to run
    if args.tasks:
        tasks = [t.strip() for t in args.tasks.split(",")]
        invalid = [t for t in tasks if t not in ALL_TASKS]
        if invalid:
            print(f"ERROR: Unknown tasks: {invalid}", flush=True)
            print(f"Available: {ALL_TASKS}", flush=True)
            return
    else:
        # Baseline inference: 1 easy, 1 medium, 1 hard randomly sampled
        tasks = []
        for level in ("easy", "medium", "hard"):
            candidates = [t for t in ALL_TASKS if level in t]
            if candidates:
                tasks.append(random.choice(candidates))

    if not tasks:
        # Nothing matched any difficulty bucket; bail out before the summary
        # would divide by zero.
        print("ERROR: No tasks to run.", flush=True)
        print(f"Available: {ALL_TASKS}", flush=True)
        return

    print(f"Running tasks: {tasks}", flush=True)

    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

    total_score = 0.0
    results = []

    for index, task in enumerate(tasks):
        if index:
            # Pause only between tasks; waiting after the final one is wasted time.
            print("Waiting 20 seconds before next task to respect API limits...", flush=True)
            time.sleep(20)

        score, steps, rewards, success = run_agent_episode(client, task)
        results.append({
            "task": task,
            "score": score,
            "steps": steps,
            "success": success,
        })
        total_score += score

    # Summary
    print("\n" + "=" * 50, flush=True)
    print("EVALUATION SUMMARY", flush=True)
    print("=" * 50, flush=True)
    for r in results:
        status = "PASS" if r["success"] else "FAIL"
        print(
            f"  {r['task']:10s} | score={r['score']:.3f} | "
            f"steps={r['steps']:2d} | {status}",
            flush=True,
        )
    avg_score = total_score / len(tasks)
    print(f"  {'AVERAGE':10s} | score={avg_score:.3f}", flush=True)
    print(f"  {'TOTAL':10s} | score={total_score:.3f} / {len(tasks):.1f}", flush=True)
    print("=" * 50, flush=True)


# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()