File size: 2,942 Bytes
cca9a63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Discrete reward logic for the ChipForge environment.
"""

import math
import os

import openai
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) so the environment lookups
# below can see them; this must run before the os.environ reads.
load_dotenv()

# Settings for the LLM judge. Each value is None when its variable is unset.
# NOTE(review): eval_llm_submit reads OPENAI_API_KEY itself and does not use
# this module-level api_key -- confirm which variable is the intended one.
api_key = os.environ.get("API_KEY")
# Passed as base_url when eval_llm_submit builds its OpenAI client.
base_url = os.environ.get("API_BASE_URL")
# Passed as the model name for eval_llm_submit's chat-completion request.
model = os.environ.get("MODEL_NAME")

from .constants import STEP_COST

def eval_tool_reward(status: str, attempts: int) -> float:
    """
    Compute the raw step reward for a single tool invocation.

    A tool counts as successful when its status is 'pass' (run_simulation,
    run_synthesis) or 'clean' (run_lint). Successful runs earn less the more
    attempts they took; any other status is penalized.

    Returns:
        0.3 on the first attempt, 0.2 on the second, 0.1 for any other
        attempt count, or -0.5 when the tool did not succeed.
    """
    # Guard clause: anything that is not a success status is a flat penalty.
    if status not in ("pass", "clean"):
        return -0.5

    # Reward tiers keyed by the attempt number on which the tool succeeded.
    for attempt_number, tier_reward in ((1, 0.3), (2, 0.2)):
        if attempts == attempt_number:
            return tier_reward

    # Every other attempt count gets the minimum positive reward.
    return 0.1

def _length_heuristic_score(design_code: str, testbench_code: str) -> float:
    """Return the non-LLM fallback score.

    0.5 when both the design and the testbench are longer than 10 characters
    after stripping surrounding whitespace, otherwise -1.0.
    """
    return 0.5 if (len(design_code.strip()) > 10 and len(testbench_code.strip()) > 10) else -1.0


def eval_llm_submit(design_code: str, testbench_code: str, golden_code: str, task_desc: str) -> float:
    """
    Uses an LLM judge to evaluate the design and testbench against the goal.

    The judge is asked to reply with a single float; that value is clamped to
    [-1.0, 1.0] and returned. When OPENAI_API_KEY is not set, or when the
    request fails or the reply cannot be read as a usable number, a
    length-based heuristic is used instead (0.5 if both submissions are
    non-trivial, else -1.0).

    Args:
        design_code: Submitted design source.
        testbench_code: Submitted testbench source.
        golden_code: Reference implementation shown to the judge.
        task_desc: Natural-language task description shown to the judge.

    Returns:
        A score between -1.0 and 1.0.
    """
    # Local name differs from the module-level `api_key` to avoid shadowing it.
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        print("WARNING: OPENAI_API_KEY not found. Falling back to length-based heuristic.")
        return _length_heuristic_score(design_code, testbench_code)

    system_prompt = (
        "You are an expert Verilog evaluation judge. Evaluate the provided design code and testbench against the task description and golden code. "
        "Return ONLY a single float value between -1.0 and 1.0 representing the quality and correctness of the submission. "
        "1.0 = Perfect, entirely correct. 0.0 = Partially correct. -1.0 = Completely incorrect. Do not output any other text."
    )

    user_prompt = (
        f"Task Description:\n{task_desc}\n\n"
        f"Golden Reference Code:\n{golden_code}\n\n"
        f"Submitted Design Code:\n{design_code}\n\n"
        f"Submitted Testbench Code:\n{testbench_code}"
    )

    # Deliberately broad: this is a best-effort boundary, and any failure
    # (client setup, network, API error, unparsable reply) must degrade to the
    # heuristic rather than crash the reward computation.
    try:
        client = openai.OpenAI(api_key=openai_api_key, base_url=base_url)
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.0,
            max_tokens=10
        )
        score = float(response.choices[0].message.content.strip())
        # float() accepts "nan", and max(-1.0, min(1.0, nan)) evaluates to 1.0,
        # so an unchecked NaN reply would be rewarded as a perfect score.
        if math.isnan(score):
            raise ValueError("judge returned NaN instead of a numeric score")
    except Exception as e:
        print(f"LLM Evaluation failed: {e}")
        # Fallback to basic length validation if LLM fails
        return _length_heuristic_score(design_code, testbench_code)

    return max(-1.0, min(1.0, score))

def normalize_reward(raw_reward: float) -> float:
    """
    Rescale a raw reward onto the unit interval.

    The raw range [-1.0, 1.0] is shifted and halved so that -1.0 maps to 0.0
    and 1.0 maps to 1.0; results outside that interval are clamped to its
    nearest bound.
    """
    shifted = (raw_reward + 1.0) / 2.0
    # Cap at the upper bound first, then floor at the lower bound.
    capped = shifted if shifted < 1.0 else 1.0
    return capped if capped > 0.0 else 0.0