chiforge / server /reward.py
SanketAI's picture
Upload folder using huggingface_hub
cca9a63 verified
"""
Discrete reward logic for the ChipForge environment.
"""
import os
import openai
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment
# before the settings below are read.
load_dotenv()

# LLM endpoint settings, read once at import time. Each value is None when the
# corresponding environment variable is not set.
api_key = os.getenv("API_KEY")
base_url = os.getenv("API_BASE_URL")
model = os.getenv("MODEL_NAME")
from .constants import STEP_COST
def eval_tool_reward(status: str, attempts: int) -> float:
    """
    Score a single tool invocation from its status and attempt number.

    A status of 'pass' (reported by run_simulation and run_synthesis) or
    'clean' (reported by run_lint) counts as success; any other status is
    a failure.

    Returns:
        -0.5 on failure. On success: 0.3 for the first attempt, 0.2 for the
        second, and 0.1 for any other attempt count.
    """
    succeeded = status in ("pass", "clean")
    if not succeeded:
        return -0.5

    # Earlier successes earn more; everything past the second attempt is flat.
    if attempts == 1:
        return 0.3
    return 0.2 if attempts == 2 else 0.1
def _length_fallback(design_code: str, testbench_code: str) -> float:
    """Heuristic score used when no LLM verdict is available: 0.5 if both
    submissions have more than 10 characters after stripping, else -1.0."""
    has_design = len(design_code.strip()) > 10
    has_testbench = len(testbench_code.strip()) > 10
    return 0.5 if (has_design and has_testbench) else -1.0


def _parse_judge_score(reply):
    """Return the first decimal number found in the judge's reply as a float,
    or None when the reply is empty/None or contains no number."""
    import re

    if not reply:
        return None
    # Only digit-based numbers are accepted, so tokens such as "nan" never
    # reach the clamp below (max/min would silently turn NaN into 1.0).
    found = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?", reply)
    if found is None:
        return None
    return float(found.group())


def eval_llm_submit(design_code: str, testbench_code: str, golden_code: str, task_desc: str) -> float:
    """
    Uses an LLM judge to evaluate the design and testbench against the goal.

    Args:
        design_code: Submitted design source.
        testbench_code: Submitted testbench source.
        golden_code: Reference implementation shown to the judge.
        task_desc: Natural-language task description shown to the judge.

    Returns:
        The judge's score clamped to [-1.0, 1.0]. If no API key is configured,
        the request fails, or the reply contains no usable number, the
        length-based heuristic is returned instead (0.5 when both submissions
        are longer than 10 stripped characters, otherwise -1.0).
    """
    # OPENAI_API_KEY takes precedence; API_KEY is accepted as well so the key
    # can be configured alongside API_BASE_URL / MODEL_NAME.
    llm_api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("API_KEY")
    if not llm_api_key:
        print("WARNING: OPENAI_API_KEY / API_KEY not found. Falling back to length-based heuristic.")
        return _length_fallback(design_code, testbench_code)

    client = openai.OpenAI(api_key=llm_api_key, base_url=base_url)

    system_prompt = (
        "You are an expert Verilog evaluation judge. Evaluate the provided design code and testbench against the task description and golden code. "
        "Return ONLY a single float value between -1.0 and 1.0 representing the quality and correctness of the submission. "
        "1.0 = Perfect, entirely correct. 0.0 = Partially correct. -1.0 = Completely incorrect. Do not output any other text."
    )
    user_prompt = (
        f"Task Description:\n{task_desc}\n\n"
        f"Golden Reference Code:\n{golden_code}\n\n"
        f"Submitted Design Code:\n{design_code}\n\n"
        f"Submitted Testbench Code:\n{testbench_code}"
    )

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.0,
            max_tokens=10,
        )
        reply = response.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: any client/transport error degrades to the heuristic.
        print(f"LLM Evaluation failed: {e}")
        return _length_fallback(design_code, testbench_code)

    score = _parse_judge_score(reply)
    if score is None:
        print(f"LLM Evaluation failed: could not parse a score from {reply!r}")
        return _length_fallback(design_code, testbench_code)

    return max(-1.0, min(1.0, score))
def normalize_reward(raw_reward: float) -> float:
    """
    Rescale a raw reward onto the unit interval.

    The raw range [-1.0, 1.0] is mapped linearly onto [0.0, 1.0]; anything
    that lands outside that interval is pinned to the nearest bound.
    """
    scaled = (raw_reward + 1.0) / 2.0
    if scaled <= 0.0:
        return 0.0
    if scaled <= 1.0:
        return scaled
    return 1.0