Spaces:

SanketAI
/

chiforge

Sleeping

App Files Files Community

chiforge / server /reward.py

SanketAI

Upload folder using huggingface_hub

cca9a63 verified about 2 months ago

raw

history blame contribute delete

2.94 kB

	"""
	Discrete reward logic for the ChipForge environment.
	"""

	import os
	import openai
	from dotenv import load_dotenv

	# Load environment variables before the lookups below run, so values
	# supplied through a .env file are visible to them.
	# NOTE(review): relies on python-dotenv's default .env discovery — confirm
	# the file location used in deployment.
	load_dotenv()

	# Module-level configuration read once at import time. Each value is None
	# when the corresponding environment variable is unset.
	api_key = os.environ.get("API_KEY")
	# Endpoint handed to the OpenAI client constructed in eval_llm_submit.
	base_url = os.environ.get("API_BASE_URL")
	# Model identifier handed to the chat-completion call in eval_llm_submit.
	model = os.environ.get("MODEL_NAME")

	from .constants import STEP_COST

	def eval_tool_reward(status: str, attempts: int) -> float:
	    """
	    Return the raw step reward for one tool invocation.

	    A run counts as successful when ``status`` is ``"pass"`` (the value
	    used by run_simulation and run_synthesis) or ``"clean"`` (the value
	    used by run_lint). Successful runs are rewarded less the more attempts
	    they needed; every other status is penalised.

	    Args:
	        status: Result string reported by the tool.
	        attempts: Number of attempts taken so far for this tool.

	    Returns:
	        0.3 for success on attempt 1, 0.2 on attempt 2, and 0.1 for any
	        other attempt count (this includes 3 and above, and also 0 or
	        negative values); -0.5 when the status is not a success value.
	    """
	    # Guard clause: anything that is not an explicit success is a failure.
	    if status not in ("pass", "clean"):
	        return -0.5

	    if attempts == 1:
	        return 0.3
	    if attempts == 2:
	        return 0.2
	    # Catch-all for every remaining attempt count.
	    return 0.1

	def eval_llm_submit(design_code: str, testbench_code: str, golden_code: str, task_desc: str) -> float:
	    """
	    Score a submission with an LLM judge.

	    The judge is shown the task description, the golden reference, and the
	    submitted design and testbench, and is asked to answer with a single
	    float. When no API key is configured, or when the request or the reply
	    parsing fails, the function degrades to a length-based heuristic
	    instead of raising.

	    The API key is read at call time from ``OPENAI_API_KEY`` and, if that
	    is unset or empty, from ``API_KEY``. The endpoint and the model name
	    come from the module-level ``base_url`` and ``model`` values.

	    Args:
	        design_code: Submitted design source.
	        testbench_code: Submitted testbench source.
	        golden_code: Reference implementation shown to the judge.
	        task_desc: Natural-language task description shown to the judge.

	    Returns:
	        The judge's score clamped to [-1.0, 1.0]; or, on the heuristic
	        path, 0.5 when both design and testbench contain more than 10
	        non-whitespace-trimmed characters and -1.0 otherwise.
	    """
	    # OPENAI_API_KEY keeps precedence; API_KEY is accepted as a fallback so
	    # a deployment that only sets API_KEY still reaches the judge.
	    judge_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("API_KEY")
	    if not judge_key:
	        print("WARNING: OPENAI_API_KEY/API_KEY not found. Falling back to length-based heuristic.")
	        return _length_heuristic(design_code, testbench_code)

	    client = openai.OpenAI(api_key=judge_key, base_url=base_url)

	    system_prompt = (
	        "You are an expert Verilog evaluation judge. Evaluate the provided design code and testbench against the task description and golden code. "
	        "Return ONLY a single float value between -1.0 and 1.0 representing the quality and correctness of the submission. "
	        "1.0 = Perfect, entirely correct. 0.0 = Partially correct. -1.0 = Completely incorrect. Do not output any other text."
	    )

	    user_prompt = (
	        f"Task Description:\n{task_desc}\n\n"
	        f"Golden Reference Code:\n{golden_code}\n\n"
	        f"Submitted Design Code:\n{design_code}\n\n"
	        f"Submitted Testbench Code:\n{testbench_code}"
	    )

	    try:
	        response = client.chat.completions.create(
	            model=model,
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": user_prompt},
	            ],
	            temperature=0.0,
	            max_tokens=10,
	        )
	        return _parse_judge_score(response.choices[0].message.content)
	    except Exception as e:
	        # Deliberate best-effort boundary: any transport, API, or parsing
	        # failure must not abort the episode, so degrade to the heuristic.
	        print(f"LLM Evaluation failed: {e}")
	        return _length_heuristic(design_code, testbench_code)


	def _length_heuristic(design_code: str, testbench_code: str) -> float:
	    """Return 0.5 when both sources exceed 10 stripped characters, else -1.0."""
	    has_design = len(design_code.strip()) > 10
	    has_testbench = len(testbench_code.strip()) > 10
	    return 0.5 if (has_design and has_testbench) else -1.0


	def _parse_judge_score(reply) -> float:
	    """
	    Extract the first decimal number from the judge reply and clamp it.

	    ``reply`` may be a string or None. Only plain numeric literals are
	    accepted, so replies such as "nan" or "inf" are rejected rather than
	    being clamped into a perfect score, while near-misses such as
	    "Score: 0.8" or "0.8." are still understood.

	    Raises:
	        ValueError: If the reply contains no numeric literal.
	    """
	    import re  # local import: keeps this block free of new file-level deps

	    found = re.search(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?", reply or "")
	    if found is None:
	        raise ValueError(f"no numeric score in judge reply: {reply!r}")
	    return max(-1.0, min(1.0, float(found.group())))

	def normalize_reward(raw_reward: float) -> float:
	    """
	    Map a raw reward onto the [0, 1] scale.

	    The nominal raw range [-1.0, 1.0] is mapped linearly to [0.0, 1.0];
	    values outside that range are clamped to the nearest bound.

	    Args:
	        raw_reward: Raw reward value.

	    Returns:
	        The normalized reward in [0.0, 1.0]. A NaN input yields 0.0.
	    """
	    import math  # local import: keeps this block free of new file-level deps

	    # NaN compares false against everything, so max(0.0, min(1.0, nan))
	    # would evaluate to 1.0 and hand out the maximum reward. Treat an
	    # undefined reward as the minimum instead.
	    if math.isnan(raw_reward):
	        return 0.0

	    mapped = (raw_reward + 1.0) / 2.0
	    return max(0.0, min(1.0, mapped))