Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
- models.py +1 -1
- server/app.py +15 -15
- server/rust_coder_environment.py +15 -0
models.py
CHANGED
|
@@ -17,7 +17,7 @@ from pydantic import Field
|
|
| 17 |
class RustCoderAction(Action):
|
| 18 |
"""Action for the Rust Coder environment - contains the Rust code to evaluate."""
|
| 19 |
|
| 20 |
-
code: str = Field(
|
| 21 |
|
| 22 |
|
| 23 |
class RustCoderObservation(Observation):
|
|
|
|
| 17 |
class RustCoderAction(Action):
|
| 18 |
"""Action for the Rust Coder environment - contains the Rust code to evaluate."""
|
| 19 |
|
| 20 |
+
code: str = Field(default="", description="Rust source code to compile and run")
|
| 21 |
|
| 22 |
|
| 23 |
class RustCoderObservation(Observation):
|
server/app.py
CHANGED
|
@@ -67,10 +67,10 @@ def evaluate_single(problem_id, code=None):
|
|
| 67 |
try:
|
| 68 |
idx = int(problem_id.split(":")[0]) - 1
|
| 69 |
problem = RustCoderEnvironment().problems[idx]
|
| 70 |
-
|
| 71 |
# 1. Get code from LLM if not provided
|
| 72 |
solution_code = code if code else get_llm_solution(problem["description"])
|
| 73 |
-
|
| 74 |
# 2. Guard: If LLM failed, do not evaluate
|
| 75 |
if solution_code.startswith("// LLM Error"):
|
| 76 |
return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
|
|
@@ -80,7 +80,7 @@ def evaluate_single(problem_id, code=None):
|
|
| 80 |
# Reset to the specifically requested index
|
| 81 |
state = env.reset(start_index=idx)
|
| 82 |
state = env.step(RustCoderAction(code=solution_code))
|
| 83 |
-
|
| 84 |
metrics = {
|
| 85 |
"Total Reward": f"{state.reward:.2f}",
|
| 86 |
"Compilation": "Success" if state.compilation_success else "Failed",
|
|
@@ -96,7 +96,7 @@ def run_benchmark(progress=gr.Progress()):
|
|
| 96 |
env = RustCoderEnvironment()
|
| 97 |
rows = []
|
| 98 |
total_score = 0.0
|
| 99 |
-
|
| 100 |
# Check if token is actually present
|
| 101 |
test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 102 |
if not test_token:
|
|
@@ -106,16 +106,16 @@ def run_benchmark(progress=gr.Progress()):
|
|
| 106 |
progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
|
| 107 |
problem = env.problems[i]
|
| 108 |
code = get_llm_solution(problem["description"])
|
| 109 |
-
|
| 110 |
reward = 0.0
|
| 111 |
compiled = "Failed (LLM Error)"
|
| 112 |
-
|
| 113 |
if not code.startswith("// LLM Error"):
|
| 114 |
env.reset(start_index=i)
|
| 115 |
state = env.step(RustCoderAction(code=code))
|
| 116 |
reward = state.reward
|
| 117 |
compiled = "Success" if state.compilation_success else "Failed"
|
| 118 |
-
|
| 119 |
rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
|
| 120 |
total_score += reward
|
| 121 |
|
|
@@ -129,7 +129,7 @@ def run_benchmark(progress=gr.Progress()):
|
|
| 129 |
def create_dashboard():
|
| 130 |
with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
|
| 131 |
gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")
|
| 132 |
-
|
| 133 |
with gr.Tab("Individual Task Evaluation"):
|
| 134 |
with gr.Row():
|
| 135 |
with gr.Column(scale=1):
|
|
@@ -137,30 +137,30 @@ def create_dashboard():
|
|
| 137 |
p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
|
| 138 |
dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
|
| 139 |
desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
|
| 140 |
-
|
| 141 |
with gr.Column(scale=1):
|
| 142 |
run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
|
| 143 |
code_display = gr.Code(label="AI Generated Solution", interactive=False)
|
| 144 |
results_json = gr.JSON(label="Metric Breakdown")
|
| 145 |
-
|
| 146 |
def update_desc(p_str):
|
| 147 |
idx = int(p_str.split(":")[0]) - 1
|
| 148 |
p = p_env.problems[idx]
|
| 149 |
return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
|
| 150 |
-
|
| 151 |
dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
|
| 152 |
run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
|
| 153 |
|
| 154 |
with gr.Tab("Full Environment Benchmark"):
|
| 155 |
gr.Markdown("### Complete Environment Suite")
|
| 156 |
gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
|
| 157 |
-
|
| 158 |
b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
|
| 159 |
b_sum = gr.Markdown()
|
| 160 |
b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
|
| 161 |
-
|
| 162 |
b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
|
| 163 |
-
|
| 164 |
return demo
|
| 165 |
|
| 166 |
# Final consolidated Gradio App mounted on the FastAPI server
|
|
@@ -168,7 +168,7 @@ app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
|
|
| 168 |
|
| 169 |
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 170 |
"""Entry point: uv run server or python -m server.app"""
|
| 171 |
-
import uvicorn
|
| 172 |
uvicorn.run(app, host=host, port=port)
|
| 173 |
|
| 174 |
|
|
|
|
| 67 |
try:
|
| 68 |
idx = int(problem_id.split(":")[0]) - 1
|
| 69 |
problem = RustCoderEnvironment().problems[idx]
|
| 70 |
+
|
| 71 |
# 1. Get code from LLM if not provided
|
| 72 |
solution_code = code if code else get_llm_solution(problem["description"])
|
| 73 |
+
|
| 74 |
# 2. Guard: If LLM failed, do not evaluate
|
| 75 |
if solution_code.startswith("// LLM Error"):
|
| 76 |
return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
|
|
|
|
| 80 |
# Reset to the specifically requested index
|
| 81 |
state = env.reset(start_index=idx)
|
| 82 |
state = env.step(RustCoderAction(code=solution_code))
|
| 83 |
+
|
| 84 |
metrics = {
|
| 85 |
"Total Reward": f"{state.reward:.2f}",
|
| 86 |
"Compilation": "Success" if state.compilation_success else "Failed",
|
|
|
|
| 96 |
env = RustCoderEnvironment()
|
| 97 |
rows = []
|
| 98 |
total_score = 0.0
|
| 99 |
+
|
| 100 |
# Check if token is actually present
|
| 101 |
test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 102 |
if not test_token:
|
|
|
|
| 106 |
progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
|
| 107 |
problem = env.problems[i]
|
| 108 |
code = get_llm_solution(problem["description"])
|
| 109 |
+
|
| 110 |
reward = 0.0
|
| 111 |
compiled = "Failed (LLM Error)"
|
| 112 |
+
|
| 113 |
if not code.startswith("// LLM Error"):
|
| 114 |
env.reset(start_index=i)
|
| 115 |
state = env.step(RustCoderAction(code=code))
|
| 116 |
reward = state.reward
|
| 117 |
compiled = "Success" if state.compilation_success else "Failed"
|
| 118 |
+
|
| 119 |
rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
|
| 120 |
total_score += reward
|
| 121 |
|
|
|
|
| 129 |
def create_dashboard():
|
| 130 |
with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
|
| 131 |
gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")
|
| 132 |
+
|
| 133 |
with gr.Tab("Individual Task Evaluation"):
|
| 134 |
with gr.Row():
|
| 135 |
with gr.Column(scale=1):
|
|
|
|
| 137 |
p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
|
| 138 |
dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
|
| 139 |
desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
|
| 140 |
+
|
| 141 |
with gr.Column(scale=1):
|
| 142 |
run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
|
| 143 |
code_display = gr.Code(label="AI Generated Solution", interactive=False)
|
| 144 |
results_json = gr.JSON(label="Metric Breakdown")
|
| 145 |
+
|
| 146 |
def update_desc(p_str):
|
| 147 |
idx = int(p_str.split(":")[0]) - 1
|
| 148 |
p = p_env.problems[idx]
|
| 149 |
return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
|
| 150 |
+
|
| 151 |
dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
|
| 152 |
run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
|
| 153 |
|
| 154 |
with gr.Tab("Full Environment Benchmark"):
|
| 155 |
gr.Markdown("### Complete Environment Suite")
|
| 156 |
gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
|
| 157 |
+
|
| 158 |
b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
|
| 159 |
b_sum = gr.Markdown()
|
| 160 |
b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
|
| 161 |
+
|
| 162 |
b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
|
| 163 |
+
|
| 164 |
return demo
|
| 165 |
|
| 166 |
# Final consolidated Gradio App mounted on the FastAPI server
|
|
|
|
| 168 |
|
| 169 |
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 170 |
"""Entry point: uv run server or python -m server.app"""
|
| 171 |
+
import uvicorn
|
| 172 |
uvicorn.run(app, host=host, port=port)
|
| 173 |
|
| 174 |
|
server/rust_coder_environment.py
CHANGED
|
@@ -117,6 +117,21 @@ class RustCoderEnvironment(Environment):
|
|
| 117 |
problem = self.problems[self.current_problem_idx]
|
| 118 |
code = action.code
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
# ── 1. Compilation (40%) ──────────────────────────────────────
|
| 121 |
compilation_success, compilation_output = self._compile_check(code)
|
| 122 |
r_compilation = 1.0 if compilation_success else 0.0
|
|
|
|
| 117 |
problem = self.problems[self.current_problem_idx]
|
| 118 |
code = action.code
|
| 119 |
|
| 120 |
+
if not code.strip():
|
| 121 |
+
done = self.current_problem_idx >= len(self.problems) - 1
|
| 122 |
+
if not done:
|
| 123 |
+
self.current_problem_idx += 1
|
| 124 |
+
return RustCoderObservation(
|
| 125 |
+
problem_description=problem["description"],
|
| 126 |
+
starter_code=problem.get("starter_code", ""),
|
| 127 |
+
compilation_success=False,
|
| 128 |
+
compilation_output="Error: no code submitted.",
|
| 129 |
+
test_results=[],
|
| 130 |
+
reward_breakdown={"compilation": 0.0, "correctness": 0.0, "coverage": 0.0, "elegance": 0.0, "efficiency": 0.0},
|
| 131 |
+
done=done,
|
| 132 |
+
reward=0.0,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
# ── 1. Compilation (40%) ──────────────────────────────────────
|
| 136 |
compilation_success, compilation_output = self._compile_check(code)
|
| 137 |
r_compilation = 1.0 if compilation_success else 0.0
|