Parthiban007 commited on
Commit
7bc8744
·
verified ·
1 Parent(s): 9763ffa

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. models.py +1 -1
  2. server/app.py +15 -15
  3. server/rust_coder_environment.py +15 -0
models.py CHANGED
@@ -17,7 +17,7 @@ from pydantic import Field
17
  class RustCoderAction(Action):
18
  """Action for the Rust Coder environment - contains the Rust code to evaluate."""
19
 
20
- code: str = Field(..., description="Rust source code to compile and run")
21
 
22
 
23
  class RustCoderObservation(Observation):
 
17
  class RustCoderAction(Action):
18
  """Action for the Rust Coder environment - contains the Rust code to evaluate."""
19
 
20
+ code: str = Field(default="", description="Rust source code to compile and run")
21
 
22
 
23
  class RustCoderObservation(Observation):
server/app.py CHANGED
@@ -67,10 +67,10 @@ def evaluate_single(problem_id, code=None):
67
  try:
68
  idx = int(problem_id.split(":")[0]) - 1
69
  problem = RustCoderEnvironment().problems[idx]
70
-
71
  # 1. Get code from LLM if not provided
72
  solution_code = code if code else get_llm_solution(problem["description"])
73
-
74
  # 2. Guard: If LLM failed, do not evaluate
75
  if solution_code.startswith("// LLM Error"):
76
  return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
@@ -80,7 +80,7 @@ def evaluate_single(problem_id, code=None):
80
  # Reset to the specifically requested index
81
  state = env.reset(start_index=idx)
82
  state = env.step(RustCoderAction(code=solution_code))
83
-
84
  metrics = {
85
  "Total Reward": f"{state.reward:.2f}",
86
  "Compilation": "Success" if state.compilation_success else "Failed",
@@ -96,7 +96,7 @@ def run_benchmark(progress=gr.Progress()):
96
  env = RustCoderEnvironment()
97
  rows = []
98
  total_score = 0.0
99
-
100
  # Check if token is actually present
101
  test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
102
  if not test_token:
@@ -106,16 +106,16 @@ def run_benchmark(progress=gr.Progress()):
106
  progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
107
  problem = env.problems[i]
108
  code = get_llm_solution(problem["description"])
109
-
110
  reward = 0.0
111
  compiled = "Failed (LLM Error)"
112
-
113
  if not code.startswith("// LLM Error"):
114
  env.reset(start_index=i)
115
  state = env.step(RustCoderAction(code=code))
116
  reward = state.reward
117
  compiled = "Success" if state.compilation_success else "Failed"
118
-
119
  rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
120
  total_score += reward
121
 
@@ -129,7 +129,7 @@ def run_benchmark(progress=gr.Progress()):
129
  def create_dashboard():
130
  with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
131
  gr.Markdown("# πŸ¦€ Rust Coder: LLM Evaluation Dashboard")
132
-
133
  with gr.Tab("Individual Task Evaluation"):
134
  with gr.Row():
135
  with gr.Column(scale=1):
@@ -137,30 +137,30 @@ def create_dashboard():
137
  p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
138
  dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
139
  desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
140
-
141
  with gr.Column(scale=1):
142
  run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
143
  code_display = gr.Code(label="AI Generated Solution", interactive=False)
144
  results_json = gr.JSON(label="Metric Breakdown")
145
-
146
  def update_desc(p_str):
147
  idx = int(p_str.split(":")[0]) - 1
148
  p = p_env.problems[idx]
149
  return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
150
-
151
  dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
152
  run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
153
 
154
  with gr.Tab("Full Environment Benchmark"):
155
  gr.Markdown("### Complete Environment Suite")
156
  gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
157
-
158
  b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
159
  b_sum = gr.Markdown()
160
  b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
161
-
162
  b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
163
-
164
  return demo
165
 
166
  # Final consolidated Gradio App mounted on the FastAPI server
@@ -168,7 +168,7 @@ app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
168
 
169
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
170
  """Entry point: uv run server or python -m server.app"""
171
- import uvicorn
172
  uvicorn.run(app, host=host, port=port)
173
 
174
 
 
67
  try:
68
  idx = int(problem_id.split(":")[0]) - 1
69
  problem = RustCoderEnvironment().problems[idx]
70
+
71
  # 1. Get code from LLM if not provided
72
  solution_code = code if code else get_llm_solution(problem["description"])
73
+
74
  # 2. Guard: If LLM failed, do not evaluate
75
  if solution_code.startswith("// LLM Error"):
76
  return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
 
80
  # Reset to the specifically requested index
81
  state = env.reset(start_index=idx)
82
  state = env.step(RustCoderAction(code=solution_code))
83
+
84
  metrics = {
85
  "Total Reward": f"{state.reward:.2f}",
86
  "Compilation": "Success" if state.compilation_success else "Failed",
 
96
  env = RustCoderEnvironment()
97
  rows = []
98
  total_score = 0.0
99
+
100
  # Check if token is actually present
101
  test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
102
  if not test_token:
 
106
  progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
107
  problem = env.problems[i]
108
  code = get_llm_solution(problem["description"])
109
+
110
  reward = 0.0
111
  compiled = "Failed (LLM Error)"
112
+
113
  if not code.startswith("// LLM Error"):
114
  env.reset(start_index=i)
115
  state = env.step(RustCoderAction(code=code))
116
  reward = state.reward
117
  compiled = "Success" if state.compilation_success else "Failed"
118
+
119
  rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
120
  total_score += reward
121
 
 
129
  def create_dashboard():
130
  with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
131
  gr.Markdown("# πŸ¦€ Rust Coder: LLM Evaluation Dashboard")
132
+
133
  with gr.Tab("Individual Task Evaluation"):
134
  with gr.Row():
135
  with gr.Column(scale=1):
 
137
  p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
138
  dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
139
  desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
140
+
141
  with gr.Column(scale=1):
142
  run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
143
  code_display = gr.Code(label="AI Generated Solution", interactive=False)
144
  results_json = gr.JSON(label="Metric Breakdown")
145
+
146
  def update_desc(p_str):
147
  idx = int(p_str.split(":")[0]) - 1
148
  p = p_env.problems[idx]
149
  return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
150
+
151
  dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
152
  run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
153
 
154
  with gr.Tab("Full Environment Benchmark"):
155
  gr.Markdown("### Complete Environment Suite")
156
  gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
157
+
158
  b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
159
  b_sum = gr.Markdown()
160
  b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
161
+
162
  b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
163
+
164
  return demo
165
 
166
  # Final consolidated Gradio App mounted on the FastAPI server
 
168
 
169
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
170
  """Entry point: uv run server or python -m server.app"""
171
+ import uvicorn
172
  uvicorn.run(app, host=host, port=port)
173
 
174
 
server/rust_coder_environment.py CHANGED
@@ -117,6 +117,21 @@ class RustCoderEnvironment(Environment):
117
  problem = self.problems[self.current_problem_idx]
118
  code = action.code
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  # ── 1. Compilation (40%) ──────────────────────────────────────
121
  compilation_success, compilation_output = self._compile_check(code)
122
  r_compilation = 1.0 if compilation_success else 0.0
 
117
  problem = self.problems[self.current_problem_idx]
118
  code = action.code
119
 
120
+ if not code.strip():
121
+ done = self.current_problem_idx >= len(self.problems) - 1
122
+ if not done:
123
+ self.current_problem_idx += 1
124
+ return RustCoderObservation(
125
+ problem_description=problem["description"],
126
+ starter_code=problem.get("starter_code", ""),
127
+ compilation_success=False,
128
+ compilation_output="Error: no code submitted.",
129
+ test_results=[],
130
+ reward_breakdown={"compilation": 0.0, "correctness": 0.0, "coverage": 0.0, "elegance": 0.0, "efficiency": 0.0},
131
+ done=done,
132
+ reward=0.0,
133
+ )
134
+
135
  # ── 1. Compilation (40%) ──────────────────────────────────────
136
  compilation_success, compilation_output = self._compile_check(code)
137
  r_compilation = 1.0 if compilation_success else 0.0