Spaces:

Parthiban007
/

rust_coder

Running

App Files Files Community

Parthiban007 committed 2 days ago

Commit

7bc8744

verified ·

1 Parent(s): 9763ffa

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

models.py +1 -1
server/app.py +15 -15
server/rust_coder_environment.py +15 -0

models.py CHANGED Viewed

@@ -17,7 +17,7 @@ from pydantic import Field
 class RustCoderAction(Action):
     """Action for the Rust Coder environment - contains the Rust code to evaluate."""
-    code: str = Field(..., description="Rust source code to compile and run")
 class RustCoderObservation(Observation):

 class RustCoderAction(Action):
     """Action for the Rust Coder environment - contains the Rust code to evaluate."""
+    code: str = Field(default="", description="Rust source code to compile and run")
 class RustCoderObservation(Observation):

server/app.py CHANGED Viewed

@@ -67,10 +67,10 @@ def evaluate_single(problem_id, code=None):
     try:
         idx = int(problem_id.split(":")[0]) - 1
         problem = RustCoderEnvironment().problems[idx]
         # 1. Get code from LLM if not provided
         solution_code = code if code else get_llm_solution(problem["description"])
         # 2. Guard: If LLM failed, do not evaluate
         if solution_code.startswith("// LLM Error"):
             return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
@@ -80,7 +80,7 @@ def evaluate_single(problem_id, code=None):
         # Reset to the specifically requested index
         state = env.reset(start_index=idx)
         state = env.step(RustCoderAction(code=solution_code))
         metrics = {
             "Total Reward": f"{state.reward:.2f}",
             "Compilation": "Success" if state.compilation_success else "Failed",
@@ -96,7 +96,7 @@ def run_benchmark(progress=gr.Progress()):
         env = RustCoderEnvironment()
         rows = []
         total_score = 0.0
         # Check if token is actually present
         test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
         if not test_token:
@@ -106,16 +106,16 @@ def run_benchmark(progress=gr.Progress()):
             progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
             problem = env.problems[i]
             code = get_llm_solution(problem["description"])
             reward = 0.0
             compiled = "Failed (LLM Error)"
             if not code.startswith("// LLM Error"):
                 env.reset(start_index=i)
                 state = env.step(RustCoderAction(code=code))
                 reward = state.reward
                 compiled = "Success" if state.compilation_success else "Failed"
             rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
             total_score += reward
@@ -129,7 +129,7 @@ def run_benchmark(progress=gr.Progress()):
 def create_dashboard():
     with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
         gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")
         with gr.Tab("Individual Task Evaluation"):
             with gr.Row():
                 with gr.Column(scale=1):
@@ -137,30 +137,30 @@ def create_dashboard():
                     p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
                     dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
                     desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
                 with gr.Column(scale=1):
                     run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
                     code_display = gr.Code(label="AI Generated Solution", interactive=False)
                     results_json = gr.JSON(label="Metric Breakdown")
             def update_desc(p_str):
                 idx = int(p_str.split(":")[0]) - 1
                 p = p_env.problems[idx]
                 return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
             dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
             run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
         with gr.Tab("Full Environment Benchmark"):
             gr.Markdown("### Complete Environment Suite")
             gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
             b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
             b_sum = gr.Markdown()
             b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
             b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
     return demo
 # Final consolidated Gradio App mounted on the FastAPI server
@@ -168,7 +168,7 @@ app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
 def main(host: str = "0.0.0.0", port: int = 8000) -> None:
     """Entry point: uv run server or python -m server.app"""
-    import uvicorn
     uvicorn.run(app, host=host, port=port)

     try:
         idx = int(problem_id.split(":")[0]) - 1
         problem = RustCoderEnvironment().problems[idx]
         # 1. Get code from LLM if not provided
         solution_code = code if code else get_llm_solution(problem["description"])
         # 2. Guard: If LLM failed, do not evaluate
         if solution_code.startswith("// LLM Error"):
             return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
         # Reset to the specifically requested index
         state = env.reset(start_index=idx)
         state = env.step(RustCoderAction(code=solution_code))
         metrics = {
             "Total Reward": f"{state.reward:.2f}",
             "Compilation": "Success" if state.compilation_success else "Failed",
         env = RustCoderEnvironment()
         rows = []
         total_score = 0.0
         # Check if token is actually present
         test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
         if not test_token:
             progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
             problem = env.problems[i]
             code = get_llm_solution(problem["description"])
             reward = 0.0
             compiled = "Failed (LLM Error)"
             if not code.startswith("// LLM Error"):
                 env.reset(start_index=i)
                 state = env.step(RustCoderAction(code=code))
                 reward = state.reward
                 compiled = "Success" if state.compilation_success else "Failed"
             rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
             total_score += reward
 def create_dashboard():
     with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
         gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")
         with gr.Tab("Individual Task Evaluation"):
             with gr.Row():
                 with gr.Column(scale=1):
                     p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
                     dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
                     desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
                 with gr.Column(scale=1):
                     run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
                     code_display = gr.Code(label="AI Generated Solution", interactive=False)
                     results_json = gr.JSON(label="Metric Breakdown")
             def update_desc(p_str):
                 idx = int(p_str.split(":")[0]) - 1
                 p = p_env.problems[idx]
                 return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
             dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
             run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
         with gr.Tab("Full Environment Benchmark"):
             gr.Markdown("### Complete Environment Suite")
             gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
             b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
             b_sum = gr.Markdown()
             b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
             b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
     return demo
 # Final consolidated Gradio App mounted on the FastAPI server
 def main(host: str = "0.0.0.0", port: int = 8000) -> None:
     """Entry point: uv run server or python -m server.app"""
+    import uvicorn
     uvicorn.run(app, host=host, port=port)

server/rust_coder_environment.py CHANGED Viewed

@@ -117,6 +117,21 @@ class RustCoderEnvironment(Environment):
         problem = self.problems[self.current_problem_idx]
         code = action.code
         # ── 1. Compilation (40%) ──────────────────────────────────────
         compilation_success, compilation_output = self._compile_check(code)
         r_compilation = 1.0 if compilation_success else 0.0

         problem = self.problems[self.current_problem_idx]
         code = action.code
+        if not code.strip():
+            done = self.current_problem_idx >= len(self.problems) - 1
+            if not done:
+                self.current_problem_idx += 1
+            return RustCoderObservation(
+                problem_description=problem["description"],
+                starter_code=problem.get("starter_code", ""),
+                compilation_success=False,
+                compilation_output="Error: no code submitted.",
+                test_results=[],
+                reward_breakdown={"compilation": 0.0, "correctness": 0.0, "coverage": 0.0, "elegance": 0.0, "efficiency": 0.0},
+                done=done,
+                reward=0.0,
+            )
         # ── 1. Compilation (40%) ──────────────────────────────────────
         compilation_success, compilation_output = self._compile_check(code)
         r_compilation = 1.0 if compilation_success else 0.0