krishnachoudhary-hclguvi commited on
Deploy OpenEnv Code Review to HF Spaces
Browse files- Dockerfile +29 -0
- README.md +59 -11
- app.py +27 -0
- checklist.md +30 -0
- code_review_env.py +88 -0
- inference.py +77 -0
- requirements.txt +7 -0
Dockerfile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Do not write .pyc files, and keep stdout/stderr unbuffered so log lines
# appear immediately
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Hugging Face Spaces requires running as a non-root user
RUN useradd -m -u 1000 user
USER user

# Set up the home directory and path (pip installs scripts into ~/.local/bin
# when running as a non-root user)
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory inside the container
WORKDIR $HOME/app

# Copy only the dependency list first so the install layer below is reused
# from the build cache until requirements.txt itself changes
COPY --chown=user:user requirements.txt $HOME/app/requirements.txt

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the project into the container and set ownership
COPY --chown=user:user . $HOME/app

# Expose port 7860 (Hugging Face Spaces default)
EXPOSE 7860

# Command to run the Gradio UI
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,11 +1,59 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenEnv Environment Submission
|
| 2 |
+
|
| 3 |
+
This repository contains the submission for the **Meta PyTorch OpenEnv Hackathon — Round 1**.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
This project implements an RL-style environment that follows the OpenEnv framework by Meta and Hugging Face. The environment exposes tasks, actions, step execution, and reward scoring.
|
| 7 |
+
|
| 8 |
+
**Domain:** Code Review (a custom domain; other example domains are email triage and scheduling)
|
| 9 |
+
|
| 10 |
+
## Project Structure
|
| 11 |
+
```
|
| 12 |
+
openEnv/
|
| 13 |
+
├── inference.py # Main execution script emitting required [START], [STEP], [END] logs.
|
| 14 |
+
├── requirements.txt # Project dependencies
|
| 15 |
+
├── README.md # This file
|
| 16 |
+
├── spec.md # Full Hackathon Specification
|
| 17 |
+
└── checklist.md # Submission Verification Checklist
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
## Setup & Execution
|
| 21 |
+
|
| 22 |
+
### Prerequisites
|
| 23 |
+
- Python 3.9+
|
| 24 |
+
- OpenAI Python client (`openai>=1.0.0`)
|
| 25 |
+
|
| 26 |
+
### Installation
|
| 27 |
+
```bash
|
| 28 |
+
pip install -r requirements.txt
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
### Environment Variables
|
| 32 |
+
For the inference script to run, the following environment variables are supported or required:
|
| 33 |
+
- `HF_TOKEN`: Required. Hugging Face Access Token.
|
| 34 |
+
- `API_BASE_URL`: Base URL for OpenAI client (Default: `https://api.openai.com/v1`)
|
| 35 |
+
- `MODEL_NAME`: The Language Model name (Default: `gpt-3.5-turbo`)
|
| 36 |
+
- `OPENAI_API_KEY`: API Key if hitting OpenAI directly or external OpenAI-compatible APIs.
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
export HF_TOKEN="your_hf_token"
|
| 40 |
+
export OPENAI_API_KEY="your_api_key"
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
### Run
|
| 44 |
+
Run the script so that its log lines go to `stdout` unchanged, because that is where the metrics are collected:
|
| 45 |
+
|
| 46 |
+
```bash
|
| 47 |
+
python inference.py
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### Output Formatting
|
| 51 |
+
The script outputs logs specifically formatted for the autograder:
|
| 52 |
+
- `[START] task=xyz env=abc model=mymodel`
|
| 53 |
+
- `[STEP] step=1 action=abc reward=0.00 done=false error=null`
|
| 54 |
+
- `[END] success=true steps=5 rewards=0.00,1.00`
|
| 55 |
+
|
| 56 |
+
## Hugging Face Spaces Deployment
|
| 57 |
+
*URL: `https://huggingface.co/spaces/YOUR_USER_ID/YOUR_SPACE_NAME`*
|
| 58 |
+
|
| 59 |
+
This project is configured to run efficiently on Hugging Face Spaces within the **2 vCPU & 8 GB RAM** limit, using a valid Docker-based build.
|
app.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import subprocess
|
| 3 |
+
|
| 4 |
+
def run_inference():
    """Run ``inference.py`` in a child process and return its combined output.

    Returns:
        The child's stdout, a newline, then its stderr. If the child does not
        finish within the timeout, a fixed timeout message is returned instead.
        Any other failure to run the child is returned as the exception text,
        so the caller always receives a string to display.
    """
    # Local import keeps this block self-contained.
    import sys

    timeout_seconds = 30
    try:
        # Run inference.py and capture exact output.
        # sys.executable is the interpreter running this app, so the child sees
        # the same installed packages; a bare 'python' is resolved through PATH
        # and may be a different interpreter or not exist at all.
        result = subprocess.run(
            [sys.executable, 'inference.py'],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
        )
        return result.stdout + "\n" + result.stderr
    except subprocess.TimeoutExpired:
        return f"Process timed out after {timeout_seconds} seconds."
    except Exception as e:
        # Surface the problem in the UI instead of raising into Gradio.
        return str(e)
|
| 13 |
+
|
| 14 |
+
# Build the page: two Markdown headers, a trigger button, and a read-only
# text box for the captured log.
with gr.Blocks(title="OpenEnv Code Review Hackathon", theme=gr.themes.Soft()) as app:
    gr.Markdown("# OpenEnv Environment: Code Review")
    gr.Markdown("This interface runs the `inference.py` backend and displays the `[START]`, `[STEP]`, `[END]` output strictly required by the hackathon spec.")

    with gr.Row():
        run_btn = gr.Button("Run Inference Agent")

    with gr.Row():
        output_display = gr.Textbox(label="Agent Output Log", lines=15, interactive=False)

    # A click calls run_inference() with no inputs and writes its return
    # value into the log text box.
    run_btn.click(fn=run_inference, outputs=output_display)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860 when run as a script.
    app.launch(server_name="0.0.0.0", server_port=7860)
|
checklist.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Submission Verification Checklist
|
| 2 |
+
|
| 3 |
+
Before submitting your project, double-check that all constraints and formats are satisfied.
|
| 4 |
+
|
| 5 |
+
### Required Files
|
| 6 |
+
- [ ] `inference.py` exists in the project root
|
| 7 |
+
- [ ] `requirements.txt` is updated and working
|
| 8 |
+
- [ ] `README.md` features clear instructions and your Demo URL
|
| 9 |
+
- [ ] Demo script/video (if applicable)
|
| 10 |
+
|
| 11 |
+
### Environment & Integrations
|
| 12 |
+
- [ ] `API_BASE_URL` reads properly and falls back to a default value
|
| 13 |
+
- [ ] `MODEL_NAME` reads properly and falls back to a default value
|
| 14 |
+
- [ ] `HF_TOKEN` is verified and successfully read
|
| 15 |
+
- [ ] The OpenAI Python Client SDK is strictly used for all LLM calls (no `requests` module directly)
|
| 16 |
+
|
| 17 |
+
### Evaluation Constraints
|
| 18 |
+
- [ ] Exact output format for `[START]` is used
|
| 19 |
+
- [ ] Exact output format for `[STEP]` is used
|
| 20 |
+
- [ ] Exact output format for `[END]` is used (always emitted)
|
| 21 |
+
- [ ] Rewards are logged with exactly `2` decimal places (e.g. `1.00`, not `1.0` or `1`)
|
| 22 |
+
- [ ] Booleans printed strictly as lowercase `true` or `false` (e.g., `success=true`, `done=false`)
|
| 23 |
+
|
| 24 |
+
### Hugging Face Space & Operations
|
| 25 |
+
- [ ] Hugging Face Space is Public and deployed in a 'Running' state
|
| 26 |
+
- [ ] Unused Spaces are disabled or turned off
|
| 27 |
+
- [ ] The Space/inference runs cleanly within `2 vCPU` and `8 GB RAM` limits
|
| 28 |
+
- [ ] The dockerization / environment does not rely on unpublished local-only dependencies
|
| 29 |
+
|
| 30 |
+
Good luck on Round 1!
|
code_review_env.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import json
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
|
| 6 |
+
class CodeReviewEnv:
    """RL-style code-review environment backed by a Hugging Face dataset.

    An episode starts with :meth:`reset`, which returns a prompt containing
    one code sample, and continues with :meth:`step` until the agent sends a
    final verdict (``APPROVE`` or ``REQUEST_CHANGES``) or ``max_steps`` steps
    have been taken.
    """

    def __init__(self, dataset_name="Krish-05/krish-bug-detect-fix", split="train"):
        """Store the episode configuration and try to load the dataset.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            split: Dataset split passed to ``load_dataset``.
        """
        self.task_name = "code_review_task"
        self.benchmark_name = "krish_bug_detect_benchmark"
        self.dataset_name = dataset_name
        self.split = split
        self.steps_taken = 0
        self.rewards = []
        self.current_sample = None
        self.max_steps = 5
        # Set here so both attributes exist even when loading fails.
        self.dataset = None
        self.current_idx = 0
        self._load_dataset()

    def _load_dataset(self):
        """Load the configured split; on any failure leave ``dataset`` as None."""
        try:
            self.dataset = load_dataset(self.dataset_name, split=self.split)
            self.current_idx = 0
        except Exception as e:
            # Report on stderr: stdout carries the [START]/[STEP]/[END] log
            # lines and must not receive anything else.
            print(f"Error loading dataset: {e}", file=sys.stderr)
            self.dataset = None

    def reset(self):
        """Start a new episode on the next sample and return the prompt.

        Returns:
            The observation text for the agent, or a string starting with
            ``"Error:"`` when no sample is available.
        """
        self.steps_taken = 0
        self.rewards = []

        if self.dataset is None:
            return "Error: Dataset not loaded."
        if len(self.dataset) == 0:
            # Indexing or taking the modulo below would raise on an empty split.
            return "Error: Dataset is empty."

        self.current_sample = self.dataset[self.current_idx]
        # Advance round-robin so consecutive episodes see different samples.
        self.current_idx = (self.current_idx + 1) % len(self.dataset)

        # NOTE: Adjusting these keys ('instruction', 'input', 'output' or similar)
        # depending on the actual schema of Krish-05/krish-bug-detect-fix
        buggy_code = self.current_sample.get(
            'buggy_code',
            self.current_sample.get('input', 'No code found'),
        )

        observation = (
            "You are a senior code reviewer. Please review the following code:\n"
            "\n"
            f"{buggy_code}\n"
            "\n"
            "Available actions:\n"
            "1. COMMENT <line_number> <issue_description>\n"
            "2. APPROVE\n"
            "3. REQUEST_CHANGES\n"
        )
        return observation

    def step(self, action):
        """Apply one agent action and score it.

        Args:
            action: Text produced by the agent. Surrounding whitespace is
                ignored; a non-string value is scored as an invalid action.

        Returns:
            A tuple ``(observation, reward, done, error)`` where ``reward`` is
            the step reward formatted with two decimals and ``error`` is
            always ``None``.
        """
        self.steps_taken += 1
        done = False

        action = action.strip() if isinstance(action, str) else ""

        # NOTE(review): any text starting with "COMMENT" is rewarded; the
        # "<line> <text>" part is not validated.
        if action.startswith("COMMENT"):
            # Intermediate reward for finding something to comment on.
            reward = 0.5
            obs = "Comment recorded. Any other issues, or are you ready to APPROVE / REQUEST_CHANGES?"
        elif action == "APPROVE":
            # Every sample is assumed to contain a bug, so approval is penalised.
            reward = -1.0
            done = True
            obs = "You approved flawed code."
        elif action == "REQUEST_CHANGES":
            # Good job, they rejected buggy code
            reward = 1.0
            done = True
            obs = "Changes requested successfully."
        else:
            reward = -0.1
            obs = "Invalid action format. Use COMMENT <line> <text>, APPROVE, or REQUEST_CHANGES."

        # One step-budget check covers every branch above.
        if self.steps_taken >= self.max_steps:
            done = True

        self.rewards.append(reward)
        formatted_reward = f"{reward:.2f}"

        return obs, formatted_reward, done, None
|
| 88 |
+
|
inference.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import traceback
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
from code_review_env import CodeReviewEnv
|
| 6 |
+
|
| 7 |
+
# -------------------------------------------------------------------
|
| 8 |
+
# Configuration & Environment Variables
|
| 9 |
+
# -------------------------------------------------------------------
|
| 10 |
+
# Base URL for the OpenAI client; falls back to the public OpenAI endpoint.
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
# Model name requested from the endpoint; falls back to gpt-3.5-turbo.
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-3.5-turbo")
# Hugging Face access token; there is no default, so this is None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 13 |
+
|
| 14 |
+
def validate_environment():
    """Exit with status 1 when the HF_TOKEN variable is missing or empty.

    Before exiting, a ``[STEP]`` record carrying the error and a closing
    ``[END]`` record are printed to stdout.
    """
    if HF_TOKEN:
        return

    # Token absent: print the failure records, then stop the process.
    print("[STEP] step=0 action=init reward=0.00 done=true error=HF_TOKEN_missing")
    print("[END] success=false steps=0 rewards=")
    sys.exit(1)
|
| 20 |
+
|
| 21 |
+
# -------------------------------------------------------------------
|
| 22 |
+
# Main Inference Loop
|
| 23 |
+
# -------------------------------------------------------------------
|
| 24 |
+
def main():
    """Run one code-review episode and print the autograder log to stdout.

    Prints one ``[START]`` line, one ``[STEP]`` line per environment step, and
    a final ``[END]`` line. The ``[END]`` line is printed from a ``finally``
    clause, so it is emitted even when the episode aborts with an exception.
    """
    validate_environment()

    # Initialize OpenAI Client (per requirements, use OpenAI Python client)
    # NOTE(review): HF_TOKEN is validated above but never passed to the
    # client; confirm whether the target endpoint expects it as the API key.
    client = OpenAI(
        base_url=API_BASE_URL,
        api_key=os.getenv("OPENAI_API_KEY", "dummy_if_not_needed_for_custom_endpoint")
    )

    env = CodeReviewEnv()

    # [START] Output
    print(f"[START] task={env.task_name} env={env.benchmark_name} model={MODEL_NAME}")

    success = False

    try:
        obs = env.reset()
        if getattr(env, "dataset", None) is None:
            # Without a dataset reset() only returns an error message; stop
            # here instead of sending that text to the model and reporting
            # the run as successful.
            raise RuntimeError("dataset_not_loaded")
        done = False

        while not done:
            # Ask the model for the next action via the standard OpenAI client.
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a precise code reviewer. Your ONLY allowed outputs are: 'COMMENT <line> <text>', 'APPROVE', or 'REQUEST_CHANGES'."},
                    {"role": "user", "content": obs}
                ],
                max_tokens=100
            )
            # The message content can be None; treat that as an empty action
            # so the environment scores it as invalid instead of crashing here.
            content = response.choices[0].message.content
            action_str = (content or "").strip()

            obs, reward_str, done, error = env.step(action_str)

            error_str = error if error else "null"
            done_str = "true" if done else "false"
            # Each [STEP] record must stay on one line: collapse newlines and
            # repeated whitespace in the model's reply before logging it.
            action_log = " ".join(action_str.split()) or "null"

            # [STEP] Output
            print(f"[STEP] step={env.steps_taken} action={action_log} reward={reward_str} done={done_str} error={error_str}")

        success = True

    except Exception as e:
        error_msg = str(e).replace('\n', ' ')
        print(f"[STEP] step={env.steps_taken} action=error reward=0.00 done=true error={error_msg}")
        success = False
    finally:
        # [END] Output MUST ALWAYS be emitted, even on exceptions
        success_str = "true" if success else "false"
        rewards_str = ",".join([f"{r:.2f}" for r in env.rewards])
        print(f"[END] success={success_str} steps={env.steps_taken} rewards={rewards_str}")
|
| 75 |
+
|
| 76 |
+
# Run the episode only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openai>=1.0.0
|
| 2 |
+
python-dotenv
|
| 3 |
+
datasets
|
| 4 |
+
gradio
|
| 5 |
+
|
| 6 |
+
# Add any required OpenEnv or domain-specific packages below:
|
| 7 |
+
# openenv
|