Sibam committed on
Commit
a4c268d
·
1 Parent(s): 14c1b69

final: submission ready

Browse files
Files changed (3) hide show
  1. inference.py +11 -7
  2. models.py +5 -0
  3. server/environment.py +1 -1
inference.py CHANGED
@@ -20,7 +20,7 @@ from openai import OpenAI
20
 
21
  # ── Mandatory env vars ─────────────────────────────────────────
22
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
23
- MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
24
  HF_TOKEN = os.getenv("HF_TOKEN")
25
  ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
26
 
@@ -90,17 +90,21 @@ def parse_json(text: str, fallback: dict) -> dict:
90
 
91
  SYSTEMS = {
92
  "pairwise": (
93
- 'You judge LLM responses. '
 
94
  'Reply ONLY with valid JSON: {"choice":"A"} or {"choice":"B"} or {"choice":"tie"}.'
95
  ),
96
  "likert": (
97
- 'You score responses on 4 axes (1=worst, 5=best). '
98
- 'Reply ONLY with valid JSON: '
99
- '{"helpfulness":4,"honesty":5,"harmlessness":5,"instruction_following":4}'
 
100
  ),
101
  "consistency": (
102
- 'You rank 4 responses from best to worst. '
103
- 'Reply ONLY with valid JSON: {"ranking":["B","A","C","D"]}'
 
 
104
  ),
105
  }
106
 
 
20
 
21
  # ── Mandatory env vars ─────────────────────────────────────────
22
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
23
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
24
  HF_TOKEN = os.getenv("HF_TOKEN")
25
  ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
26
 
 
90
 
91
  SYSTEMS = {
92
  "pairwise": (
93
+ 'You are an expert RLHF annotator. Think step by step before answering.\n'
94
+ 'Example: Prompt: "What is 2+2?" A: "4" B: "Five" → {"choice":"A"} because A is factually correct.\n'
95
  'Reply ONLY with valid JSON: {"choice":"A"} or {"choice":"B"} or {"choice":"tie"}.'
96
  ),
97
  "likert": (
98
+ 'You are an expert RLHF annotator. Think step by step.\n'
99
+ 'Rate helpfulness (does it answer?), honesty (is it true?), '
100
+ 'harmlessness (is it safe?), instruction_following (does it follow exactly?).\n'
101
+ 'Reply ONLY with JSON: {"helpfulness":4,"honesty":5,"harmlessness":5,"instruction_following":4}'
102
  ),
103
  "consistency": (
104
+ 'You are an expert RLHF annotator. Think step by step.\n'
105
+ 'Rank responses by: accuracy first, then completeness, then clarity.\n'
106
+ 'Example: If C is most accurate and D is vague → {"ranking":["C","A","B","D"]}\n'
107
+ 'Reply ONLY with JSON: {"ranking":["B","A","C","D"]}'
108
  ),
109
  }
110
 
models.py CHANGED
@@ -31,6 +31,11 @@ class PairwiseAction(Action):
31
  default=None,
32
  description="Optional reasoning for the choice (not used for grading).",
33
  )
 
 
 
 
 
34
 
35
 
36
  class LikertAction(Action):
 
31
  default=None,
32
  description="Optional reasoning for the choice (not used for grading).",
33
  )
34
+ confidence: float = Field(
35
+ default=0.8,
36
+ ge=0.0, le=1.0,
37
+ description="Annotator confidence in this choice (0.0-1.0)"
38
+ )
39
 
40
 
41
  class LikertAction(Action):
server/environment.py CHANGED
@@ -186,7 +186,7 @@ def grade_consistency(action: ConsistencyAction, example: dict) -> tuple[float,
186
  # ── Environment ───────────────────────────────────────────────
187
 
188
  TASK_TYPES = ["pairwise", "likert", "consistency"]
189
- MAX_STEPS_PER_EPISODE = 5
190
 
191
 
192
  class PreferenceLabEnvironment(Environment):
 
186
  # ── Environment ───────────────────────────────────────────────
187
 
188
  TASK_TYPES = ["pairwise", "likert", "consistency"]
189
+ MAX_STEPS_PER_EPISODE = 10
190
 
191
 
192
  class PreferenceLabEnvironment(Environment):