Spaces:
Runtime error
Runtime error
| """Evaluate a trained model against the recruiting environment.""" | |
| import argparse | |
| import json | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import torch | |
| from recruitopenenv import RecruitopenenvEnv, RecruitopenenvAction | |
# System prompt sent as the first chat message of every episode. It lists the
# four tool groups, the workflow rules, and the JSON shapes the model must emit.
# The stage arrows were mis-encoded as "β" and are restored to "→" here.
# NOTE(review): blank lines between sections may have been lost in extraction;
# confirm this text matches the prompt the model was trained with.
SYSTEM_PROMPT = """You are a truck driver recruiter using a CRM system. You only know the driver's name. You must discover their qualifications through conversation, record info in the CRM, get approval, and hire them.
You have 4 tools:
## crm
- read_candidate: Read the current CRM record
- update_stage: Advance pipeline (contacted → interested → approval_pending → offer_sent → hired)
- update_field: Record info (field + value)
- add_note: Add a free-text note
## messaging
- send_message: Send a message (topic: greeting, call, experience, home_time, pay, equipment, route, deal_breakers, availability, violations, medical_card, references, pitch, offer, negotiate_pay, negotiate_home_time, signing_bonus, address_concern)
- read_reply: Read the driver's response
## approval
- request_approval: Request approval for a job (needs job_id)
- check_approval: Check approval status
## workflow
- wait: Advance time (needed for approval processing)
## Rules
- Must read CRM before messaging
- Must read_reply before sending another message
- Must request_approval and wait before sending offer
- Must follow stage order: lead → contacted → interested → approval_pending → offer_sent → hired
- Record important info in CRM with update_field
Respond with ONLY JSON:
{"tool": "crm", "action": "read_candidate"}
{"tool": "messaging", "action": "send_message", "topic": "experience"}
{"tool": "messaging", "action": "read_reply"}
{"tool": "crm", "action": "update_field", "field": "cdl_class", "value": "A"}
{"tool": "crm", "action": "update_stage", "stage": "contacted"}
{"tool": "approval", "action": "request_approval", "job_id": 2}
{"tool": "workflow", "action": "wait"}
{"tool": "approval", "action": "check_approval"}
{"tool": "messaging", "action": "send_message", "topic": "offer", "job_id": 2}
{"tool": "crm", "action": "update_stage", "stage": "hired"}"""
def format_observation(obs):
    """Render an environment observation as the text of the next user turn.

    The driver name and the stage line are always present. The CRM, jobs,
    discovered-info and feedback sections are included only when the
    corresponding attribute on ``obs`` is truthy. Sections are joined with
    single newlines.
    """
    sections = [f"Driver: {obs.driver_name}"]

    # Optional sections, in the order they appear in the output.
    optional = (
        ("CRM", obs.crm_summary),
        ("Jobs", obs.jobs_summary),
        ("Discovered", obs.discovered_info),
    )
    for heading, body in optional:
        if body:
            sections.append(f"{heading}:\n{body}")

    # The stage line carries a marker while a driver reply is still unread.
    stage_line = f"Stage: {obs.stage}"
    if obs.pending_reply:
        stage_line = stage_line + " | PENDING REPLY"
    sections.append(stage_line)

    if obs.feedback:
        sections.append(f"Result: {obs.feedback}")

    return "\n".join(sections)
def _first_json_value(text):
    """Return the first JSON value found in ``text``, or None if there is none.

    The whole string is tried first. If that fails (surrounding prose, or
    several JSON objects on separate lines), each ``{`` is tried in turn as the
    start of an object and the first one that decodes is returned.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text, start)
            return value
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
    return None


def _coerce_job_id(raw):
    """Convert a model-supplied job id to int, falling back to -1 (no job)."""
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        # null, non-numeric strings, lists, infinities: treat as "no job".
        return -1


def parse_action(text):
    """Turn raw model output into a ``RecruitopenenvAction``.

    Parsing is best-effort and never raises on malformed text:

    1. If the reply contains ``` fences, the first fenced segment that starts
       with ``{`` (after an optional ``json`` language tag) is used.
    2. The first JSON value in the text is decoded. A list is reduced to its
       first element. A dict with both ``"tool"`` and ``"action"`` becomes the
       action, with missing optional fields defaulting to ``""`` and a missing
       or unusable ``job_id`` defaulting to ``-1``.
    3. Otherwise the text is scanned for a few argument-free action names.
    4. If nothing matches, the action is ``crm.read_candidate``.

    Args:
        text: The decoded completion produced by the model.

    Returns:
        A ``RecruitopenenvAction`` built from the reply or from the fallbacks.
    """
    text = text.strip()

    if "```" in text:
        for part in text.split("```"):
            part = part.strip()
            if part.startswith("json"):
                part = part[4:].strip()
            if part.startswith("{"):
                text = part
                break

    data = _first_json_value(text)
    if isinstance(data, list):
        data = data[0] if data else {}
    if isinstance(data, dict) and "tool" in data and "action" in data:
        return RecruitopenenvAction(
            tool=data["tool"],
            action=data["action"],
            topic=data.get("topic", ""),
            # Models sometimes emit "2" or null; callers compare job_id to 0.
            job_id=_coerce_job_id(data.get("job_id", -1)),
            stage=data.get("stage", ""),
            field=data.get("field", ""),
            value=data.get("value", ""),
        )

    # Keyword heuristics for replies that contain no usable JSON.
    text_lower = text.lower()
    if "read_candidate" in text_lower:
        return RecruitopenenvAction(tool="crm", action="read_candidate")
    if "read_reply" in text_lower:
        return RecruitopenenvAction(tool="messaging", action="read_reply")
    if "check_approval" in text_lower:
        return RecruitopenenvAction(tool="approval", action="check_approval")
    if "wait" in text_lower:
        return RecruitopenenvAction(tool="workflow", action="wait")

    return RecruitopenenvAction(tool="crm", action="read_candidate")
def generate(model, tokenizer, messages, device):
    """Sample one assistant reply for the given chat history.

    The messages are rendered with the tokenizer's chat template (with the
    generation prompt appended), tokenized, moved to ``device`` and passed to
    ``model.generate``. Only the tokens produced after the prompt are decoded
    and returned, with special tokens stripped.
    """
    chat_text = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    encoded = tokenizer(chat_text, return_tensors="pt").to(device)
    prompt_len = encoded["input_ids"].shape[1]

    sampling_kwargs = {
        "max_new_tokens": 128,
        "temperature": 0.1,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_kwargs)

    # The returned sequence begins with the prompt; keep only the continuation.
    completion_ids = sequences[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
def _run_episode(env, model, tokenizer, device, max_steps=100):
    """Play one episode and return ``(total_reward, steps, final_stage)``.

    Each turn appends the formatted observation and the model's reply to the
    chat history, so the model sees the whole episode so far. The loop stops
    when the environment reports ``done`` or after ``max_steps`` steps.
    """
    result = env.reset()
    obs = result.observation
    ep_reward = 0.0
    steps = 0
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    while not result.done and steps < max_steps:
        messages.append({"role": "user", "content": format_observation(obs)})
        response = generate(model, tokenizer, messages, device)
        messages.append({"role": "assistant", "content": response})

        action = parse_action(response)
        result = env.step(action)
        obs = result.observation
        # NOTE(review): treats a missing (None) reward as 0.0 so accumulation
        # and formatting cannot fail - confirm whether the client can return None.
        step_reward = result.reward or 0.0
        ep_reward += step_reward
        steps += 1

        print(f" Step {steps}: {action.tool}.{action.action}"
              f"{'(' + action.topic + ')' if action.topic else ''}"
              f"{'[job=' + str(action.job_id) + ']' if action.job_id >= 0 else ''}"
              f" -> reward={step_reward:.1f}")

    return ep_reward, steps, obs.stage


def _print_summary(label, model_path, rewards, successes, total_steps):
    """Print aggregate statistics for one evaluated model."""
    episodes = len(rewards)
    print(f"\n{'='*40}")
    print(f" {label} RESULTS")
    print(f"{'='*40}")
    print(f"Model: {model_path}")
    print(f"Episodes: {episodes}")
    if not rewards:
        # Averages, min and max are undefined for an empty run.
        print("No episodes were run; nothing to summarize.")
        print(f"{'='*40}")
        return
    print(f"Avg reward: {sum(rewards) / episodes:.2f}")
    print(f"Min reward: {min(rewards):.2f}")
    print(f"Max reward: {max(rewards):.2f}")
    print(f"Hire rate: {successes}/{episodes} ({100*successes/episodes:.1f}%)")
    print(f"Avg steps/episode: {total_steps / episodes:.1f}")
    print(f"{'='*40}")


def main():
    """Evaluate the trained model (and optionally the base model) in the env.

    Command-line options select the model paths, the environment URL, the
    number of episodes and whether to also run the base model. For each model,
    every episode is played to completion, per-step and per-episode results are
    printed, and a summary follows. The model is released before the next one
    is loaded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="./recruit-grpo-output", help="Path to trained model")
    parser.add_argument("--base-model", default="Qwen/Qwen2.5-1.5B-Instruct", help="Base model for comparison")
    parser.add_argument("--env-url", default="http://localhost:8001")
    parser.add_argument("--num-episodes", type=int, default=20)
    parser.add_argument("--compare", action="store_true", help="Also run base model for comparison")
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    models_to_eval = [("TRAINED", args.model)]
    if args.compare:
        models_to_eval.append(("BASE", args.base_model))

    for label, model_path in models_to_eval:
        print(f"\n{'='*50}")
        print(f"Evaluating: {label} ({model_path})")
        print(f"{'='*50}")

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto"
        )

        rewards = []
        successes = 0
        total_steps = 0

        with RecruitopenenvEnv(base_url=args.env_url) as env:
            for ep in range(args.num_episodes):
                ep_reward, steps, final_stage = _run_episode(
                    env, model, tokenizer, device
                )
                rewards.append(ep_reward)
                total_steps += steps

                hired = final_stage == "hired"
                if hired:
                    successes += 1
                print(f"Episode {ep+1}: reward={ep_reward:.1f}, steps={steps}, "
                      f"{'HIRED' if hired else 'FAIL (' + final_stage + ')'}")
                print()

        _print_summary(label, model_path, rewards, successes, total_steps)

        # Drop the model and clear the CUDA cache before loading the next one.
        del model
        torch.cuda.empty_cache()
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()