| import os |
| import torch |
| import numpy as np |
| from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead |
| from transformers import AutoTokenizer |
| from datasets import Dataset |
| from peft import PeftModel, AutoPeftModelForCausalLM |
| import sys |
| from transformers import AutoModelForCausalLM |
|
|
| |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../classes'))) |
| from expression import Expression |
| from dataset import RegressionDataset |
|
|
| |
def compute_reward(expression_str: str) -> float:
    """Score a candidate symbolic expression against the global dataset (X, y).

    Parameters:
        expression_str: textual expression produced by the language model.

    Returns:
        The constant-fitting score floored at 0.1 when the expression is valid
        on the dataset and the fit is finite; -1.0 for invalid expressions,
        non-finite scores, or any parse/fit error. Negative rewards signal the
        sampling loop below to regenerate.
    """
    try:
        expr = Expression(expression_str)

        if not expr.is_valid_on_dataset(X):
            # Expression cannot be evaluated on this data (e.g. bad variables).
            return -1.0

        score = expr.fit_constants(X, y)
        # Bug fix: previously the non-finite fallback (-1.0) sat INSIDE
        # max(0.1, ...), so a NaN/Inf score was silently clamped to 0.1
        # instead of being rejected. Treat non-finite scores as invalid.
        if not np.isfinite(score):
            return -1.0

        # Floor valid scores at 0.1 so accepted samples keep a positive reward.
        return max(0.1, float(score))
    except Exception:
        # Best-effort by design: any parse/fit failure counts as invalid.
        return -1.0
|
|
| |
def extract_expression(response: str) -> str:
    """Extract the expression text that follows the first "expr: " marker.

    The text is truncated at the first "<|endoftext|>" token and stripped of
    surrounding whitespace.

    Bug fix: the original unconditionally indexed `split("expr: ")[1]` and
    raised IndexError when the marker was absent; now the stripped response is
    returned unchanged in that case.
    """
    parts = response.split("expr: ")
    if len(parts) < 2:
        # No marker present — fall back to the raw (stripped) response.
        return response.strip()
    return parts[1].split("<|endoftext|>")[0].strip()
|
|
| |
| |
# Load the SRSD-Feynman "easy" training split used for reward computation.
reg = RegressionDataset('./data/evaluate/srsd-feynman_easy/train', 'feynman-i.18.16.txt', delimiter=' ')
X, y = reg.get_numpy()  # presumably X: feature matrix, y: targets — TODO confirm shapes


# Hugging Face repo ids for the policy model and tokenizer.
# NOTE(review): BASE_MODEL and LORA_REPO point at the same merged checkpoint
# and no adapter is ever loaded below — confirm LORA_REPO is still needed.
BASE_MODEL = "augustocsc/Se124M100KInfPrompt_EOS_Merged"
LORA_REPO = "augustocsc/Se124M100KInfPrompt_EOS_Merged"
TOKENIZER_REPO = LORA_REPO
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
# Policy model (causal LM + value head, required by TRL's PPO) and a second
# copy used as the reference model for PPOTrainer's KL penalty.
model = AutoModelForCausalLMWithValueHead.from_pretrained(BASE_MODEL)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(BASE_MODEL)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)


# Place both models on GPU when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
ref_model = ref_model.to(device)
|
|
|
|
|
|
import os  # NOTE(review): redundant — `os` is already imported at the top of the file
# Debug aid: make CUDA kernel errors surface at the failing call site
# (serializes GPU launches, so this slows training — remove for real runs).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
|
|
|
| import numpy as np |
|
|
def get_safe_functions(X, functions=('log', 'sqrt', 'asin', 'tan', 'abs', 'exp', 'sin', 'cos')):
    """
    Return the subset of `functions` that is numerically safe to apply
    element-wise to every value in X (no domain errors / NaN / Inf).

    Parameters:
        X: np.ndarray of shape (n_samples, n_features)
        functions: iterable of function names to check. The default is a
            tuple rather than a list to avoid the mutable-default-argument
            pitfall (behavior is unchanged).

    Returns:
        List of safe function names, preserving the order of `functions`.
        Unknown names are silently skipped, as before.
    """
    # Functions defined on all of R — always safe.
    always_safe = {'sin', 'cos', 'exp', 'abs'}
    # Domain predicates for the restricted functions.
    domain_ok = {
        'log': lambda a: bool(np.all(a > 0)),                   # strictly positive
        'sqrt': lambda a: bool(np.all(a >= 0)),                 # non-negative
        'asin': lambda a: bool(np.all((a >= -1) & (a <= 1))),   # within [-1, 1]
        # tan diverges where cos(x) ~ 0 (odd multiples of pi/2).
        'tan': lambda a: bool(np.all(np.abs(np.cos(a)) > 1e-6)),
    }

    safe_functions = []
    for fn in functions:
        if fn in always_safe:
            safe_functions.append(fn)
        elif fn in domain_ok and domain_ok[fn](X):
            safe_functions.append(fn)

    return safe_functions
|
|
|
|
# Function names numerically safe on this dataset.
# NOTE(review): computed but never referenced later in this file — confirm
# whether the prompt's "oper:" line was meant to be built from it.
safe_functions = get_safe_functions(X)
|
|
| from tqdm import tqdm |
|
|
# PPO hyperparameters: 1024 samples per step, optimized in mini-batches of 64
# over 4 PPO epochs per batch.
ppo_config = PPOConfig(
    model_name=None,
    learning_rate=1e-5,
    batch_size=1024,
    mini_batch_size=64,
    gradient_accumulation_steps=1,
    ppo_epochs=4,
    log_with=None,  # no experiment tracker (e.g. wandb) configured
    optimize_cuda_cache=True,
)
|
|
| |
# PPO trainer wiring together the policy, the frozen reference model, and the
# tokenizer. No dataset/dataloader is passed; batches are assembled manually
# in the loop below.
ppo_trainer = PPOTrainer(
    config=ppo_config,
    tokenizer=tokenizer,
    model=model,
    ref_model=ref_model,

)
|
|
| |
| |
| |
| |
| |
| |
|
|
|
|
|
|
# Prompt template: lists the available variables (one per feature column of
# X), the allowed operators, and the constant placeholder C; the model is
# expected to complete the text after "expr:".
PROMPT = f"""
vars: {", ".join([f"x_{i+1}" for i in range(X.shape[1])])}
oper: *, sin
cons: C
expr:"""


# NOTE(review): built but never handed to PPOTrainer or iterated below —
# appears unused.
dummy_dataset = Dataset.from_dict({
    "prompt": [PROMPT] * 1024
})
|
|
| |
# Timestamp used to name this run's output file.
import datetime
import json
import subprocess
now = datetime.datetime.now()
timestamp = now.strftime("%Y-%m-%d_%H-%M")


# Re-derive the device from the model parameters (same value as `device` above).
device = next(model.parameters()).device


# Tokenize one identical copy of PROMPT per batch slot.
inputs = tokenizer([PROMPT] * ppo_config.batch_size, return_tensors="pt", padding=True)


# Move every tokenizer output tensor onto the model's device.
inputs = {key: value.to(device) for key, value in inputs.items()}


# Clear the terminal before the progress bars start drawing.
subprocess.run("clear", shell=True)

# Per-sample query tensors in the shape PPOTrainer.step expects, plus
# accumulators that collect every response/reward across all epochs.
queries = [inputs["input_ids"][i] for i in range(inputs["input_ids"].size(0))]
all_rewards = []
all_responses = []
# Main PPO loop: sample one expression per batch slot, score each against the
# regression data, log the samples, and run one PPO update per epoch. Stops
# early once any sampled expression reaches a reward of 0.9.
for epoch in tqdm(range(10), desc="Training Epochs"):
    responses = []
    constants = []  # NOTE(review): never appended to or read — appears vestigial
    rewards = []
    for i in tqdm(range(ppo_config.batch_size), desc="Batch Progress", leave=False):
        try:
            # Select the i-th prompt (all slots hold identical copies of PROMPT).
            input_ids = inputs["input_ids"][i].unsqueeze(0)
            attention_mask = inputs["attention_mask"][i].unsqueeze(0)


            # Sanity check: every token id must fall inside the vocabulary.
            # NOTE: `assert` is stripped under `python -O`; raise explicitly
            # if this guard must always run.
            assert torch.all((input_ids >= 0) & (input_ids < model.config.vocab_size)), \
                f"Token inválido detectado: max={input_ids.max().item()}, vocab_size={model.config.vocab_size}"


            # Re-assigned every iteration; harmless but could be hoisted.
            model.config.pad_token_id = tokenizer.pad_token_id
            # Rejection sampling: regenerate until compute_reward accepts the
            # expression (reward >= 0). WARNING: no retry cap — this can spin
            # indefinitely if the model rarely emits valid expressions.
            reward = -1
            while reward < 0:
                # NOTE(review): generation uses model.generate directly rather
                # than ppo_trainer.generate — confirm this is intentional.
                output = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=30,
                    do_sample=True,
                    top_k=50,
                    top_p=0.95,
                    temperature=0.5,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    return_dict_in_generate=True,
                    output_scores=False
                )
                # Drop the prompt tokens; decode only the generated tail.
                response_ids = output.sequences[0][input_ids.shape[1]:]
                response = tokenizer.decode(response_ids, skip_special_tokens=True)


                reward = compute_reward(response)




        except Exception as e:
            # Dump diagnostics, then re-raise (bare `raise` would also keep
            # the original traceback).
            print(f"Error at index {i}: {e}")
            print(f"Input IDs: {input_ids}")
            print(f"Token range: min={input_ids.min()}, max={input_ids.max()}, vocab_size={model.config.vocab_size}")
            raise e


        responses.append(response)
        rewards.append(reward)
    all_responses.extend(responses)
    all_rewards.extend(rewards)


    # Persist this epoch's samples under ../../output relative to this file.
    output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../output"))
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"responses_{timestamp}.txt")


    # First write of the run: prepend model/PPO configuration as a header.
    if not os.path.exists(output_file):
        with open(output_file, "w") as f:
            f.write("# Model config:\n")
            f.write(json.dumps(model.config.to_dict(), indent=2))
            f.write("\n# PPO config:\n")
            f.write(json.dumps(ppo_config.__dict__, indent=2))
            f.write("\n# Responses and rewards:\n")


    # Append one JSON line per (expression, reward) pair.
    with open(output_file, "a") as f:
        for expr_str, rew in zip(responses, rewards):
            f.write(json.dumps({"expression": expr_str, "reward": float(rew)}) + "\n")

    # Early stop BEFORE the PPO step: a near-perfect expression was sampled,
    # so this epoch's batch is logged but never used for an update.
    if any(r >= 0.9 for r in rewards):
        print("Reward >= 0.9 found, stopping training.")
        break


    import concurrent.futures  # NOTE(review): imported but never used below


    # Wrap rewards as scalar float tensors on the model's device, the format
    # PPOTrainer.step expects.
    rewards = [torch.tensor(reward, dtype=torch.float32, device=device) for reward in rewards]

    # Re-tokenize the decoded responses back into token-id tensors.
    # NOTE(review): decoding with skip_special_tokens and re-encoding may not
    # round-trip to the exact generated ids — confirm this is acceptable.
    responses = [tokenizer(response, return_tensors="pt", padding=True)["input_ids"].squeeze(0).to(device) for response in responses]


    # One PPO optimization step over the whole batch.
    ppo_trainer.step(queries, responses, rewards)


    # Report the epoch's top-k expressions by reward.
    top_k = 3
    sorted_responses = sorted(zip(responses, rewards), key=lambda x: -x[1])
    print(f"\nEpoch {epoch + 1} melhores expressões:")
    for i, (expr, score) in enumerate(sorted_responses[:top_k]):
        print(f"{i+1}. {tokenizer.decode(expr, skip_special_tokens=True)} -> R² = {score:.4f}")

    avg_reward = torch.mean(torch.stack(rewards)).item()
    median_reward = torch.median(torch.stack(rewards)).item()
    # NOTE(review): the rejection loop above only accepts rewards >= 0, so
    # this invalid counter should always be 0 here — looks vestigial.
    count_invalid = sum(1 for r in rewards if r == -1.0)
    print(f"Average Reward: {avg_reward:.4f}, Median Reward: {median_reward:.4f}, Invalid Count: {count_invalid}")
|
|
|
|