# TroyHow's picture
# Upload 6 files
# e029b17 verified
# Environment configuration: HotpotQA retrieval wrapped in a QA environment.
# NOTE(review): nesting reconstructed from the Hydra interpolation paths used
# below (e.g. ${envs.positions_processor_dict...}); original indentation was lost.
envs:
  # Hard cap on environment steps per episode.
  max_steps: 2
  task: hotpotqa
  data_path: /home/ai-faculty/workspace/datasets/hotpotqa
  sort_by_index: false
  # Key into positions_processor_dict below, resolved via nested interpolation.
  positions_processor: none
  positions_processor_dict:
    # NOTE(review): 'none' aliases the absolute processor (same _target_ as
    # 'absolute') — confirm this is intentional.
    none:
      _target_: envs.text_env.AbsolutePositionProcessor
    absolute:
      _target_: envs.text_env.AbsolutePositionProcessor
    random:
      _target_: envs.text_env.RandomPositionProcessor
      max_chunks_count: 2000
    relative:
      _target_: envs.text_env.RelativePositionProcessor
      step_size: 20
  train_dataset:
    _target_: envs.RetrievalHotPotQA
    path: ${envs.data_path}
    split: train
    seed: ${seed}
  test_dataset:
    _target_: envs.RetrievalHotPotQA
    path: ${envs.data_path}
    split: eval
    seed: ${seed}
  # Training environment; feedback model is selected by ${feedback.type}.
  env:
    _target_: envs.qa_env.QAEnv
    max_steps: ${envs.max_steps}
    action_embed_length: ${max_action_length}
    max_action_length_in_memory: ${max_action_length_in_memory}
    separator: ' [SEP] '
    sort_by_index: ${envs.sort_by_index}
    positions_processor: ${envs.positions_processor_dict.${envs.positions_processor}}
    feedback_model: ${feedback.feedback_dict.${feedback.type}}
    dataset:
      _target_: envs.QADatasetAdapter
      dataset: ${envs.train_dataset}
  # Evaluation environment: mirrors env, but pins a ground-truth feedback model
  # with penalize_extra_steps: false and uses the eval split.
  test_env:
    _target_: envs.qa_env.QAEnv
    max_steps: ${envs.env.max_steps}
    action_embed_length: ${max_action_length}
    max_action_length_in_memory: ${max_action_length_in_memory}
    separator: ${envs.env.separator}
    sort_by_index: ${envs.sort_by_index}
    positions_processor: ${envs.env.positions_processor}
    feedback_model:
      _target_: rl.feedback.GroundTruthFeedback
      penalize_extra_steps: false
    dataset:
      _target_: envs.QADatasetAdapter
      dataset: ${envs.test_dataset}
# Feedback / reward configuration.
# NOTE(review): nesting reconstructed from the ${feedback.*} interpolation
# paths; original indentation was lost.
feedback:
  # Key into feedback_dict below; 'gt' selects ground_truth.
  type: gt
  never_terminate: true
  ground_truth:
    _target_: rl.feedback.GroundTruthFeedback
    penalize_extra_steps: true
    never_terminate: true
  # LLM-judged exact-match feedback (selected as 'babilong_em' in feedback_dict).
  exact_match:
    _target_: rl.feedback.AnswerMetricFeedback
    llm_generator:
      _target_: rl.feedback.LLMGenerator
      use_api: ${feedback.use_api}
      model_name: ${feedback.model}
      sampling_params: ${feedback.sampling_params}
      vllm_config: ${feedback.vllm_config}
      api_base_url: ${feedback.api_base_url}
      api_key: ${feedback.api_key}
      max_at_same_time: ${feedback.max_at_same_time}
      prepare_messages_func: ${feedback.prompt_formatter}
      disable_thinking: false
    metric: ${feedback.metric}
    # NOTE(review): completion_threshold interpolates reward_scaling (1.0) —
    # confirm this is intended and not a copy-paste of the line below.
    completion_threshold: ${feedback.reward_scaling}
    reward_scaling: ${feedback.reward_scaling}
    never_terminate: ${feedback.never_terminate}
  feedback_dict:
    gt: ${feedback.ground_truth}
    babilong_em: ${feedback.exact_match}
  model: Qwen/Qwen3-4B
  reward_scaling: 1.0
  sampling_params:
    max_tokens: 32
    temperature: 0.0
    top_p: 0.95
  vllm_config:
    gpu_memory_utilization: 0.4
    max_model_len: 2048
    dtype: bfloat16
    quantization: null
    tensor_parallel_size: 1
    trust_remote_code: true
  use_api: true
  api_base_url: http://localhost:10001/v1
  # NOTE(review): placeholder API key committed to the config — move to an env
  # var / secret store if this ever carries a real credential.
  api_key: keykey
  max_at_same_time: 20
  metric:
    _target_: prompts_and_metrics.babilong.BabilongExactMatch
  prompt_formatter:
    _target_: prompts_and_metrics.babilong.BabilongPromptFormatter
    babi_task: ${envs.task}
# Algorithm configuration (PQN with BERT-based state/action embedders).
# NOTE(review): nesting reconstructed from ${algo.*} interpolation paths;
# original indentation was lost. The placement of hyperparams/optimizer/
# scheduler under pqn follows the original key order — confirm against rl.pqn.PQN.
algo:
  model:
    model_name: intfloat/multilingual-e5-large
    revision: main
    use_fast_tokenizer: true
    # Shared dimensions for the predictor heads below.
    predictor:
      num_layers: 24
      input_dim: 1024
      hidden_dim: 512
      output_dim: 1
      max_seq_len: 5000
      interpolate_factor: 1
  action_model:
    _target_: rl.bert_predictor.BertPredictor
    bert:
      _target_: transformers.AutoModel.from_pretrained
      pretrained_model_name_or_path: ${algo.model.model_name}
      revision: ${algo.model.revision}
      num_hidden_layers: ${algo.model.predictor.num_layers}
    tokenizer:
      _target_: transformers.AutoTokenizer.from_pretrained
      pretrained_model_name_or_path: ${algo.model.model_name}
      revision: ${algo.model.revision}
      use_fast: ${algo.model.use_fast_tokenizer}
    model_dim: ${algo.model.predictor.input_dim}
    output_size: ${algo.model.predictor.hidden_dim}
    n_output: ${algo.model.predictor.output_dim}
  # Keyed by ${envs.positions_processor} (none/absolute/random/relative).
  action_embed_dict:
    absolute:
      _target_: rl.bert_predictor.EmbedderWithAbsoluteEncoding
      model: ${algo.action_model}
      max_seq_len: ${algo.model.predictor.max_seq_len}
    # NOTE(review): 'random' also uses EmbedderWithAbsoluteEncoding (same
    # _target_ as 'absolute', plus interpolate_factor) — confirm intentional.
    random:
      _target_: rl.bert_predictor.EmbedderWithAbsoluteEncoding
      model: ${algo.action_model}
      max_seq_len: ${algo.model.predictor.max_seq_len}
      interpolate_factor: ${algo.model.predictor.interpolate_factor}
    relative:
      _target_: rl.bert_predictor.EmbedderWithRelativeEncoding
      model: ${algo.action_model}
      max_seq_len: 1000
    none:
      _target_: rl.bert_predictor.EmbedderNone
      model: ${algo.action_model}
  pqn:
    _target_: rl.pqn.PQN
    # Online state embedder (structurally identical to action_model).
    state_embed:
      _target_: rl.bert_predictor.BertPredictor
      bert:
        _target_: transformers.AutoModel.from_pretrained
        pretrained_model_name_or_path: ${algo.model.model_name}
        revision: ${algo.model.revision}
        num_hidden_layers: ${algo.model.predictor.num_layers}
      tokenizer:
        _target_: transformers.AutoTokenizer.from_pretrained
        pretrained_model_name_or_path: ${algo.model.model_name}
        revision: ${algo.model.revision}
        use_fast: ${algo.model.use_fast_tokenizer}
      model_dim: ${algo.model.predictor.input_dim}
      output_size: ${algo.model.predictor.hidden_dim}
      n_output: ${algo.model.predictor.output_dim}
    action_embed: ${algo.action_embed_dict.${envs.positions_processor}}
    # Target-network copy of the state embedder.
    state_embed_target:
      _target_: rl.bert_predictor.BertPredictor
      bert:
        _target_: transformers.AutoModel.from_pretrained
        pretrained_model_name_or_path: ${algo.model.model_name}
        revision: ${algo.model.revision}
        num_hidden_layers: ${algo.model.predictor.num_layers}
      tokenizer:
        _target_: transformers.AutoTokenizer.from_pretrained
        pretrained_model_name_or_path: ${algo.model.model_name}
        revision: ${algo.model.revision}
        use_fast: ${algo.model.use_fast_tokenizer}
      model_dim: ${algo.model.predictor.input_dim}
      output_size: ${algo.model.predictor.hidden_dim}
      n_output: ${algo.model.predictor.output_dim}
    action_embed_target: ${algo.action_embed_dict.${envs.positions_processor}}
    hyperparams:
      gamma: 0.99
      alpha: 0.05
      # Capitalized key (presumably to avoid the Python 'lambda' keyword) —
      # kept as-is; renaming would break the consumer.
      Lambda: 0.5
      tau: 0.02
      max_grad_norm: 2.0
      accumulate_grads: ${accumulate_grads}
      action_embed_length: ${max_action_length}
      # NOTE(review): hard-coded 128 here, while the top-level
      # max_action_length_in_memory is 64 (used by envs.env) — confirm the
      # mismatch is intentional rather than a stale literal.
      max_action_length_in_memory: 128
    optimizer:
      _target_: torch.optim.AdamW
      lr: 1.5e-05
      betas:
        - 0.9
        - 0.98
      eps: 1.0e-06
      weight_decay: 0.0005
    scheduler:
      _target_: rl.optim.CosineScheduler
      total: ${steps_count}
      ratio: 0.1
      warmup: 1000
# Logging configuration.
logger:
  log_dir: runs/Mar25_15-42-26_QRAG_hotpotqa
  tensorboard:
    _target_: torch.utils.tensorboard.SummaryWriter
    comment: _QRAG_${envs.task}
    # NOTE(review): hard-coded timestamped path duplicates logger.log_dir —
    # consider ${logger.log_dir}/tb_logs/ so the two cannot drift apart.
    log_dir: runs/Mar25_15-42-26_QRAG_hotpotqa/tb_logs/
# Global run settings, referenced from the sections above via ${...}
# interpolation (seed, steps_count, accumulate_grads, max_action_length,
# max_action_length_in_memory).
seed: 42
device: cuda:0
# presumably the number of steps collected before learning starts — TODO
# confirm against the trainer
learning_start: 200
steps_count: 10000
batch_size: 12
accumulate_grads: 8
eval_interval: 50
eval_episodes: 300
envs_parallel: 1
# Interpolated into envs.env/test_env as action_embed_length.
max_action_length: 220
# Interpolated into envs.env/test_env; note algo.pqn's hyperparams carry a
# different literal (128) for the same key name.
max_action_length_in_memory: 64