# Experiment configuration expressed as plain nested ``dict`` literals:
# two named dataset tables, then the data, model and training argument groups.
#
# NOTE(review): these top-level names are presumably read by a config loader
# elsewhere in the project -- confirm before renaming or removing any of them.

# Named dataset specs for the test side.  Every spec carries a ``filename``
# (.jsonl), an ``image_folder``, a ``template_file`` and a dataset ``type``.
# NOTE(review): ``reg`` points at ``train_OCR.jsonl``, the same file that
# DEFAULT_TRAIN_DATASET['reg'] uses -- confirm this is intended for testing.
DEFAULT_TEST_DATASET = dict(
    flickr=dict(
        filename='./reactiondata/real_test.jsonl',
        image_folder='./reaction_image',
        template_file='./config/_base_/dataset/template/reaction.json',
        type='FlickrDataset',
    ),
    reg=dict(
        filename='./reactiondata/train_OCR.jsonl',
        image_folder='./reaction_image_OCR',
        template_file='./config/_base_/dataset/template/OCR.json',
        type='REGDataset',
    ),
)

# Named dataset specs for the training side (same four fields per spec).
DEFAULT_TRAIN_DATASET = dict(
    flickr=dict(
        filename='./reactiondata/reaction_real_structed.jsonl',
        image_folder='./reaction_image',
        template_file='./config/_base_/dataset/template/reaction.json',
        type='FlickrDataset',
    ),
    reg=dict(
        filename='./reactiondata/train_OCR.jsonl',
        image_folder='./reaction_image_OCR',
        template_file='./config/_base_/dataset/template/OCR.json',
        type='REGDataset',
    ),
)

# Data pipeline settings: collation, generation, and the train/validation
# dataset compositions.  No test set and no metric are configured here.
data_args = dict(
    collator_kwargs=dict(max_length=1024, padding=True),
    compute_metric=None,
    gen_kwargs=dict(max_new_tokens=1024, num_beams=1),
    test=None,
    train=dict(
        # The two specs below repeat DEFAULT_TRAIN_DATASET['reg'] and
        # DEFAULT_TRAIN_DATASET['flickr'] verbatim, in that order.
        cfgs=[
            dict(
                filename='./reactiondata/train_OCR.jsonl',
                image_folder='./reaction_image_OCR',
                template_file='./config/_base_/dataset/template/OCR.json',
                type='REGDataset',
            ),
            dict(
                filename='./reactiondata/reaction_real_structed.jsonl',
                image_folder='./reaction_image',
                template_file='./config/_base_/dataset/template/reaction.json',
                type='FlickrDataset',
            ),
        ],
        # NOTE(review): presumably one sampling weight per entry of ``cfgs``
        # in the same order; if so the REGDataset (OCR) entry has weight 0.0
        # and only the FlickrDataset entry is drawn from -- confirm.
        probabilities=[
            0.0,
            1,
        ],
        seed=None,
        stopping_strategy='first_exhausted',
        # NOTE(review): 'InterleaveDateset' is a runtime string (presumably a
        # registry key).  The spelling is kept exactly as found; do not
        # "correct" it without checking where the name is resolved.
        type='InterleaveDateset',
    ),
    # NOTE(review): a validation set is configured, but training_args below
    # sets do_eval=False and evaluation_strategy='no' -- confirm it is used.
    validation=dict(
        cfgs=[
            dict(
                filename='./reactiondata/real_test.jsonl',
                image_folder='./reaction_image',
                template_file='./config/_base_/dataset/template/reaction.json',
                type='FlickrDataset',
            ),
        ],
        type='ConcatDatasetWithShuffle',
    ),
)

# Model construction and preprocessing settings.
model_args = dict(
    cache_dir=None,
    conv_args=dict(
        conv_template='vicuna_v1.1',
        # Same value as ``model_max_length`` below (2048).
        tokenize_kwargs=dict(truncation_size=2048),
    ),
    freeze_backbone=False,
    freeze_mm_mlp_adapter=False,
    gen_kwargs_set_bos_token_id=True,
    gen_kwargs_set_eos_token_id=True,
    gen_kwargs_set_pad_token_id=True,
    image_token_len=300,
    mm_use_im_start_end=True,
    mm_vision_select_layer=-2,
    model_max_length=2048,
    # Weights are loaded from this directory; note that it differs from
    # training_args['output_dir'] ('./exp/reaction_4.2.2-large').
    model_name_or_path='./exp/reaction_4.2.1',
    pretrain_mm_mlp_adapter=None,
    # One processor ``type`` per input kind (conv / image / target / text).
    process_func_args=dict(
        conv=dict(type='ShikraConvProcess'),
        image=dict(type='ShikraImageProcessor'),
        target=dict(type='BoxFormatProcess'),
        text=dict(type='ShikraTextProcess'),
    ),
    sep_image_conv_front=False,
    target_processor=dict(boxes=dict(type='PlainBoxFormatter')),
    tune_mm_mlp_adapter=False,
    type='shikra',
    version='v1',
    vision_tower='SenseTime/deformable-detr',
)

# Trainer settings.
# NOTE(review): the key names look like Hugging Face ``TrainingArguments`` /
# ``Seq2SeqTrainingArguments`` fields -- confirm against the consuming code.
training_args = dict(
    bf16=True,
    dataloader_num_workers=4,
    # Train-only run: evaluation and prediction are both switched off.
    do_eval=False,
    do_predict=False,
    do_train=True,
    evaluation_strategy='no',
    fsdp='full_shard auto_wrap',
    fsdp_transformer_layer_cls_to_wrap='LlamaDecoderLayer',
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=2e-05,
    logging_steps=10,
    lr_scheduler_type='cosine',
    num_train_epochs=50,
    output_dir='./exp/reaction_4.2.2-large',
    overwrite_output_dir=False,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    predict_with_generate=True,
    remove_unused_columns=False,
    report_to='none',
    # Step-based checkpointing every 10000 steps, with a limit of 1 kept.
    save_steps=10000,
    save_strategy='steps',
    save_total_limit=1,
    seed=42,
    tf32=True,
    warmup_ratio=0.03,
    weight_decay=0.05,
)
|