Raullen committed on Jan 29

Commit

4fc4792

verified ·

1 Parent(s): 178635f

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +3 -0
README.md +81 -93
adapter_config.json +4 -4
adapter_model.safetensors +1 -1
all_results.json +10 -6
checkpoint-150/README.md +208 -0
checkpoint-150/adapter_config.json +42 -0
checkpoint-150/adapter_model.safetensors +3 -0
checkpoint-150/added_tokens.json +16 -0
checkpoint-150/chat_template.jinja +7 -0
checkpoint-150/merges.txt +0 -0
checkpoint-150/optimizer.pt +3 -0
checkpoint-150/preprocessor_config.json +39 -0
checkpoint-150/rng_state.pth +3 -0
checkpoint-150/scheduler.pt +3 -0
checkpoint-150/special_tokens_map.json +31 -0
checkpoint-150/tokenizer.json +3 -0
checkpoint-150/tokenizer_config.json +144 -0
checkpoint-150/trainer_state.json +292 -0
checkpoint-150/training_args.bin +3 -0
checkpoint-150/video_preprocessor_config.json +43 -0
checkpoint-150/vocab.json +0 -0
checkpoint-550/README.md +208 -0
checkpoint-550/adapter_config.json +42 -0
checkpoint-550/adapter_model.safetensors +3 -0
checkpoint-550/added_tokens.json +16 -0
checkpoint-550/chat_template.jinja +7 -0
checkpoint-550/merges.txt +0 -0
checkpoint-550/optimizer.pt +3 -0
checkpoint-550/preprocessor_config.json +39 -0
checkpoint-550/rng_state.pth +3 -0
checkpoint-550/scheduler.pt +3 -0
checkpoint-550/special_tokens_map.json +31 -0
checkpoint-550/tokenizer.json +3 -0
checkpoint-550/tokenizer_config.json +144 -0
checkpoint-550/trainer_state.json +980 -0
checkpoint-550/training_args.bin +3 -0
checkpoint-550/video_preprocessor_config.json +43 -0
checkpoint-550/vocab.json +0 -0
checkpoint-600/README.md +208 -0
checkpoint-600/adapter_config.json +42 -0
checkpoint-600/adapter_model.safetensors +3 -0
checkpoint-600/added_tokens.json +16 -0
checkpoint-600/chat_template.jinja +7 -0
checkpoint-600/merges.txt +0 -0
checkpoint-600/optimizer.pt +3 -0
checkpoint-600/preprocessor_config.json +39 -0
checkpoint-600/rng_state.pth +3 -0
checkpoint-600/scheduler.pt +3 -0
checkpoint-600/special_tokens_map.json +31 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-550/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,106 +1,94 @@
 ---
 license: mit
 base_model: Qwen/Qwen2-VL-2B-Instruct
-library_name: peft
 tags:
-  - video-language-model
-  - pet-detection
-  - lora
-  - qwen2-vl
-datasets:
-  - Raullen/petvlm-data
 pipeline_tag: video-text-to-text
 ---
-# PetVLM - Sherlock Pet
-A LoRA adapter for Qwen2-VL-2B-Instruct, fine-tuned to detect naughty vs nice pet behavior with a sarcastic pet detective persona.
-## Model Description
-- **Base Model**: [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)
-- **Fine-tuning**: LoRA (rank=16, alpha=32, dropout=0.05)
-- **Training Data**: [Raullen/petvlm-data](https://huggingface.co/datasets/Raullen/petvlm-data) (40 labeled pet videos)
-## Output Format
-- **ALERT!** - Naughty behavior detected (climbing, scratching furniture, stealing food, etc.)
 - **All Clear** - Good behavior (sleeping, eating properly, playing nicely)
-## Usage
-```python
-import torch
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-from peft import PeftModel
-from qwen_vl_utils import process_vision_info
-# Load base model
-base_model = Qwen2VLForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen2-VL-2B-Instruct",
-    torch_dtype=torch.bfloat16,
-    device_map="auto"
-)
-# Load LoRA adapter
-model = PeftModel.from_pretrained(base_model, "Raullen/petvlm")
-model.eval()
-processor = AutoProcessor.from_pretrained("Raullen/petvlm")
-# Analyze a video
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "video", "video": "path/to/video.mp4", "nframes": 8},
-            {"type": "text", "text": "What is the pet doing?"}
-        ]
-    }
-]
-text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-image_inputs, video_inputs = process_vision_info(messages)
-inputs = processor(
-    text=[text],
-    images=image_inputs,
-    videos=video_inputs,
-    padding=True,
-    return_tensors="pt"
-).to(model.device)
-with torch.no_grad():
-    output = model.generate(**inputs, max_new_tokens=256)
-response = processor.batch_decode(output[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
-print(response)
-```
-## Training Parameters
-| Parameter | Value |
-|-----------|-------|
-| LoRA rank | 16 |
-| LoRA alpha | 32 |
-| LoRA dropout | 0.05 |
-| Learning rate | 2e-4 |
-| Epochs | 30 |
-| Batch size | 1 |
-## Sample Outputs
-**Naughty:**
-> "ALERT! The acrobatic menace has achieved full Spider-Cat mode, scaling kitchen cabinets like it's an Olympic sport. Recommend securing all elevated surfaces immediately."
-**Nice:**
-> "All Clear. The suspect is peacefully napping by the fireplace like a perfect little angel. Don't be fooled though - this is just the calm before the zoomies."
-## Links
-- **Code**: [github.com/raullenchai/petvlm](https://github.com/raullenchai/petvlm)
-- **Dataset**: [Raullen/petvlm-data](https://huggingface.co/datasets/Raullen/petvlm-data)
-## License
-MIT

 ---
+library_name: peft
 license: mit
 base_model: Qwen/Qwen2-VL-2B-Instruct
 tags:
+- llama-factory
+- lora
+- transformers
+- video-language-model
+- pet-detection
 pipeline_tag: video-text-to-text
+model-index:
+- name: petvlm-v1.1
+  results: []
 ---
+# PetVLM v1.1 - Sherlock Pet
+A video-language model fine-tuned to detect naughty vs nice pet behavior with a sarcastic pet detective persona.
+**Accuracy: 87.5%** on held-out test set (7/8 correct)
+## Model Description
+PetVLM analyzes pet videos and classifies behavior:
+- **ALERT!** - Naughty behavior (climbing furniture, scratching, stealing food)
 - **All Clear** - Good behavior (sleeping, eating properly, playing nicely)
+## Training Data
+- **Dataset Size**: 175 videos (cleaned and balanced)
+- **Label Balance**: 51% ALERT / 49% All Clear
+- **Best Checkpoint**: step 150 (epoch ≈3.76, lowest validation loss 1.6712)
+- **Training Time**: ~13 minutes on RTX 4090
+## Intended Uses
+- Home pet monitoring
+- Pet behavior analysis
+- Entertainment (sarcastic detective persona)
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 1
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 4
+- optimizer: AdamW (OptimizerNames.ADAMW_TORCH_FUSED) with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 15.0
+### Training results
+| Training Loss | Epoch   | Step | Validation Loss |
+|:-------------:|:-------:|:----:|:---------------:|
+| 3.2859        | 0.6369  | 25   | 2.8292          |
+| 2.0143        | 1.2548  | 50   | 1.9940          |
+| 1.6829        | 1.8917  | 75   | 1.7856          |
+| 1.4774        | 2.5096  | 100  | 1.7516          |
+| 1.1908        | 3.1274  | 125  | 1.6817          |
+| 1.2046        | 3.7643  | 150  | 1.6712          |
+| 0.8012        | 4.3822  | 175  | 1.7618          |
+| 0.9595        | 5.0     | 200  | 1.7216          |
+| 0.5115        | 5.6369  | 225  | 1.8212          |
+| 0.4666        | 6.2548  | 250  | 2.1777          |
+| 0.4082        | 6.8917  | 275  | 2.0293          |
+| 0.2262        | 7.5096  | 300  | 2.3396          |
+| 0.1774        | 8.1274  | 325  | 2.3459          |
+| 0.1512        | 8.7643  | 350  | 2.4745          |
+| 0.0719        | 9.3822  | 375  | 2.7719          |
+| 0.0741        | 10.0    | 400  | 2.7706          |
+| 0.0679        | 10.6369 | 425  | 2.8739          |
+| 0.036         | 11.2548 | 450  | 3.0192          |
+| 0.03          | 11.8917 | 475  | 3.0430          |
+| 0.0226        | 12.5096 | 500  | 3.1513          |
+| 0.0155        | 13.1274 | 525  | 3.1610          |
+| 0.015         | 13.7643 | 550  | 3.1778          |
+| 0.0169        | 14.3822 | 575  | 3.2098          |
+| 0.0223        | 15.0    | 600  | 3.2078          |
+### Framework versions
+- PEFT 0.17.1
+- Transformers 4.57.1
+- Pytorch 2.10.0+cu128
+- Datasets 4.0.0
+- Tokenizers 0.22.2

adapter_config.json CHANGED Viewed

@@ -27,11 +27,11 @@
   "target_modules": [
     "q_proj",
     "down_proj",
-    "up_proj",
-    "o_proj",
-    "gate_proj",
     "k_proj",
-    "v_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "target_modules": [
     "q_proj",
     "down_proj",
     "k_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f1dd27a313df49d8699dc96b4aa1f47241b6125d126ab2cc20d4d0da9b451aa7
 size 73916992

 version https://git-lfs.github.com/spec/v1
+oid sha256:6e6ca2140cc4e5fe0e7e52a899e43c4d0408c6a76746753531b402c24b163003
 size 73916992

all_results.json CHANGED Viewed

@@ -1,8 +1,12 @@
 {
-    "epoch": 30.0,
-    "total_flos": 5518966123560960.0,
-    "train_loss": 0.2944723299946054,
-    "train_runtime": 391.3174,
-    "train_samples_per_second": 3.067,
-    "train_steps_per_second": 3.067
 }

 {
+    "epoch": 15.0,
+    "eval_loss": 1.6712156534194946,
+    "eval_runtime": 1.0633,
+    "eval_samples_per_second": 16.928,
+    "eval_steps_per_second": 2.821,
+    "total_flos": 9073846488268800.0,
+    "train_loss": 0.6906163080533345,
+    "train_runtime": 779.0869,
+    "train_samples_per_second": 3.023,
+    "train_steps_per_second": 0.77
 }

checkpoint-150/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: Qwen/Qwen2-VL-2B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2-VL-2B-Instruct
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

checkpoint-150/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-150/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e6ca2140cc4e5fe0e7e52a899e43c4d0408c6a76746753531b402c24b163003
+size 73916992

checkpoint-150/added_tokens.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-150/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-150/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-150/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a642d17c1c73fad815087952a478c772878909c210fd4914c986672a0bfa548
+size 148053627

checkpoint-150/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2
+}

checkpoint-150/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd6e6d863e9252d057b8bff34caf1524e75fc35623783624e2dd9cd68bc1abc9
+size 14645

checkpoint-150/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3efc26ade01a6604f9a215d672abb2ca177306c58c8b2a458d5dfc6e30502e39
+size 1465

checkpoint-150/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-150/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
+size 11420371

checkpoint-150/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,144 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "Qwen2VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-150/trainer_state.json ADDED Viewed

	@@ -0,0 +1,292 @@

+{
+  "best_global_step": 150,
+  "best_metric": 1.6712156534194946,
+  "best_model_checkpoint": "/root/Sherlock-Pet/output/sherlock_pet_v2/checkpoint-150",
+  "epoch": 3.7643312101910826,
+  "eval_steps": 25,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.12738853503184713,
+      "grad_norm": 2.745800018310547,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 3.7984,
+      "step": 5
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "grad_norm": 2.8450679779052734,
+      "learning_rate": 1.5e-05,
+      "loss": 3.8072,
+      "step": 10
+    },
+    {
+      "epoch": 0.3821656050955414,
+      "grad_norm": 2.421614408493042,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 3.7454,
+      "step": 15
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "grad_norm": 2.301647186279297,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 3.3677,
+      "step": 20
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "grad_norm": 2.822660446166992,
+      "learning_rate": 4e-05,
+      "loss": 3.2859,
+      "step": 25
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "eval_loss": 2.8292243480682373,
+      "eval_runtime": 1.1711,
+      "eval_samples_per_second": 15.371,
+      "eval_steps_per_second": 2.562,
+      "step": 25
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "grad_norm": 3.327336549758911,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 2.8454,
+      "step": 30
+    },
+    {
+      "epoch": 0.89171974522293,
+      "grad_norm": 2.4171416759490967,
+      "learning_rate": 4.999392477398737e-05,
+      "loss": 2.5979,
+      "step": 35
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 6.1299028396606445,
+      "learning_rate": 4.996924922870762e-05,
+      "loss": 2.5103,
+      "step": 40
+    },
+    {
+      "epoch": 1.127388535031847,
+      "grad_norm": 3.7290937900543213,
+      "learning_rate": 4.992561238637912e-05,
+      "loss": 2.1311,
+      "step": 45
+    },
+    {
+      "epoch": 1.2547770700636942,
+      "grad_norm": 3.363525152206421,
+      "learning_rate": 4.9863047384206835e-05,
+      "loss": 2.0143,
+      "step": 50
+    },
+    {
+      "epoch": 1.2547770700636942,
+      "eval_loss": 1.9939861297607422,
+      "eval_runtime": 1.1275,
+      "eval_samples_per_second": 15.965,
+      "eval_steps_per_second": 2.661,
+      "step": 50
+    },
+    {
+      "epoch": 1.3821656050955413,
+      "grad_norm": 2.934461832046509,
+      "learning_rate": 4.978160173317438e-05,
+      "loss": 2.0247,
+      "step": 55
+    },
+    {
+      "epoch": 1.5095541401273884,
+      "grad_norm": 2.9091031551361084,
+      "learning_rate": 4.968133728196486e-05,
+      "loss": 1.9287,
+      "step": 60
+    },
+    {
+      "epoch": 1.6369426751592355,
+      "grad_norm": 2.8760178089141846,
+      "learning_rate": 4.956233016999379e-05,
+      "loss": 1.7431,
+      "step": 65
+    },
+    {
+      "epoch": 1.7643312101910829,
+      "grad_norm": 3.208024263381958,
+      "learning_rate": 4.9424670769589984e-05,
+      "loss": 1.8979,
+      "step": 70
+    },
+    {
+      "epoch": 1.89171974522293,
+      "grad_norm": 2.7759275436401367,
+      "learning_rate": 4.9268463617368e-05,
+      "loss": 1.6829,
+      "step": 75
+    },
+    {
+      "epoch": 1.89171974522293,
+      "eval_loss": 1.7856135368347168,
+      "eval_runtime": 1.1733,
+      "eval_samples_per_second": 15.342,
+      "eval_steps_per_second": 2.557,
+      "step": 75
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 6.358933448791504,
+      "learning_rate": 4.9093827334844546e-05,
+      "loss": 1.6997,
+      "step": 80
+    },
+    {
+      "epoch": 2.127388535031847,
+      "grad_norm": 4.402135848999023,
+      "learning_rate": 4.8900894538358944e-05,
+      "loss": 1.587,
+      "step": 85
+    },
+    {
+      "epoch": 2.254777070063694,
+      "grad_norm": 3.39581036567688,
+      "learning_rate": 4.8689811738366155e-05,
+      "loss": 1.5455,
+      "step": 90
+    },
+    {
+      "epoch": 2.3821656050955413,
+      "grad_norm": 3.3852970600128174,
+      "learning_rate": 4.8460739228178806e-05,
+      "loss": 1.5951,
+      "step": 95
+    },
+    {
+      "epoch": 2.5095541401273884,
+      "grad_norm": 4.223837375640869,
+      "learning_rate": 4.821385096224268e-05,
+      "loss": 1.4774,
+      "step": 100
+    },
+    {
+      "epoch": 2.5095541401273884,
+      "eval_loss": 1.7516053915023804,
+      "eval_runtime": 1.1669,
+      "eval_samples_per_second": 15.425,
+      "eval_steps_per_second": 2.571,
+      "step": 100
+    },
+    {
+      "epoch": 2.6369426751592355,
+      "grad_norm": 3.7523889541625977,
+      "learning_rate": 4.7949334424038176e-05,
+      "loss": 1.448,
+      "step": 105
+    },
+    {
+      "epoch": 2.7643312101910826,
+      "grad_norm": 4.22092866897583,
+      "learning_rate": 4.7667390483707986e-05,
+      "loss": 1.4961,
+      "step": 110
+    },
+    {
+      "epoch": 2.8917197452229297,
+      "grad_norm": 3.6937355995178223,
+      "learning_rate": 4.736823324551909e-05,
+      "loss": 1.4448,
+      "step": 115
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 8.089461326599121,
+      "learning_rate": 4.7052089885275055e-05,
+      "loss": 1.5377,
+      "step": 120
+    },
+    {
+      "epoch": 3.127388535031847,
+      "grad_norm": 3.593388557434082,
+      "learning_rate": 4.671920047780186e-05,
+      "loss": 1.1908,
+      "step": 125
+    },
+    {
+      "epoch": 3.127388535031847,
+      "eval_loss": 1.6817084550857544,
+      "eval_runtime": 1.1628,
+      "eval_samples_per_second": 15.48,
+      "eval_steps_per_second": 2.58,
+      "step": 125
+    },
+    {
+      "epoch": 3.254777070063694,
+      "grad_norm": 4.622504234313965,
+      "learning_rate": 4.6369817814638475e-05,
+      "loss": 1.3108,
+      "step": 130
+    },
+    {
+      "epoch": 3.3821656050955413,
+      "grad_norm": 5.402882099151611,
+      "learning_rate": 4.600420721207053e-05,
+      "loss": 1.1886,
+      "step": 135
+    },
+    {
+      "epoch": 3.5095541401273884,
+      "grad_norm": 4.38949728012085,
+      "learning_rate": 4.5622646309652794e-05,
+      "loss": 1.1127,
+      "step": 140
+    },
+    {
+      "epoch": 3.6369426751592355,
+      "grad_norm": 4.939103126525879,
+      "learning_rate": 4.522542485937369e-05,
+      "loss": 1.1102,
+      "step": 145
+    },
+    {
+      "epoch": 3.7643312101910826,
+      "grad_norm": 4.718697547912598,
+      "learning_rate": 4.481284450562163e-05,
+      "loss": 1.2046,
+      "step": 150
+    },
+    {
+      "epoch": 3.7643312101910826,
+      "eval_loss": 1.6712156534194946,
+      "eval_runtime": 1.1162,
+      "eval_samples_per_second": 16.126,
+      "eval_steps_per_second": 2.688,
+      "step": 150
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 600,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 15,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2277554611691520.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-150/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb46dc44937ba4dbf6e54e6b92a8c41ca12d164348e1ad4fd80375a1f258dd54
+size 6225

checkpoint-150/video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": false,
+  "fps": null,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_frames": 768,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_frames": 4,
+  "min_pixels": 3136,
+  "num_frames": null,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_metadata": false,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2,
+  "video_metadata": null,
+  "video_processor_type": "Qwen2VLVideoProcessor"
+}

checkpoint-150/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-550/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: Qwen/Qwen2-VL-2B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2-VL-2B-Instruct
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

checkpoint-550/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-550/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6ea1d743da91106d7a14ccbe30eafa8e9ff99cf23e9d26be3eb73758fe15dc7
+size 73916992

checkpoint-550/added_tokens.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-550/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-550/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-550/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b752d0b4e71cb3d1946e391ac6906eb97c14fc9f677532d2697767f3e9b806c3
+size 148053627

checkpoint-550/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2
+}

checkpoint-550/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e1851973b8c68a6f82b6396255984ce5e83694a12ce1d695a04bb21de22306d
+size 14645

checkpoint-550/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d59d99262a0f3f58c0510a59464aaddb984b05dc8dea0c6b004171981d01fcf5
+size 1465

checkpoint-550/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-550/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
+size 11420371

checkpoint-550/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,144 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "Qwen2VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-550/trainer_state.json ADDED Viewed

	@@ -0,0 +1,980 @@

+{
+  "best_global_step": 150,
+  "best_metric": 1.6712156534194946,
+  "best_model_checkpoint": "/root/Sherlock-Pet/output/sherlock_pet_v2/checkpoint-150",
+  "epoch": 13.764331210191083,
+  "eval_steps": 25,
+  "global_step": 550,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.12738853503184713,
+      "grad_norm": 2.745800018310547,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 3.7984,
+      "step": 5
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "grad_norm": 2.8450679779052734,
+      "learning_rate": 1.5e-05,
+      "loss": 3.8072,
+      "step": 10
+    },
+    {
+      "epoch": 0.3821656050955414,
+      "grad_norm": 2.421614408493042,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 3.7454,
+      "step": 15
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "grad_norm": 2.301647186279297,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 3.3677,
+      "step": 20
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "grad_norm": 2.822660446166992,
+      "learning_rate": 4e-05,
+      "loss": 3.2859,
+      "step": 25
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "eval_loss": 2.8292243480682373,
+      "eval_runtime": 1.1711,
+      "eval_samples_per_second": 15.371,
+      "eval_steps_per_second": 2.562,
+      "step": 25
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "grad_norm": 3.327336549758911,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 2.8454,
+      "step": 30
+    },
+    {
+      "epoch": 0.89171974522293,
+      "grad_norm": 2.4171416759490967,
+      "learning_rate": 4.999392477398737e-05,
+      "loss": 2.5979,
+      "step": 35
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 6.1299028396606445,
+      "learning_rate": 4.996924922870762e-05,
+      "loss": 2.5103,
+      "step": 40
+    },
+    {
+      "epoch": 1.127388535031847,
+      "grad_norm": 3.7290937900543213,
+      "learning_rate": 4.992561238637912e-05,
+      "loss": 2.1311,
+      "step": 45
+    },
+    {
+      "epoch": 1.2547770700636942,
+      "grad_norm": 3.363525152206421,
+      "learning_rate": 4.9863047384206835e-05,
+      "loss": 2.0143,
+      "step": 50
+    },
+    {
+      "epoch": 1.2547770700636942,
+      "eval_loss": 1.9939861297607422,
+      "eval_runtime": 1.1275,
+      "eval_samples_per_second": 15.965,
+      "eval_steps_per_second": 2.661,
+      "step": 50
+    },
+    {
+      "epoch": 1.3821656050955413,
+      "grad_norm": 2.934461832046509,
+      "learning_rate": 4.978160173317438e-05,
+      "loss": 2.0247,
+      "step": 55
+    },
+    {
+      "epoch": 1.5095541401273884,
+      "grad_norm": 2.9091031551361084,
+      "learning_rate": 4.968133728196486e-05,
+      "loss": 1.9287,
+      "step": 60
+    },
+    {
+      "epoch": 1.6369426751592355,
+      "grad_norm": 2.8760178089141846,
+      "learning_rate": 4.956233016999379e-05,
+      "loss": 1.7431,
+      "step": 65
+    },
+    {
+      "epoch": 1.7643312101910829,
+      "grad_norm": 3.208024263381958,
+      "learning_rate": 4.9424670769589984e-05,
+      "loss": 1.8979,
+      "step": 70
+    },
+    {
+      "epoch": 1.89171974522293,
+      "grad_norm": 2.7759275436401367,
+      "learning_rate": 4.9268463617368e-05,
+      "loss": 1.6829,
+      "step": 75
+    },
+    {
+      "epoch": 1.89171974522293,
+      "eval_loss": 1.7856135368347168,
+      "eval_runtime": 1.1733,
+      "eval_samples_per_second": 15.342,
+      "eval_steps_per_second": 2.557,
+      "step": 75
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 6.358933448791504,
+      "learning_rate": 4.9093827334844546e-05,
+      "loss": 1.6997,
+      "step": 80
+    },
+    {
+      "epoch": 2.127388535031847,
+      "grad_norm": 4.402135848999023,
+      "learning_rate": 4.8900894538358944e-05,
+      "loss": 1.587,
+      "step": 85
+    },
+    {
+      "epoch": 2.254777070063694,
+      "grad_norm": 3.39581036567688,
+      "learning_rate": 4.8689811738366155e-05,
+      "loss": 1.5455,
+      "step": 90
+    },
+    {
+      "epoch": 2.3821656050955413,
+      "grad_norm": 3.3852970600128174,
+      "learning_rate": 4.8460739228178806e-05,
+      "loss": 1.5951,
+      "step": 95
+    },
+    {
+      "epoch": 2.5095541401273884,
+      "grad_norm": 4.223837375640869,
+      "learning_rate": 4.821385096224268e-05,
+      "loss": 1.4774,
+      "step": 100
+    },
+    {
+      "epoch": 2.5095541401273884,
+      "eval_loss": 1.7516053915023804,
+      "eval_runtime": 1.1669,
+      "eval_samples_per_second": 15.425,
+      "eval_steps_per_second": 2.571,
+      "step": 100
+    },
+    {
+      "epoch": 2.6369426751592355,
+      "grad_norm": 3.7523889541625977,
+      "learning_rate": 4.7949334424038176e-05,
+      "loss": 1.448,
+      "step": 105
+    },
+    {
+      "epoch": 2.7643312101910826,
+      "grad_norm": 4.22092866897583,
+      "learning_rate": 4.7667390483707986e-05,
+      "loss": 1.4961,
+      "step": 110
+    },
+    {
+      "epoch": 2.8917197452229297,
+      "grad_norm": 3.6937355995178223,
+      "learning_rate": 4.736823324551909e-05,
+      "loss": 1.4448,
+      "step": 115
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 8.089461326599121,
+      "learning_rate": 4.7052089885275055e-05,
+      "loss": 1.5377,
+      "step": 120
+    },
+    {
+      "epoch": 3.127388535031847,
+      "grad_norm": 3.593388557434082,
+      "learning_rate": 4.671920047780186e-05,
+      "loss": 1.1908,
+      "step": 125
+    },
+    {
+      "epoch": 3.127388535031847,
+      "eval_loss": 1.6817084550857544,
+      "eval_runtime": 1.1628,
+      "eval_samples_per_second": 15.48,
+      "eval_steps_per_second": 2.58,
+      "step": 125
+    },
+    {
+      "epoch": 3.254777070063694,
+      "grad_norm": 4.622504234313965,
+      "learning_rate": 4.6369817814638475e-05,
+      "loss": 1.3108,
+      "step": 130
+    },
+    {
+      "epoch": 3.3821656050955413,
+      "grad_norm": 5.402882099151611,
+      "learning_rate": 4.600420721207053e-05,
+      "loss": 1.1886,
+      "step": 135
+    },
+    {
+      "epoch": 3.5095541401273884,
+      "grad_norm": 4.38949728012085,
+      "learning_rate": 4.5622646309652794e-05,
+      "loss": 1.1127,
+      "step": 140
+    },
+    {
+      "epoch": 3.6369426751592355,
+      "grad_norm": 4.939103126525879,
+      "learning_rate": 4.522542485937369e-05,
+      "loss": 1.1102,
+      "step": 145
+    },
+    {
+      "epoch": 3.7643312101910826,
+      "grad_norm": 4.718697547912598,
+      "learning_rate": 4.481284450562163e-05,
+      "loss": 1.2046,
+      "step": 150
+    },
+    {
+      "epoch": 3.7643312101910826,
+      "eval_loss": 1.6712156534194946,
+      "eval_runtime": 1.1162,
+      "eval_samples_per_second": 16.126,
+      "eval_steps_per_second": 2.688,
+      "step": 150
+    },
+    {
+      "epoch": 3.8917197452229297,
+      "grad_norm": 5.0807881355285645,
+      "learning_rate": 4.438521855612054e-05,
+      "loss": 1.0573,
+      "step": 155
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 8.19217300415039,
+      "learning_rate": 4.3942871744008374e-05,
+      "loss": 1.3037,
+      "step": 160
+    },
+    {
+      "epoch": 4.127388535031847,
+      "grad_norm": 4.953968524932861,
+      "learning_rate": 4.3486139981239304e-05,
+      "loss": 0.879,
+      "step": 165
+    },
+    {
+      "epoch": 4.254777070063694,
+      "grad_norm": 6.172196388244629,
+      "learning_rate": 4.301537010349696e-05,
+      "loss": 0.9412,
+      "step": 170
+    },
+    {
+      "epoch": 4.382165605095541,
+      "grad_norm": 5.977532386779785,
+      "learning_rate": 4.2530919606812216e-05,
+      "loss": 0.8012,
+      "step": 175
+    },
+    {
+      "epoch": 4.382165605095541,
+      "eval_loss": 1.7618356943130493,
+      "eval_runtime": 1.189,
+      "eval_samples_per_second": 15.139,
+      "eval_steps_per_second": 2.523,
+      "step": 175
+    },
+    {
+      "epoch": 4.509554140127388,
+      "grad_norm": 4.794788360595703,
+      "learning_rate": 4.203315637608578e-05,
+      "loss": 0.9443,
+      "step": 180
+    },
+    {
+      "epoch": 4.6369426751592355,
+      "grad_norm": 5.241445541381836,
+      "learning_rate": 4.152245840572153e-05,
+      "loss": 0.8479,
+      "step": 185
+    },
+    {
+      "epoch": 4.764331210191083,
+      "grad_norm": 5.526820182800293,
+      "learning_rate": 4.099921351258292e-05,
+      "loss": 0.8732,
+      "step": 190
+    },
+    {
+      "epoch": 4.89171974522293,
+      "grad_norm": 7.235106945037842,
+      "learning_rate": 4.046381904149024e-05,
+      "loss": 0.8117,
+      "step": 195
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 9.936717987060547,
+      "learning_rate": 3.991668156348261e-05,
+      "loss": 0.9595,
+      "step": 200
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 1.7216202020645142,
+      "eval_runtime": 1.1216,
+      "eval_samples_per_second": 16.048,
+      "eval_steps_per_second": 2.675,
+      "step": 200
+    },
+    {
+      "epoch": 5.127388535031847,
+      "grad_norm": 5.190134525299072,
+      "learning_rate": 3.935821656707359e-05,
+      "loss": 0.6461,
+      "step": 205
+    },
+    {
+      "epoch": 5.254777070063694,
+      "grad_norm": 6.523595809936523,
+      "learning_rate": 3.878884814273509e-05,
+      "loss": 0.5982,
+      "step": 210
+    },
+    {
+      "epoch": 5.382165605095541,
+      "grad_norm": 6.567984580993652,
+      "learning_rate": 3.8209008660848974e-05,
+      "loss": 0.6205,
+      "step": 215
+    },
+    {
+      "epoch": 5.509554140127388,
+      "grad_norm": 7.569522380828857,
+      "learning_rate": 3.76191384433711e-05,
+      "loss": 0.5558,
+      "step": 220
+    },
+    {
+      "epoch": 5.6369426751592355,
+      "grad_norm": 4.997279644012451,
+      "learning_rate": 3.7019685429456986e-05,
+      "loss": 0.5115,
+      "step": 225
+    },
+    {
+      "epoch": 5.6369426751592355,
+      "eval_loss": 1.8211851119995117,
+      "eval_runtime": 1.1401,
+      "eval_samples_per_second": 15.789,
+      "eval_steps_per_second": 2.631,
+      "step": 225
+    },
+    {
+      "epoch": 5.764331210191083,
+      "grad_norm": 5.285463333129883,
+      "learning_rate": 3.6411104835303166e-05,
+      "loss": 0.6233,
+      "step": 230
+    },
+    {
+      "epoch": 5.89171974522293,
+      "grad_norm": 7.245285987854004,
+      "learning_rate": 3.579385880846232e-05,
+      "loss": 0.5701,
+      "step": 235
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 9.988482475280762,
+      "learning_rate": 3.516841607689501e-05,
+      "loss": 0.652,
+      "step": 240
+    },
+    {
+      "epoch": 6.127388535031847,
+      "grad_norm": 5.565608978271484,
+      "learning_rate": 3.453525159302415e-05,
+      "loss": 0.3898,
+      "step": 245
+    },
+    {
+      "epoch": 6.254777070063694,
+      "grad_norm": 6.4225287437438965,
+      "learning_rate": 3.389484617306292e-05,
+      "loss": 0.4666,
+      "step": 250
+    },
+    {
+      "epoch": 6.254777070063694,
+      "eval_loss": 2.177734136581421,
+      "eval_runtime": 1.0978,
+      "eval_samples_per_second": 16.396,
+      "eval_steps_per_second": 2.733,
+      "step": 250
+    },
+    {
+      "epoch": 6.382165605095541,
+      "grad_norm": 4.287829399108887,
+      "learning_rate": 3.3247686131889574e-05,
+      "loss": 0.3194,
+      "step": 255
+    },
+    {
+      "epoch": 6.509554140127388,
+      "grad_norm": 7.155808448791504,
+      "learning_rate": 3.2594262913746865e-05,
+      "loss": 0.3603,
+      "step": 260
+    },
+    {
+      "epoch": 6.6369426751592355,
+      "grad_norm": 7.35886812210083,
+      "learning_rate": 3.1935072719046115e-05,
+      "loss": 0.4139,
+      "step": 265
+    },
+    {
+      "epoch": 6.764331210191083,
+      "grad_norm": 6.846656322479248,
+      "learning_rate": 3.127061612755961e-05,
+      "loss": 0.398,
+      "step": 270
+    },
+    {
+      "epoch": 6.89171974522293,
+      "grad_norm": 7.843795299530029,
+      "learning_rate": 3.06013977182874e-05,
+      "loss": 0.4082,
+      "step": 275
+    },
+    {
+      "epoch": 6.89171974522293,
+      "eval_loss": 2.0292844772338867,
+      "eval_runtime": 1.1814,
+      "eval_samples_per_second": 15.236,
+      "eval_steps_per_second": 2.539,
+      "step": 275
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 7.938873767852783,
+      "learning_rate": 2.9927925686287006e-05,
+      "loss": 0.3706,
+      "step": 280
+    },
+    {
+      "epoch": 7.127388535031847,
+      "grad_norm": 4.278555393218994,
+      "learning_rate": 2.925071145675733e-05,
+      "loss": 0.2333,
+      "step": 285
+    },
+    {
+      "epoch": 7.254777070063694,
+      "grad_norm": 7.001745700836182,
+      "learning_rate": 2.8570269296669466e-05,
+      "loss": 0.2746,
+      "step": 290
+    },
+    {
+      "epoch": 7.382165605095541,
+      "grad_norm": 7.84188985824585,
+      "learning_rate": 2.788711592423966e-05,
+      "loss": 0.2495,
+      "step": 295
+    },
+    {
+      "epoch": 7.509554140127388,
+      "grad_norm": 7.395780563354492,
+      "learning_rate": 2.720177011654067e-05,
+      "loss": 0.2262,
+      "step": 300
+    },
+    {
+      "epoch": 7.509554140127388,
+      "eval_loss": 2.3396387100219727,
+      "eval_runtime": 1.1699,
+      "eval_samples_per_second": 15.387,
+      "eval_steps_per_second": 2.564,
+      "step": 300
+    },
+    {
+      "epoch": 7.6369426751592355,
+      "grad_norm": 4.985588073730469,
+      "learning_rate": 2.6514752315549847e-05,
+      "loss": 0.2665,
+      "step": 305
+    },
+    {
+      "epoch": 7.764331210191083,
+      "grad_norm": 4.297209739685059,
+      "learning_rate": 2.5826584232932706e-05,
+      "loss": 0.2769,
+      "step": 310
+    },
+    {
+      "epoch": 7.89171974522293,
+      "grad_norm": 6.710094928741455,
+      "learning_rate": 2.5137788453862515e-05,
+      "loss": 0.2252,
+      "step": 315
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 13.744440078735352,
+      "learning_rate": 2.4448888040176365e-05,
+      "loss": 0.2672,
+      "step": 320
+    },
+    {
+      "epoch": 8.127388535031848,
+      "grad_norm": 3.4223597049713135,
+      "learning_rate": 2.3760406133169443e-05,
+      "loss": 0.1774,
+      "step": 325
+    },
+    {
+      "epoch": 8.127388535031848,
+      "eval_loss": 2.345935821533203,
+      "eval_runtime": 1.1092,
+      "eval_samples_per_second": 16.228,
+      "eval_steps_per_second": 2.705,
+      "step": 325
+    },
+    {
+      "epoch": 8.254777070063694,
+      "grad_norm": 4.574872016906738,
+      "learning_rate": 2.3072865556328822e-05,
+      "loss": 0.1316,
+      "step": 330
+    },
+    {
+      "epoch": 8.382165605095542,
+      "grad_norm": 3.8134524822235107,
+      "learning_rate": 2.238678841830867e-05,
+      "loss": 0.1199,
+      "step": 335
+    },
+    {
+      "epoch": 8.509554140127388,
+      "grad_norm": 3.943145751953125,
+      "learning_rate": 2.1702695716448278e-05,
+      "loss": 0.1955,
+      "step": 340
+    },
+    {
+      "epoch": 8.636942675159236,
+      "grad_norm": 5.184530258178711,
+      "learning_rate": 2.1021106941134012e-05,
+      "loss": 0.1369,
+      "step": 345
+    },
+    {
+      "epoch": 8.764331210191083,
+      "grad_norm": 7.904541492462158,
+      "learning_rate": 2.0342539681305602e-05,
+      "loss": 0.1512,
+      "step": 350
+    },
+    {
+      "epoch": 8.764331210191083,
+      "eval_loss": 2.4745495319366455,
+      "eval_runtime": 1.105,
+      "eval_samples_per_second": 16.29,
+      "eval_steps_per_second": 2.715,
+      "step": 350
+    },
+    {
+      "epoch": 8.89171974522293,
+      "grad_norm": 7.2708740234375,
+      "learning_rate": 1.9667509231406334e-05,
+      "loss": 0.1519,
+      "step": 355
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 13.182251930236816,
+      "learning_rate": 1.899652820007576e-05,
+      "loss": 0.1857,
+      "step": 360
+    },
+    {
+      "epoch": 9.127388535031848,
+      "grad_norm": 2.628040313720703,
+      "learning_rate": 1.8330106120881846e-05,
+      "loss": 0.0731,
+      "step": 365
+    },
+    {
+      "epoch": 9.254777070063694,
+      "grad_norm": 2.4514880180358887,
+      "learning_rate": 1.7668749065388385e-05,
+      "loss": 0.0974,
+      "step": 370
+    },
+    {
+      "epoch": 9.382165605095542,
+      "grad_norm": 3.9326188564300537,
+      "learning_rate": 1.70129592588513e-05,
+      "loss": 0.0719,
+      "step": 375
+    },
+    {
+      "epoch": 9.382165605095542,
+      "eval_loss": 2.771934747695923,
+      "eval_runtime": 1.0823,
+      "eval_samples_per_second": 16.632,
+      "eval_steps_per_second": 2.772,
+      "step": 375
+    },
+    {
+      "epoch": 9.509554140127388,
+      "grad_norm": 5.222805976867676,
+      "learning_rate": 1.6363234698835896e-05,
+      "loss": 0.0901,
+      "step": 380
+    },
+    {
+      "epoch": 9.636942675159236,
+      "grad_norm": 4.620385646820068,
+      "learning_rate": 1.5720068777044476e-05,
+      "loss": 0.1384,
+      "step": 385
+    },
+    {
+      "epoch": 9.764331210191083,
+      "grad_norm": 3.4555652141571045,
+      "learning_rate": 1.5083949904641654e-05,
+      "loss": 0.0785,
+      "step": 390
+    },
+    {
+      "epoch": 9.89171974522293,
+      "grad_norm": 5.134110927581787,
+      "learning_rate": 1.44553611413617e-05,
+      "loss": 0.0952,
+      "step": 395
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2.9739015102386475,
+      "learning_rate": 1.383477982867984e-05,
+      "loss": 0.0741,
+      "step": 400
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 2.7705841064453125,
+      "eval_runtime": 1.0966,
+      "eval_samples_per_second": 16.414,
+      "eval_steps_per_second": 2.736,
+      "step": 400
+    },
+    {
+      "epoch": 10.127388535031848,
+      "grad_norm": 2.2853662967681885,
+      "learning_rate": 1.322267722732582e-05,
+      "loss": 0.0545,
+      "step": 405
+    },
+    {
+      "epoch": 10.254777070063694,
+      "grad_norm": 8.12447738647461,
+      "learning_rate": 1.2619518159415139e-05,
+      "loss": 0.0576,
+      "step": 410
+    },
+    {
+      "epoch": 10.382165605095542,
+      "grad_norm": 1.5649133920669556,
+      "learning_rate": 1.202576065546963e-05,
+      "loss": 0.0513,
+      "step": 415
+    },
+    {
+      "epoch": 10.509554140127388,
+      "grad_norm": 6.421708583831787,
+      "learning_rate": 1.1441855606595545e-05,
+      "loss": 0.061,
+      "step": 420
+    },
+    {
+      "epoch": 10.636942675159236,
+      "grad_norm": 4.642502784729004,
+      "learning_rate": 1.0868246422083204e-05,
+      "loss": 0.0679,
+      "step": 425
+    },
+    {
+      "epoch": 10.636942675159236,
+      "eval_loss": 2.8739261627197266,
+      "eval_runtime": 1.1304,
+      "eval_samples_per_second": 15.924,
+      "eval_steps_per_second": 2.654,
+      "step": 425
+    },
+    {
+      "epoch": 10.764331210191083,
+      "grad_norm": 4.243600845336914,
+      "learning_rate": 1.0305368692688174e-05,
+      "loss": 0.0629,
+      "step": 430
+    },
+    {
+      "epoch": 10.89171974522293,
+      "grad_norm": 2.6516973972320557,
+      "learning_rate": 9.753649859849775e-06,
+      "loss": 0.0557,
+      "step": 435
+    },
+    {
+      "epoch": 11.0,
+      "grad_norm": 10.662928581237793,
+      "learning_rate": 9.213508891098064e-06,
+      "loss": 0.0572,
+      "step": 440
+    },
+    {
+      "epoch": 11.127388535031848,
+      "grad_norm": 2.290684223175049,
+      "learning_rate": 8.685355961895784e-06,
+      "loss": 0.0381,
+      "step": 445
+    },
+    {
+      "epoch": 11.254777070063694,
+      "grad_norm": 3.5246331691741943,
+      "learning_rate": 8.169592144156885e-06,
+      "loss": 0.036,
+      "step": 450
+    },
+    {
+      "epoch": 11.254777070063694,
+      "eval_loss": 3.0192348957061768,
+      "eval_runtime": 1.1442,
+      "eval_samples_per_second": 15.732,
+      "eval_steps_per_second": 2.622,
+      "step": 450
+    },
+    {
+      "epoch": 11.382165605095542,
+      "grad_norm": 1.1039093732833862,
+      "learning_rate": 7.666609101678121e-06,
+      "loss": 0.0416,
+      "step": 455
+    },
+    {
+      "epoch": 11.509554140127388,
+      "grad_norm": 1.7566171884536743,
+      "learning_rate": 7.176788792715075e-06,
+      "loss": 0.0469,
+      "step": 460
+    },
+    {
+      "epoch": 11.636942675159236,
+      "grad_norm": 5.540022373199463,
+      "learning_rate": 6.700503179928458e-06,
+      "loss": 0.0593,
+      "step": 465
+    },
+    {
+      "epoch": 11.764331210191083,
+      "grad_norm": 2.7685348987579346,
+      "learning_rate": 6.2381139479208564e-06,
+      "loss": 0.0322,
+      "step": 470
+    },
+    {
+      "epoch": 11.89171974522293,
+      "grad_norm": 1.5013082027435303,
+      "learning_rate": 5.78997222857853e-06,
+      "loss": 0.03,
+      "step": 475
+    },
+    {
+      "epoch": 11.89171974522293,
+      "eval_loss": 3.0430448055267334,
+      "eval_runtime": 1.142,
+      "eval_samples_per_second": 15.762,
+      "eval_steps_per_second": 2.627,
+      "step": 475
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 1.6596020460128784,
+      "learning_rate": 5.356418334426791e-06,
+      "loss": 0.0206,
+      "step": 480
+    },
+    {
+      "epoch": 12.127388535031848,
+      "grad_norm": 1.6733341217041016,
+      "learning_rate": 4.937781500201474e-06,
+      "loss": 0.0269,
+      "step": 485
+    },
+    {
+      "epoch": 12.254777070063694,
+      "grad_norm": 2.3883769512176514,
+      "learning_rate": 4.534379632832692e-06,
+      "loss": 0.0243,
+      "step": 490
+    },
+    {
+      "epoch": 12.382165605095542,
+      "grad_norm": 1.5162509679794312,
+      "learning_rate": 4.146519070030757e-06,
+      "loss": 0.0227,
+      "step": 495
+    },
+    {
+      "epoch": 12.509554140127388,
+      "grad_norm": 1.2816208600997925,
+      "learning_rate": 3.7744943476576562e-06,
+      "loss": 0.0226,
+      "step": 500
+    },
+    {
+      "epoch": 12.509554140127388,
+      "eval_loss": 3.151336908340454,
+      "eval_runtime": 1.1322,
+      "eval_samples_per_second": 15.898,
+      "eval_steps_per_second": 2.65,
+      "step": 500
+    },
+    {
+      "epoch": 12.636942675159236,
+      "grad_norm": 0.6974829435348511,
+      "learning_rate": 3.418587976060653e-06,
+      "loss": 0.0206,
+      "step": 505
+    },
+    {
+      "epoch": 12.764331210191083,
+      "grad_norm": 0.8186230659484863,
+      "learning_rate": 3.0790702255378974e-06,
+      "loss": 0.0287,
+      "step": 510
+    },
+    {
+      "epoch": 12.89171974522293,
+      "grad_norm": 0.8705646395683289,
+      "learning_rate": 2.7561989210989235e-06,
+      "loss": 0.0218,
+      "step": 515
+    },
+    {
+      "epoch": 13.0,
+      "grad_norm": 3.0652782917022705,
+      "learning_rate": 2.4502192466760276e-06,
+      "loss": 0.0268,
+      "step": 520
+    },
+    {
+      "epoch": 13.127388535031848,
+      "grad_norm": 1.0207325220108032,
+      "learning_rate": 2.1613635589349756e-06,
+      "loss": 0.0155,
+      "step": 525
+    },
+    {
+      "epoch": 13.127388535031848,
+      "eval_loss": 3.1609668731689453,
+      "eval_runtime": 1.1337,
+      "eval_samples_per_second": 15.877,
+      "eval_steps_per_second": 2.646,
+      "step": 525
+    },
+    {
+      "epoch": 13.254777070063694,
+      "grad_norm": 1.7514115571975708,
+      "learning_rate": 1.8898512108266569e-06,
+      "loss": 0.017,
+      "step": 530
+    },
+    {
+      "epoch": 13.382165605095542,
+      "grad_norm": 1.1046432256698608,
+      "learning_rate": 1.6358883850134816e-06,
+      "loss": 0.0154,
+      "step": 535
+    },
+    {
+      "epoch": 13.509554140127388,
+      "grad_norm": 0.9812297821044922,
+      "learning_rate": 1.3996679372972304e-06,
+      "loss": 0.0205,
+      "step": 540
+    },
+    {
+      "epoch": 13.636942675159236,
+      "grad_norm": 1.1135008335113525,
+      "learning_rate": 1.1813692501670276e-06,
+      "loss": 0.0249,
+      "step": 545
+    },
+    {
+      "epoch": 13.764331210191083,
+      "grad_norm": 1.5334270000457764,
+      "learning_rate": 9.811580965787965e-07,
+      "loss": 0.015,
+      "step": 550
+    },
+    {
+      "epoch": 13.764331210191083,
+      "eval_loss": 3.177839517593384,
+      "eval_runtime": 1.1297,
+      "eval_samples_per_second": 15.934,
+      "eval_steps_per_second": 2.656,
+      "step": 550
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 600,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 15,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8326402741149696.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-550/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb46dc44937ba4dbf6e54e6b92a8c41ca12d164348e1ad4fd80375a1f258dd54
+size 6225

checkpoint-550/video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": false,
+  "fps": null,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_frames": 768,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_frames": 4,
+  "min_pixels": 3136,
+  "num_frames": null,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_metadata": false,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2,
+  "video_metadata": null,
+  "video_processor_type": "Qwen2VLVideoProcessor"
+}

checkpoint-550/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-600/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: Qwen/Qwen2-VL-2B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2-VL-2B-Instruct
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

checkpoint-600/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-600/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25f3841e9633ffe5a9bd96a030b2ab2bc6fe6ad8e71e2b2ad644485d70f560ef
+size 73916992

checkpoint-600/added_tokens.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-600/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-600/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-600/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d50570db6472692e90fce409d5b27e147e77b0651a5e37fefd2296832a62439
+size 148053627

checkpoint-600/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2
+}

checkpoint-600/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c5fcf8676def8d8317087706af7bea7f1d82a0c9edc6054cc3b7221c4c6b4b9
+size 14645

checkpoint-600/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae3778a43ab33194e3eb557cd3e82c54c63f22364e734be5b5c32d5a5037dd
+size 1465

checkpoint-600/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}