- app.py +12 -62
- configs/clm_models/agent_7b_seedx_pretrained.yaml +18 -0
- configs/clm_models/agent_7b_sft.yaml +18 -0
- configs/clm_models/llama2chat7b_lora.yaml +37 -0
- configs/data/george_sdxl.yaml +19 -0
- configs/data/george_sft.yaml +19 -0
- configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml +15 -0
- configs/detokenizer/detokenizer_sdxl_qwen_vit_pretrained.yaml +15 -0
- configs/discrete_model/discrete_identity.yaml +1 -0
- configs/processer/qwen_448_transform.yaml +4 -0
- configs/processer/qwen_448_transform_keep_ratio.yaml +4 -0
- configs/processer/sd_transform_1024.yaml +4 -0
- configs/tokenizer/clm_llama_tokenizer.yaml +2 -0
- configs/visual_tokenizer/qwen_vitg_448.yaml +10 -0
- pretrained/cvlm_llama2_tokenizer/added_tokens.json +68 -0
- pretrained/cvlm_llama2_tokenizer/special_tokens_map.json +40 -0
- pretrained/cvlm_llama2_tokenizer/tokenizer.model +3 -0
- pretrained/cvlm_llama2_tokenizer/tokenizer_config.json +573 -0
- pretrained/detokenizer/detokenizer_george_adapted/checkpoint-4000/pytorch_model.bin +3 -0
- pretrained/qwen_vit_G.pt +3 -0
- pretrained/seed_story/george_sft/pytorch_model.bin +3 -0
app.py
CHANGED
@@ -30,12 +30,8 @@ from diffusers import AutoencoderKL, UNet2DConditionModel, EulerDiscreteSchedule
 
 pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
 
-from src.data.any_res import process_anyres_image
-
 BOI_TOKEN = '<img>'
-BOP_TOKEN = '<patch>'
 EOI_TOKEN = '</img>'
-EOP_TOKEN = '</patch>'
 IMG_TOKEN = '<img_{:05d}>'
 
 IMG_FLAG = '<image>'
@@ -70,7 +66,7 @@ class Arguments:
     tokenizer: Optional[str] = field(default='configs/tokenizer/clm_llama_tokenizer.yaml',
                                      metadata={"help": "config path of tokenizer used to initialize tokenizer"})
     llm: Optional[str] = field(default='configs/clm_models/llama2chat7b_lora.yaml', metadata={"help": "config path of llm"})
-    visual_encoder: Optional[str] = field(default='configs/
+    visual_encoder: Optional[str] = field(default='configs/visual_tokenizer/qwen_vitg_448.yaml',
                                           metadata={"help": "config path of visual encoder"})
     sd_adapter: Optional[str] = field(
         default='configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml',
@@ -158,10 +154,9 @@ class LLMService:
 
         self.visual_encoder.to(self.vit_sd_device, dtype=self.dtype)
 
-        model_id_or_path = "stablediffusionapi/realistic-vision-v51"
-        self.vae_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, safety_checker=None,
-                                                                       torch_dtype=torch.float16)
-        # self.vae_pipe = self.vae_pipe.to(self.vit_sd_device)
+        # model_id_or_path = "stablediffusionapi/realistic-vision-v51"
+        # self.vae_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, safety_checker=None,
+        #                                                                torch_dtype=torch.float16)
 
         self.boi_token_id = self.tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
         self.eoi_token_id = self.tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
@@ -171,7 +166,7 @@ service = LLMService(args)
 
 
 @spaces.GPU
-def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox, force_polish):
+def generate(text_list, image_list, max_new_tokens):
     with torch.no_grad():
         text_list = text_list.split(IMG_FLAG)
         top_p = 0.5
@@ -300,53 +295,17 @@ def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox, force_polish):
             img_feat = img_gen_feat[img_idx:img_idx + 1]
             generated_image = service.sd_adapter.generate(image_embeds=img_feat, num_inference_steps=50)[0]
 
-            if force_polish:
-                # service.sd_adapter = service.sd_adapter.cpu()
-                # service.vae_pipe = service.vae_pipe.to(service.vit_sd_device, dtype=service.dtype)
-
-                torch.cuda.empty_cache()
-
-                service.vae_pipe = service.vae_pipe.to(service.vit_sd_device)
-
-                init_image = generated_image.resize((1024, 1024))
-                prompt = ""
-                images = service.vae_pipe(prompt=prompt, image=init_image,
-                                          num_inference_steps=50, guidance_scale=8.0, strength=0.38).images
-                generated_image = images[0]
-
-            image_base64 = encode_image(generated_image)
-            gen_imgs_base64_list.append(image_base64)
-
-            # service.vae_pipe = service.vae_pipe.to("cpu")
-            # service.sd_adapter = service.sd_adapter.to(service.vit_sd_device, dtype=service.dtype)
-
-            torch.cuda.empty_cache()
-
-            # print('loading visual encoder and llm to GPU, and sd to CPU')
             # a = time.time()
             # service.sd_adapter = service.sd_adapter.cpu()
             # service.visual_encoder = service.visual_encoder.to(service.vit_sd_device, dtype=service.dtype)
             # service.agent = service.agent.to(service.vit_sd_device, dtype=service.dtype)
             # print("Loading finished: ", time.time() - a)
 
-        if args.has_bbox:
-            bboxes = extract_box(generated_text)
-            if bboxes is not None and len(input_images) > 0:
-                image_viz = visualize_bbox(input_images[-1], bboxes)
-                image_base64 = encode_image(image_viz)
-                gen_imgs_base64_list.append(image_base64)
-                if '<box_start>' in generated_text:
-                    generated_text = re.sub(r'\[\[ <box_start>.*?<box_end>.*?\]\]', 'the green bounding box',
-                                            generated_text)
-                else:
-                    generated_text = re.sub(r'<loc-\d+> <loc-\d+> <loc-\d+> <loc-\d+> <box_end> \]\]',
-                                            'the green bounding box', generated_text)
-                generated_text += IMG_FLAG
     print(input_text + generated_text)
     return {'text': generated_text, 'images': gen_imgs_base64_list, 'error_msg': error_msg}
 
 
-def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_gen, force_bbox, force_polish,
+def http_bot(dialog_state, input_state, max_new_tokens, max_turns,
              request: gr.Request):
     print('input_state:', input_state)
 
@@ -365,10 +324,8 @@ def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_gen, force_bbox, force_polish,
     text = prompt['text']
     max_new_tokens = int(max_new_tokens)
     images = prompt['images']
-    force_boi = force_image_gen
-    force_bbox = force_bbox
 
-    results = generate(text, images, max_new_tokens, force_boi, force_bbox, force_polish)
+    results = generate(text, images, max_new_tokens)
     print('response: ', {'text': results['text'], 'error_msg': results['error_msg']})
 
     output_state = init_input_state()
@@ -588,25 +545,18 @@ def load_demo(request: gr.Request):
 
 
 title = ("""
-# SEED-
-[[Paper]](https://arxiv.org/abs/
-
-Demo of a general instruction-tuned model SEED-X-I (17B) from the foundation model SEED-X.
-SEED-X-I can follow multimodal instruction (including images with **dynamic resolutions**) and make responses with **images, texts and bounding boxes** in multi-turn conversation.
-
-SEED-X-I **does not support image manipulation**. If you want to experience **SEED-X-Edit** for high-precision image editing, please refer to [[Inference Code]](https://github.com/AILab-CVC/SEED-X).
+# SEED-Story
+[[Paper]](https://arxiv.org/abs/2407.08683) [[Code]](https://github.com/TencentARC/SEED-Story)
 
-
+Demo of a multimodal story generation model, SEED-Story-George, trained on the StoryStream Curious George subset.
+SEED-Story is an MLLM capable of generating multimodal long stories consisting of rich and coherent narrative texts, along with images that are consistent in characters and style.
 
 ## Tips:
 * Check out the conversation examples (at the bottom) for inspiration.
 * You can adjust "Max History Rounds" to try a conversation with up to **three rounds due to insufficient GPU memory**. For more turns, you can download our checkpoints from GitHub and deploy them locally for inference.
 * Our demo supports a mix of images and texts as input. You can freely upload an image or enter text, and then click on "Add Image/Text". You can repeat the former step multiple times, and click on "Submit" for model inference at last.
-* You can click "Force Image Generation" to compel the model to produce images when necessary. For example, our model might struggle to generate images when there is an excessive amount of text-only context.
-* You can click "Force Bounding Box" to compel the model to produce a bounding box for object detection.
-* You can click "Force Polishing Generated Image" to compel the model to polish the generated image with image post-processing.
 
-* SEED-
+* SEED-Story was trained with English-only data. It may work with other languages thanks to capabilities inherited from LLaMA, but the results might not be stable.
 """)
 
 css = """
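For context on the removed `force_polish` branch: it ran the SDXL-adapter output through diffusers' img2img pipeline as a post-processing pass. A minimal standalone sketch of that pattern, reusing the model id and sampler settings from the deleted lines (the input/output file names are illustrative):

# Sketch of the img2img "polish" pass removed above; values mirror the
# deleted lines, file names are hypothetical.
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "stablediffusionapi/realistic-vision-v51",
    safety_checker=None,
    torch_dtype=torch.float16,
).to("cuda")

init_image = Image.open("generated.png").resize((1024, 1024))
polished = pipe(prompt="", image=init_image,
                num_inference_steps=50, guidance_scale=8.0,
                strength=0.38).images[0]
polished.save("polished.png")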
configs/clm_models/agent_7b_seedx_pretrained.yaml
ADDED
@@ -0,0 +1,18 @@
+_target_: src.models_clm.models.ContinuousLVLM.from_pretrained
+input_resampler:
+  _target_: src.models.qwen_visual.Resampler
+  grid_size: 8
+  embed_dim: 4096
+  num_heads: 32
+  kv_dim: 4096
+
+output_resampler:
+  _target_: src.models.qwen_visual.Resampler
+  grid_size: 16
+  embed_dim: 4096
+  num_heads: 32
+  kv_dim: 4096
+
+lm_loss_scale: 1.0
+rec_loss_scale: 1.0
+pretrained_model_path: pretrained/seedx/checkpoint-30000/pytorch_model.bin
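The `_target_` keys in these YAMLs follow the Hydra instantiation convention. A minimal sketch of how such a config is typically materialized, assuming hydra-core and omegaconf (which the `_target_`/`_recursive_` keys imply):

# Sketch: turning a `_target_` config into the object it names.
from omegaconf import OmegaConf
from hydra.utils import instantiate

cfg = OmegaConf.load("configs/clm_models/agent_7b_seedx_pretrained.yaml")
# Builds the two nested Resamplers, then calls
# src.models_clm.models.ContinuousLVLM.from_pretrained(...) with them.
agent = instantiate(cfg)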
configs/clm_models/agent_7b_sft.yaml
ADDED
@@ -0,0 +1,18 @@
+_target_: src.models_clm.models.ContinuousLVLM.from_pretrained
+input_resampler:
+  _target_: src.models.qwen_visual.Resampler
+  grid_size: 8
+  embed_dim: 4096
+  num_heads: 32
+  kv_dim: 4096
+
+output_resampler:
+  _target_: src.models.qwen_visual.Resampler
+  grid_size: 16
+  embed_dim: 4096
+  num_heads: 32
+  kv_dim: 4096
+
+lm_loss_scale: 1.0
+rec_loss_scale: 1.0
+pretrained_model_path: pretrained/seed_story/george_sft/pytorch_model.bin
configs/clm_models/llama2chat7b_lora.yaml
ADDED
@@ -0,0 +1,37 @@
+_target_: src.models_clm.peft_models.get_peft_model_with_resize_embedding
+model:
+  _target_: src.models_clm.modeling_llama_xformer.LlamaForCausalLM.from_pretrained
+  # _target_: transformers.LlamaForCausalLM.from_pretrained
+  pretrained_model_name_or_path: luodian/llama-7b-hf
+  low_cpu_mem_usage: True
+peft_config:
+  _target_: peft.LoraConfig
+  _convert_: object
+  r: 16
+  lora_alpha: 32
+  modules_to_save:
+    # - embed_tokens
+    # - lm_head
+    - input_layernorm
+    - post_attention_layernorm
+    - norm
+  target_modules:
+    - q_proj
+    - v_proj
+    - k_proj
+    - o_proj
+    - gate_proj
+    - down_proj
+    - up_proj
+  task_type: CAUSAL_LM
+  lora_dropout: 0.05
+
+vocab_size: 32066
+# _target_: src.models_clm.peft_models.get_model_with_resize_embedding
+# model:
+#   # _target_: src.models_clm.modeling_llama_xformer.LlamaForCausalLM.from_pretrained
+#   _target_: transformers.LlamaForCausalLM.from_pretrained
+#   pretrained_model_name_or_path: /apdcephfs_cq3/share_1290939/sijiezhao/model_hub/Llama-2-7b-hf
+#   low_cpu_mem_usage: True
+
+# vocab_size: 32066
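The `peft_config` block above maps one-to-one onto `peft.LoraConfig`. A sketch of the equivalent direct construction using only stock transformers/peft APIs (the repo's `get_peft_model_with_resize_embedding` wrapper presumably performs the resize-and-wrap shown here):

# Sketch: the LoRA setup the config above describes, built directly with peft.
from peft import LoraConfig, get_peft_model
from transformers import LlamaForCausalLM

lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "gate_proj", "down_proj", "up_proj"],
    modules_to_save=["input_layernorm", "post_attention_layernorm", "norm"],
)
model = LlamaForCausalLM.from_pretrained("luodian/llama-7b-hf", low_cpu_mem_usage=True)
model.resize_token_embeddings(32066)  # 32000 base ids + 64 <img_*> + <img> + </img>
model = get_peft_model(model, lora)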
configs/data/george_sdxl.yaml
ADDED
@@ -0,0 +1,19 @@
+_target_: src.data.story_telling.build_multi_datapipes
+_recursive_: False
+datapipes:
+  - _target_: src.data.story_telling.build_long_story_datapipe
+    data_dir: data/json/george_train10
+    image_dir: data/image/george_full
+    max_length: 1280
+    batch_size: 4
+    instruction_prompt: "{instruction}"
+    # turn_sep: '\n'
+    min_aspect_ratio: 0.2
+    min_resolution: 128
+    num_img_in_tokens: 64
+    num_img_out_tokens: 64
+    cycle_count: 10000
+    story_len: 10
+
+sample_weights:
+  - 1.0 # llava
configs/data/george_sft.yaml
ADDED
@@ -0,0 +1,19 @@
+_target_: src.data.story_telling.build_multi_datapipes
+_recursive_: False
+datapipes:
+  - _target_: src.data.story_telling.build_long_story_datapipe
+    data_dir: data/json/george_train10
+    image_dir: data/image/george_full
+    max_length: 1280
+    batch_size: 30
+    instruction_prompt: "{instruction}"
+    # turn_sep: '\n'
+    min_aspect_ratio: 0.2
+    min_resolution: 128
+    num_img_in_tokens: 64
+    num_img_out_tokens: 64
+    cycle_count: 10000
+    story_len: 10
+
+sample_weights:
+  - 1.0 # llava
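The two data configs above differ only in `batch_size` (4 in george_sdxl, 30 in george_sft); both resolve to the same datapipe builder. A sketch of loading one, under the same Hydra assumption as earlier:

# Sketch: building the Curious George story datapipe from its config.
from omegaconf import OmegaConf
from hydra.utils import instantiate

dp_cfg = OmegaConf.load("configs/data/george_sft.yaml")
# `_recursive_: False` in the file defers construction of the inner
# datapipes to build_multi_datapipes itself.
datapipe = instantiate(dp_cfg)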
configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml
ADDED
@@ -0,0 +1,15 @@
+_target_: src.models_ipa.adapter_modules.SDXLAdapter.from_pretrained
+
+resampler:
+  _target_: src.models_ipa.resampler.ResamplerXLV2
+  dim: 1024
+  depth: 4
+  dim_head: 64
+  heads: 16
+  num_queries: 64
+  embedding_dim: 4096
+  output1_dim: 768
+  output2_dim: 1280
+  ff_mult: 4
+
+pretrained_model_path: pretrained/detokenizer/detokenizer_george_adapted/checkpoint-4000/pytorch_model.bin
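This adapted detokenizer is the `sd_adapter` that app.py drives. A sketch tying the config to the `generate` call seen in the diff; the feature shape is an assumption read off `num_queries`/`embedding_dim` above:

# Sketch: instantiating the SDXL adapter and decoding one image, as app.py does.
import torch
from omegaconf import OmegaConf
from hydra.utils import instantiate

sd_adapter = instantiate(
    OmegaConf.load("configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml"))
img_feat = torch.randn(1, 64, 4096)  # hypothetical stand-in for img_gen_feat
image = sd_adapter.generate(image_embeds=img_feat, num_inference_steps=50)[0]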
configs/detokenizer/detokenizer_sdxl_qwen_vit_pretrained.yaml
ADDED
@@ -0,0 +1,15 @@
+_target_: src.models_ipa.adapter_modules.SDXLAdapter.from_pretrained
+
+resampler:
+  _target_: src.models_ipa.resampler.ResamplerXLV2
+  dim: 1024
+  depth: 4
+  dim_head: 64
+  heads: 16
+  num_queries: 64
+  embedding_dim: 4096
+  output1_dim: 768
+  output2_dim: 1280
+  ff_mult: 4
+
+pretrained_model_path: pretrained/detokenizer_pretrained/checkpoint-55000/pytorch_model.bin
configs/discrete_model/discrete_identity.yaml
ADDED
@@ -0,0 +1 @@
+_target_: src.models.discrete_models.DiscreteModleIdentity
configs/processer/qwen_448_transform.yaml
ADDED
@@ -0,0 +1,4 @@
+_target_: src.processer.transforms.get_transform
+type: clip
+image_size: 448
+keep_ratio: False
configs/processer/qwen_448_transform_keep_ratio.yaml
ADDED
@@ -0,0 +1,4 @@
+_target_: src.processer.transforms.get_transform
+type: clip
+image_size: 448
+keep_ratio: True
configs/processer/sd_transform_1024.yaml
ADDED
@@ -0,0 +1,4 @@
+_target_: src.processer.transforms.get_transform
+type: sd
+image_size: 1024
+keep_ratio: True
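The three processer configs parameterize a single `get_transform` factory: CLIP-style preprocessing at 448 px for the Qwen ViT (square or aspect-preserving) and SD-style preprocessing at 1024 px for the diffusion side. A sketch of loading them; that the results are image transforms applied to inputs is assumed from how such factories are conventionally used:

# Sketch: the two image pipelines these configs describe (call style assumed).
from omegaconf import OmegaConf
from hydra.utils import instantiate

vit_transform = instantiate(
    OmegaConf.load("configs/processer/qwen_448_transform_keep_ratio.yaml"))
sd_transform = instantiate(
    OmegaConf.load("configs/processer/sd_transform_1024.yaml"))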
configs/tokenizer/clm_llama_tokenizer.yaml
ADDED
@@ -0,0 +1,2 @@
+_target_: transformers.LlamaTokenizer.from_pretrained
+pretrained_model_name_or_path: pretrained/cvlm_llama2_tokenizer
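This config resolves to a plain `LlamaTokenizer.from_pretrained` call over the bundled `pretrained/cvlm_llama2_tokenizer` directory; app.py then looks up the image-delimiter ids exactly as sketched below (the ids follow from added_tokens.json further down):

# Sketch: loading the bundled tokenizer and resolving the image delimiters.
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("pretrained/cvlm_llama2_tokenizer")
boi_token_id = tokenizer.encode("<img>", add_special_tokens=False)[0]   # 32064
eoi_token_id = tokenizer.encode("</img>", add_special_tokens=False)[0]  # 32065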
configs/visual_tokenizer/qwen_vitg_448.yaml
ADDED
@@ -0,0 +1,10 @@
+_target_: src.models.qwen_visual.VisionTransformerWithAttnPool.from_pretrained
+heads: 16
+image_size: 448
+image_start_id: 151857
+layers: 48
+mlp_ratio: 4.9231
+output_dim: 4096
+patch_size: 14
+width: 1664
+pretrained_model_path: /dataset/syang/pretrained/qwen_vit_G.pt
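The numbers above describe a ViT-bigG-style encoder with an attention-pooling head. The implied geometry, as a worked check:

# Sketch: geometry implied by the qwen_vitg_448 config.
image_size, patch_size = 448, 14
num_patches = (image_size // patch_size) ** 2  # 32 * 32 = 1024 patch tokens
width, heads = 1664, 16
head_dim = width // heads                      # 104-dim attention heads
output_dim = 4096                              # matches the resamplers' kv_dim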
pretrained/cvlm_llama2_tokenizer/added_tokens.json
ADDED
@@ -0,0 +1,68 @@
+{
+  "</img>": 32065,
+  "<img>": 32064,
+  "<img_00000>": 32000,
+  "<img_00001>": 32001,
+  "<img_00002>": 32002,
+  "<img_00003>": 32003,
+  "<img_00004>": 32004,
+  "<img_00005>": 32005,
+  "<img_00006>": 32006,
+  "<img_00007>": 32007,
+  "<img_00008>": 32008,
+  "<img_00009>": 32009,
+  "<img_00010>": 32010,
+  "<img_00011>": 32011,
+  "<img_00012>": 32012,
+  "<img_00013>": 32013,
+  "<img_00014>": 32014,
+  "<img_00015>": 32015,
+  "<img_00016>": 32016,
+  "<img_00017>": 32017,
+  "<img_00018>": 32018,
+  "<img_00019>": 32019,
+  "<img_00020>": 32020,
+  "<img_00021>": 32021,
+  "<img_00022>": 32022,
+  "<img_00023>": 32023,
+  "<img_00024>": 32024,
+  "<img_00025>": 32025,
+  "<img_00026>": 32026,
+  "<img_00027>": 32027,
+  "<img_00028>": 32028,
+  "<img_00029>": 32029,
+  "<img_00030>": 32030,
+  "<img_00031>": 32031,
+  "<img_00032>": 32032,
+  "<img_00033>": 32033,
+  "<img_00034>": 32034,
+  "<img_00035>": 32035,
+  "<img_00036>": 32036,
+  "<img_00037>": 32037,
+  "<img_00038>": 32038,
+  "<img_00039>": 32039,
+  "<img_00040>": 32040,
+  "<img_00041>": 32041,
+  "<img_00042>": 32042,
+  "<img_00043>": 32043,
+  "<img_00044>": 32044,
+  "<img_00045>": 32045,
+  "<img_00046>": 32046,
+  "<img_00047>": 32047,
+  "<img_00048>": 32048,
+  "<img_00049>": 32049,
+  "<img_00050>": 32050,
+  "<img_00051>": 32051,
+  "<img_00052>": 32052,
+  "<img_00053>": 32053,
+  "<img_00054>": 32054,
+  "<img_00055>": 32055,
+  "<img_00056>": 32056,
+  "<img_00057>": 32057,
+  "<img_00058>": 32058,
+  "<img_00059>": 32059,
+  "<img_00060>": 32060,
+  "<img_00061>": 32061,
+  "<img_00062>": 32062,
+  "<img_00063>": 32063
+}
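These ids line up with the `IMG_TOKEN = '<img_{:05d}>'` template in app.py; a quick sketch of the correspondence:

# Sketch: how app.py's IMG_TOKEN template enumerates the ids above.
IMG_TOKEN = '<img_{:05d}>'
img_tokens = [IMG_TOKEN.format(i) for i in range(64)]  # '<img_00000>' ... '<img_00063>'
# added_tokens.json maps these to ids 32000..32063; '<img>' (32064) and
# '</img>' (32065) bracket them, giving the vocab_size of 32066 used above.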
pretrained/cvlm_llama2_tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,40 @@
+{
+  "additional_special_tokens": [
+    {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
pretrained/cvlm_llama2_tokenizer/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
pretrained/cvlm_llama2_tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,573 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<img_00000>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32001": {
+      "content": "<img_00001>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32002": {
+      "content": "<img_00002>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32003": {
+      "content": "<img_00003>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32004": {
+      "content": "<img_00004>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32005": {
+      "content": "<img_00005>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32006": {
+      "content": "<img_00006>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32007": {
+      "content": "<img_00007>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32008": {
+      "content": "<img_00008>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32009": {
+      "content": "<img_00009>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32010": {
+      "content": "<img_00010>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32011": {
+      "content": "<img_00011>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32012": {
+      "content": "<img_00012>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32013": {
+      "content": "<img_00013>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32014": {
+      "content": "<img_00014>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32015": {
+      "content": "<img_00015>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32016": {
+      "content": "<img_00016>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32017": {
+      "content": "<img_00017>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32018": {
+      "content": "<img_00018>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32019": {
+      "content": "<img_00019>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32020": {
+      "content": "<img_00020>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32021": {
+      "content": "<img_00021>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32022": {
+      "content": "<img_00022>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32023": {
+      "content": "<img_00023>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32024": {
+      "content": "<img_00024>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32025": {
+      "content": "<img_00025>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32026": {
+      "content": "<img_00026>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32027": {
+      "content": "<img_00027>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32028": {
+      "content": "<img_00028>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32029": {
+      "content": "<img_00029>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32030": {
+      "content": "<img_00030>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32031": {
+      "content": "<img_00031>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32032": {
+      "content": "<img_00032>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32033": {
+      "content": "<img_00033>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32034": {
+      "content": "<img_00034>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32035": {
+      "content": "<img_00035>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32036": {
+      "content": "<img_00036>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32037": {
+      "content": "<img_00037>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32038": {
+      "content": "<img_00038>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32039": {
+      "content": "<img_00039>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32040": {
+      "content": "<img_00040>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32041": {
+      "content": "<img_00041>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32042": {
+      "content": "<img_00042>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32043": {
+      "content": "<img_00043>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32044": {
+      "content": "<img_00044>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32045": {
+      "content": "<img_00045>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32046": {
+      "content": "<img_00046>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32047": {
+      "content": "<img_00047>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32048": {
+      "content": "<img_00048>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32049": {
+      "content": "<img_00049>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32050": {
+      "content": "<img_00050>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32051": {
+      "content": "<img_00051>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32052": {
+      "content": "<img_00052>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32053": {
+      "content": "<img_00053>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32054": {
+      "content": "<img_00054>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32055": {
+      "content": "<img_00055>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32056": {
+      "content": "<img_00056>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32057": {
+      "content": "<img_00057>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32058": {
+      "content": "<img_00058>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32059": {
+      "content": "<img_00059>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32060": {
+      "content": "<img_00060>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32061": {
+      "content": "<img_00061>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32062": {
+      "content": "<img_00062>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32063": {
+      "content": "<img_00063>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32064": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32065": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<img>",
+    "</img>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
pretrained/detokenizer/detokenizer_george_adapted/checkpoint-4000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:785d4e670ccfdce33b493d0aada60ee5c116918468098b2ed82ae2c28f31e423
+size 6471628187
pretrained/qwen_vit_G.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d951083fc79b07bdb84be61944eb263b8e14572fe2dc4fa80b0447f83064463c
+size 3871440281
pretrained/seed_story/george_sft/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46db6f1beb672085204ca9f7d542f6b62063cbe9970933ca702bccc72f00a4f6
+size 14709979626
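The three-line stanzas above are Git LFS pointer files: the repository tracks only the content hash and byte count, and the binaries themselves are fetched from LFS storage on checkout. Summing the `size` fields gives the weight footprint this commit adds:

# Sketch: download footprint of the LFS-tracked checkpoints added here.
sizes = {
    "pretrained/detokenizer/detokenizer_george_adapted/checkpoint-4000/pytorch_model.bin": 6_471_628_187,
    "pretrained/qwen_vit_G.pt": 3_871_440_281,
    "pretrained/seed_story/george_sft/pytorch_model.bin": 14_709_979_626,
}
total_gb = sum(sizes.values()) / 1e9  # ~25.1 GB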