Upload app.py
app.py CHANGED
@@ -206,6 +206,7 @@ def load_models():
     print("Loading CLIP")
     clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
     clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
+    assert (CHECKPOINT_PATH / "clip_model.pt").exists()
     if (CHECKPOINT_PATH / "clip_model.pt").exists():
         print("Loading VLM's custom vision model")
         checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu', weights_only=False)
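The first hunk tightens checkpoint handling: with the new assert in place, a missing clip_model.pt aborts model loading immediately instead of silently falling through to the base CLIP weights (the if on the following line is now always true). A minimal sketch of that fail-fast pattern in isolation, with a hypothetical checkpoint directory standing in for the CHECKPOINT_PATH that app.py defines elsewhere:

    from pathlib import Path
    import torch

    CHECKPOINT_PATH = Path("checkpoint")  # hypothetical; the real value is set elsewhere in app.py

    ckpt_file = CHECKPOINT_PATH / "clip_model.pt"
    assert ckpt_file.exists(), f"Missing {ckpt_file}"  # fail fast at startup
    checkpoint = torch.load(ckpt_file, map_location="cpu", weights_only=False)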
@@ -312,88 +313,88 @@ def stream_chat(input_images: List[Image.Image], caption_type: str, caption_leng
 
     for i in range(0, len(input_images), batch_size):
         batch = input_images[i:i+batch_size]
- [... 6 removed lines; content not captured in this view ...]
+
+        for input_image in input_images:
+            try:
+                # Preprocess image
+                # NOTE: I found the default processor for so400M to have worse results than just using PIL directly
+                #image = clip_processor(images=input_image, return_tensors='pt').pixel_values
                 image = input_image.resize((384, 384), Image.LANCZOS)
                 pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
                 pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
- [... 71 removed lines; content not captured in this view ...]
-            caption =
-            all_captions.append(caption)
+                pixel_values = pixel_values.to(device)
+            except ValueError as e:
+                print(f"Error processing image: {e}")
+                print("Skipping this image and continuing...")
+                continue
+
+            # Embed image
+            # This results in Batch x Image Tokens x Features
+            with torch.amp.autocast_mode.autocast(device, enabled=True):
+                vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+                image_features = vision_outputs.hidden_states
+                embedded_images = image_adapter(image_features).to(device)
+
+            # Build the conversation
+            convo = [
+                {
+                    "role": "system",
+                    "content": "You are a helpful image captioner.",
+                },
+                {
+                    "role": "user",
+                    "content": prompt_str,
+                },
+            ]
+
+            # Format the conversation
+            convo_string = tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = True)
+            assert isinstance(convo_string, str)
+
+            # Tokenize the conversation
+            # prompt_str is tokenized separately so we can do the calculations below
+            convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False)
+            prompt_tokens = tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False)
+            assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor)
+            convo_tokens = convo_tokens.squeeze(0)   # Squeeze just to make the following easier
+            prompt_tokens = prompt_tokens.squeeze(0)
+
+            # Calculate where to inject the image
+            eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[0].tolist()
+            assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}"
+
+            preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]   # Number of tokens before the prompt
+
+            # Embed the tokens
+            convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to(device))
+
+            # Construct the input
+            input_embeds = torch.cat([
+                convo_embeds[:, :preamble_len],   # Part before the prompt
+                embedded_images.to(dtype=convo_embeds.dtype),   # Image
+                convo_embeds[:, preamble_len:],   # The prompt and anything after it
+            ], dim=1).to(device)
+
+            input_ids = torch.cat([
+                convo_tokens[:preamble_len].unsqueeze(0),
+                torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),   # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input)
+                convo_tokens[preamble_len:].unsqueeze(0),
+            ], dim=1).to(device)
+            attention_mask = torch.ones_like(input_ids)
+
+            # Debugging
+            #print(f"Input to model: {repr(tokenizer.decode(input_ids[0]))}")
+
+            generate_ids = text_model.generate(input_ids=input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, do_sample=True,
+                                               suppress_tokens=None, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature)
+
+            # Trim off the prompt
+            generate_ids = generate_ids[:, input_ids.shape[1]:]
+            if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+                generate_ids = generate_ids[:, :-1]
+
+            caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+            all_captions.append(caption.strip())
 
         if pbar:
             pbar.update(len(batch))
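The heart of the second hunk is the splice: the conversation is tokenized once, the index of the second <|eot_id|> locates the end of the user turn, and subtracting the prompt length gives preamble_len, the boundary at which the image embeddings are inserted so they sit immediately before the prompt text. A toy worked example of that arithmetic (all shapes and positions are made up; 4096 stands in for the text model's hidden size):

    import torch

    # Hypothetical sizes: 20 conversation tokens, a 5-token prompt, 9 image tokens
    convo_embeds = torch.randn(1, 20, 4096)       # embedded conversation tokens
    embedded_images = torch.randn(1, 9, 4096)     # output of the image adapter
    eot_id_indices = [3, 17]                      # made-up positions of the two <|eot_id|> tokens

    preamble_len = eot_id_indices[1] - 5          # = 12: the prompt occupies indices 12..16

    input_embeds = torch.cat([
        convo_embeds[:, :preamble_len],           # system turn + user header
        embedded_images,                          # image tokens land right before the prompt
        convo_embeds[:, preamble_len:],           # the prompt and the generation header
    ], dim=1)
    assert input_embeds.shape[1] == 20 + 9        # sequence grew by the image token count

The parallel input_ids tensor is padded with zeros over the image span purely so that its length matches input_embeds, and the attention mask is all ones because none of it is padding in the usual sense.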
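One subtlety in the generation step: generate() receives both input_ids and inputs_embeds, so (on transformers versions that accept this combination) the returned sequence begins with the prompt ids, which is why the code slices at input_ids.shape[1] before decoding and then drops a single trailing stop token. A minimal sketch of just that trimming logic (the token ids are hypothetical stand-ins for tokenizer.eos_token_id and the <|eot_id|> id):

    import torch

    eos_id, eot_id = 2, 9                                   # hypothetical stop-token ids
    input_ids = torch.zeros((1, 10), dtype=torch.long)      # the spliced prompt ids
    generate_ids = torch.cat([input_ids, torch.tensor([[42, 7, eot_id]])], dim=1)

    generate_ids = generate_ids[:, input_ids.shape[1]:]     # drop the echoed prompt
    if generate_ids[0][-1] in (eos_id, eot_id):             # drop one trailing stop token
        generate_ids = generate_ids[:, :-1]
    assert generate_ids.tolist() == [[42, 7]]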