Runtime error
Initial commit
- app.py +296 -0
- requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,296 @@
from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from PIL import Image
import requests
import traceback

class Image2Text:
    def __init__(self):
        # Load the GIT model fine-tuned on COCO captions
        preprocessor_git_large_coco = AutoProcessor.from_pretrained("microsoft/git-large-coco")
        model_git_large_coco = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.preprocessor = preprocessor_git_large_coco
        self.model = model_git_large_coco
        self.model.to(self.device)

    def image_description(
        self,
        image_url,
        max_length=50,
        temperature=0.1,
        use_sample_image=False,
    ):
        """
        Generate a description for the image at the given URL.

        Parameters
        ----------
        image_url: str
            URL of the image to describe.
        max_length: int
            The max length of the generated description.
        temperature: float
            Accepted for API symmetry; generation is currently greedy, so it is unused.
        use_sample_image: bool
            If True, ignore image_url and describe a sample COCO image instead.

        Returns
        -------
        str
            The generated image description.
        """
        caption_git_large_coco = ""

        if use_sample_image:
            image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"

        image = Image.open(requests.get(image_url, stream=True).raw)

        # Generate a description for the image using the GIT COCO model
        try:
            caption_git_large_coco = self._generate_description(image, max_length, False).strip()
            return caption_git_large_coco
        except Exception as e:
            print(e)
            traceback.print_exc()

    def _generate_description(
        self,
        image,
        max_length=50,
        use_float_16=False,
    ):
        """
        Generate a caption for the given image.

        Parameters
        ----------
        image: PIL.Image
            The image to generate a caption for.
        max_length: int
            The max length of the generated caption.
        use_float_16: bool
            Whether to use float16 precision. This can speed up inference,
            but may lead to worse results. Currently unused.

        Returns
        -------
        str
            The generated caption.
        """
        pixel_values = self.preprocessor(images=image, return_tensors="pt").pixel_values.to(self.device)
        generated_ids = self.model.generate(
            pixel_values=pixel_values,
            max_length=max_length,
        )
        generated_caption = self.preprocessor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return generated_caption

import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

class Social_Media_Captioner:
    def __init__(self, use_finetuned: bool = True, temp=0.1):
        self.use_finetuned = use_finetuned
        self.MODEL_NAME = "vilsonrodrigues/falcon-7b-instruct-sharded"
        self.peft_model_name = "ayush-vatsal/caption_qlora_finetune"
        self.model_loaded = False
        self.device = "cuda:0"

        self._load_model()
        if not self.model_loaded:
            # Surface load failures immediately rather than failing later
            # with a confusing AttributeError on self.model
            raise RuntimeError("Model failed to load; see the error printed above")

        self.generation_config = self.model.generation_config
        self.generation_config.max_new_tokens = 50
        self.generation_config.temperature = temp
        self.generation_config.top_p = 0.7
        self.generation_config.do_sample = True  # temperature/top_p only take effect when sampling
        self.generation_config.num_return_sequences = 1
        self.generation_config.pad_token_id = self.tokenizer.eos_token_id
        self.generation_config.eos_token_id = self.tokenizer.eos_token_id

        # e.g. [{"image_description": "A man", "caption": ["A man"]}]
        self.cache: list[dict] = []

    def _load_model(self):
        try:
            # 4-bit NF4 quantization so the 7B model fits in a single GPU's memory
            self.bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )

            if self.use_finetuned:
                # Load the base model referenced by the QLoRA adapter once,
                # then attach the fine-tuned adapter weights
                self.peft_config = PeftConfig.from_pretrained(self.peft_model_name)
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.peft_config.base_model_name_or_path,
                    return_dict=True,
                    quantization_config=self.bnb_config,
                    device_map="auto",
                    trust_remote_code=True,
                )
                self.model = PeftModel.from_pretrained(self.model, self.peft_model_name)

                # Tokenizer for the adapter's base model
                self.tokenizer = AutoTokenizer.from_pretrained(self.peft_config.base_model_name_or_path)
            else:
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.MODEL_NAME,
                    device_map="auto",
                    trust_remote_code=True,
                    quantization_config=self.bnb_config,
                )
                self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)

            self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model_loaded = True
            print("Model loaded successfully")

        except Exception as e:
            print(e)
            self.model_loaded = False

    def inference(self, input_text: str, use_cached=True, cache_generation=True) -> str | None:
        if not self.model_loaded:
            raise Exception("Model not loaded")

        try:
            prompt = Social_Media_Captioner._prompt(input_text)

            # Return the most recent cached caption for this description, if any
            if use_cached:
                for item in self.cache:
                    if item['image_description'] == input_text:
                        return item['caption'][-1]

            encoding = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.inference_mode():
                outputs = self.model.generate(
                    input_ids=encoding.input_ids,
                    attention_mask=encoding.attention_mask,
                    generation_config=self.generation_config,
                )

            # Keep only the text the model produced between the quotes after 'Caption: "'
            generated_caption = (self.tokenizer.decode(outputs[0], skip_special_tokens=True).split('Caption: "')[-1]).split('"')[0]

            if cache_generation:
                for item in self.cache:
                    if item['image_description'] == input_text:
                        item['caption'].append(generated_caption)
                        break
                else:
                    self.cache.append({
                        'image_description': input_text,
                        'caption': [generated_caption]
                    })

            return generated_caption
        except Exception as e:
            print(e)
            return None

    @staticmethod
    def _prompt(input_text="A man walking alone in the road"):
        if input_text is None:
            raise Exception("Enter a valid input text to generate a valid prompt")

        return f"""
Convert the given image description to an appropriate metaphoric caption
Description: {input_text}
Caption:
""".strip()

    @staticmethod
    def get_trainable_parameters(model):
        trainable_params = 0
        all_param = 0
        for _, param in model.named_parameters():
            all_param += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        return f"trainable_params: {trainable_params} || all_params: {all_param} || Percentage of trainable params: {100 * trainable_params / all_param}"

    def __repr__(self):
        return f"""
Base Model Name: {self.MODEL_NAME}
PEFT Model Name: {self.peft_model_name}
Using PEFT Finetuned Model: {self.use_finetuned}
Model: {self.model}

------------------------------------------------------------

{Social_Media_Captioner.get_trainable_parameters(self.model)}
"""

class Captions:
    def __init__(self, use_finetuned_LLM: bool = True, temp_LLM=0.1):
        self.image_to_text = Image2Text()
        self.LLM = Social_Media_Captioner(use_finetuned_LLM, temp_LLM)

    def generate_captions(
        self,
        image,
        image_url=None,
        max_length_GIT=50,
        temperature_GIT=0.1,
        use_sample_image_GIT=False,
        use_cached_LLM=True,
        cache_generation_LLM=True,
    ):
        # Stage 1: describe the image with GIT; Stage 2: rewrite the description as a caption
        if image_url:
            image_description = self.image_to_text.image_description(
                image_url,
                max_length=max_length_GIT,
                temperature=temperature_GIT,
                use_sample_image=use_sample_image_GIT,
            )
        else:
            image_description = self.image_to_text._generate_description(image, max_length=max_length_GIT)
        captions = self.LLM.inference(image_description, use_cached=use_cached_LLM, cache_generation=cache_generation_LLM)
        return captions

caption_generator = Captions()

import gradio as gr

def setup(image):
    return caption_generator.generate_captions(image=image)

iface = gr.Interface(
    fn=setup,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="Caption"),
)

iface.launch()
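For reference, a minimal smoke test of the pipeline outside Gradio might look like the sketch below. It calls only methods defined in app.py; the variable names are illustrative, and it assumes a CUDA GPU is available, since Social_Media_Captioner hard-codes cuda:0.

    # Sketch: exercise the two stages of the pipeline directly (assumes a CUDA GPU)
    pipeline = Captions()

    # Stage 1: GIT produces a literal description of the bundled COCO sample image
    description = pipeline.image_to_text.image_description(image_url=None, use_sample_image=True)
    print("Description:", description)

    # Stage 2: the QLoRA-finetuned Falcon rewrites the description as a caption
    caption = pipeline.LLM.inference(description)
    print("Caption:", caption)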
requirements.txt ADDED
@@ -0,0 +1 @@
gradio==3.36.0
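Note that app.py imports far more than Gradio (torch, transformers, peft, bitsandbytes, datasets, pandas, Pillow, requests), so this one-line requirements file cannot satisfy it unless the Space image already ships those packages; that mismatch is a plausible cause of the runtime error shown above. A fuller pin list might look like the following sketch; the unversioned entries are assumptions, not tested pins.

    # Sketch of the dependencies app.py actually imports; only gradio's pin comes from the commit
    gradio==3.36.0
    torch
    transformers
    peft
    bitsandbytes
    accelerate    # needed for device_map="auto" loading
    datasets
    pandas
    Pillow
    requests
    einops        # commonly required by Falcon's trust_remote_code modeling files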