Upload 2 files
- app.py +1 -1
- joycaption.py +29 -24
app.py
CHANGED

@@ -4,7 +4,7 @@ from joycaption import stream_chat_mod, get_text_model, change_text_model, get_r
 
 JC_TITLE_MD = "<h1><center>JoyCaption Alpha One Mod</center></h1>"
 JC_DESC_MD = """This space is mod of [fancyfeast/joy-caption-alpha-one](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-one),
-[Wi-zz/joy-caption-pre-alpha](https://huggingface.co/Wi-zz/joy-caption-pre-alpha)"""
+[Wi-zz/joy-caption-pre-alpha](https://huggingface.co/Wi-zz/joy-caption-pre-alpha). Thanks to [dominic1021](https://huggingface.co/dominic1021)"""
 
 css = """
 .info {text-align:center; !important}
joycaption.py
CHANGED

@@ -19,10 +19,14 @@ from PIL import Image
 import torchvision.transforms.functional as TVF
 import gc
 from peft import PeftConfig
+from typing import Union
 
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
+# Define the base directory
+BASE_DIR = Path(__file__).resolve().parent
+
 device = "cuda" if torch.cuda.is_available() else "cpu"
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 use_inference_client = False
@@ -38,7 +42,7 @@ llm_models = {
 
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 MODEL_PATH = list(llm_models.keys())[0]
-CHECKPOINT_PATH = Path("9em124t2-499968")
+CHECKPOINT_PATH = BASE_DIR / Path("9em124t2-499968")
 LORA_PATH = CHECKPOINT_PATH / "text_model"
 TITLE = "<h1><center>JoyCaption Alpha One (2024-09-20a)</center></h1>"
 CAPTION_TYPE_MAP = {
@@ -137,36 +141,41 @@ text_model_client = None
 text_model = None
 image_adapter = None
 peft_config = None
-def load_text_model(model_name: str=MODEL_PATH, gguf_file: str
-    global tokenizer
-    global text_model
-    global image_adapter
-    global peft_config
-    global text_model_client #
-    global use_inference_client #
+def load_text_model(model_name: str=MODEL_PATH, gguf_file: Union[str, None]=None, is_nf4: bool=True):
+    global tokenizer, text_model, image_adapter, peft_config, text_model_client, use_inference_client
     try:
         from transformers import BitsAndBytesConfig
         nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
+
         print("Loading tokenizer")
         if gguf_file: tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=gguf_file, use_fast=True, legacy=False)
         else: tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
         assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}"
+
         print(f"Loading LLM: {model_name}")
         if gguf_file:
-            if device == "cpu":
-
-
+            if device == "cpu":
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
+            elif is_nf4:
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
+            else:
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
         else:
-            if device == "cpu":
-
-
+            if device == "cpu":
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
+            elif is_nf4:
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
+            else:
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
+
         if LORA_PATH.exists():
             print("Loading VLM's custom text model")
             if is_nf4: peft_config = PeftConfig.from_pretrained(LORA_PATH, device_map=device, quantization_config=nf4_config)
             else: peft_config = PeftConfig.from_pretrained(LORA_PATH, device_map=device)
             text_model.add_adapter(peft_config)
             text_model.enable_adapters()
+
         print("Loading image adapter")
         image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False).eval().to("cpu")
         image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=True))
@@ -186,7 +195,7 @@ clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
 clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
 if (CHECKPOINT_PATH / "clip_model.pt").exists():
     print("Loading VLM's custom vision model")
-    checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu')
+    checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu', weights_only=True)
     checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()}
     clip_model.load_state_dict(checkpoint)
     del checkpoint
@@ -197,10 +206,9 @@ clip_model.eval().requires_grad_(False).to(device)
 # Image Adapter
 load_text_model()
 
-
 @spaces.GPU()
 @torch.no_grad()
-def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str
+def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: Union[str, int]) -> str:
     torch.cuda.empty_cache()
 
     # 'any' means no length specified
@@ -276,12 +284,10 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
 
     return caption.strip()
 
-
 @spaces.GPU()
 @torch.no_grad()
-def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str
-    global use_inference_client
-    global text_model
+def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: Union[str, int], max_new_tokens: int=300, top_p: float=0.9, temperature: float=0.6, progress=gr.Progress(track_tqdm=True)) -> str:
+    global use_inference_client, text_model
     torch.cuda.empty_cache()
     gc.collect()
 
@@ -437,10 +443,9 @@ def get_repo_gguf(repo_id: str):
 
 
 @spaces.GPU()
-def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False, gguf_file: str
+def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False, gguf_file: Union[str, None]=None,
                       is_nf4: bool=True, progress=gr.Progress(track_tqdm=True)):
-    global use_inference_client
-    global llm_models
+    global use_inference_client, llm_models
     use_inference_client = use_client
     try:
         if not is_repo_name(model_name) or not is_repo_exists(model_name):
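The CHECKPOINT_PATH change anchors the checkpoint directory to the script's own location instead of the process working directory, so the Space still finds 9em124t2-499968 when launched from somewhere else. A minimal sketch of the pattern, assuming the checkpoint folder sits next to the script (the existence check is illustrative and not part of the Space):

from pathlib import Path

# Resolve relative to this file, not the current working directory.
BASE_DIR = Path(__file__).resolve().parent
CHECKPOINT_PATH = BASE_DIR / "9em124t2-499968"

if not CHECKPOINT_PATH.exists():
    # Illustrative guard; the Space assumes the checkpoint ships with the repo.
    raise FileNotFoundError(f"Checkpoint directory not found: {CHECKPOINT_PATH}")

The torch.load call for clip_model.pt also gains weights_only=True, which restricts deserialization to tensors and plain containers rather than arbitrary pickled objects.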
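The rewritten load_text_model chooses between three loading paths: a GGUF checkpoint via the transformers gguf_file argument, 4-bit NF4 quantization via BitsAndBytesConfig, or plain bfloat16 weights. A condensed, hedged sketch of that selection logic as a standalone helper (the function name and the example repo id are placeholders, not values from the Space):

import torch
from typing import Union
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_llm(model_name: str, gguf_file: Union[str, None] = None,
             is_nf4: bool = True, device: str = "cuda"):
    # NF4 double-quantized 4-bit settings, matching the config used in the diff.
    nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                    bnb_4bit_use_double_quant=True,
                                    bnb_4bit_compute_dtype=torch.bfloat16)
    if gguf_file:
        # transformers converts GGUF weights back to a regular torch model on load.
        return AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file,
                                                    device_map=device,
                                                    torch_dtype=torch.bfloat16).eval()
    if is_nf4 and device == "cuda":
        # bitsandbytes 4-bit quantization needs a CUDA device.
        return AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config,
                                                    device_map=device,
                                                    torch_dtype=torch.bfloat16).eval()
    # CPU or explicit full-precision request: load plain bfloat16 weights.
    return AutoModelForCausalLM.from_pretrained(model_name, device_map=device,
                                                torch_dtype=torch.bfloat16).eval()

# Hypothetical usage; the repo id is a placeholder, not one of the Space's bundled models.
# text_model = load_llm("some-org/some-llama-8b-instruct", is_nf4=True)

Unlike the Space's load_text_model, this sketch skips the tokenizer, the PEFT LoRA adapter under LORA_PATH, and the image adapter; it only shows how the backbone LLM loading path is selected.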
|