#!/usr/bin/env python
# coding: utf-8

# # Set up environment

# In[2]:

get_ipython().system('pip install --upgrade -q accelerate bitsandbytes')

# In[ ]:

# Install transformers from the `llava_improvements` branch of Niels Rogge's fork.
# Note: each get_ipython().system() call runs in its own shell, so a separate
# `cd transformers` would not persist; pip is pointed at the cloned folder instead.
get_ipython().system('rm -rf transformers')
get_ipython().system('git clone -b llava_improvements https://github.com/NielsRogge/transformers.git')
get_ipython().system('pip install -q ./transformers')

# In[ ]:

# Alternatively, install the latest transformers straight from the main repository.
get_ipython().system('pip install git+https://github.com/huggingface/transformers.git')
# ## Load model and processor

# In[ ]:

import torch
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration

# Load the weights in 4-bit (with float16 compute) to cut GPU memory usage.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "llava-hf/llava-1.5-7b-hf"

processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)
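# As a quick sanity check (a sketch, not part of the original notebook), you can
# print the approximate size of the quantized weights with the
# `get_memory_footprint()` helper that transformers models expose; the number
# covers weights and buffers only, not activation memory.

# In[ ]:

print(f"Weights occupy ~{model.get_memory_footprint() / 1e9:.2f} GB")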
# ## Prepare image and text for the model

# In[ ]:

from PIL import Image

image1 = Image.open('data/clock.jpeg')
display(image1)
# In the prompt, you can refer to images using the special <image> token. To indicate which text comes from a human vs. the model, one uses USER and ASSISTANT respectively. The format looks as follows:
#
# ```bash
# USER: <image>\n<prompt>\nASSISTANT:
# ```
#
# In other words, you always need to end your prompt with `ASSISTANT:`. Here we use the batched interface (i.e. the processor takes a list of prompts); the list below contains a single active prompt, with a more elaborate alternative left in comments.
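# Recent transformers releases also ship a chat template for the LLaVa
# processor, so you don't have to hand-assemble the USER/ASSISTANT markers. A
# minimal sketch, assuming a version where `processor.apply_chat_template` is
# available:

# In[ ]:

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
template_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
print(template_prompt)  # should resemble the USER/ASSISTANT format shown above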
# In[ ]:

caption = 'an old fashioned clock sitting on top of a table'
user_input = "This is an intricately crafted old-fashioned clock created by a skilled Moroccan artisan back in 1988 from Chefchaoune... it reminds me of my mother."

prompts = [
    f"USER: <image>\nBased on the caption '{caption}' and the following user input: '{user_input}', generate a detailed product name and description for this Moroccan artisanal item; the description should be minimal yet give the essence of the product and convince people to buy or express their interest in it.\nASSISTANT:"
    # f"""
    # USER: <image>\nBased on the image caption '{caption}' and the following background information: '{user_input}', generate an attention-grabbing yet concise product name and description for this authentic Moroccan artisanal item. The description should:
    # - Highlight the key features and unique selling points that make this product exceptional and desirable.
    # - Convey the cultural significance, craftsmanship, and rich heritage behind the item's creation.
    # - Use evocative language that resonates with potential buyers and piques their interest in owning this one-of-a-kind piece.
    # - Be concise, direct, and persuasive, leaving the reader eager to learn more or acquire the product.
    # Your response should follow this format:
    # Product Name: [Compelling and relevant product name]
    # Product Description: [Concise yet captivating description addressing the points above]
    # ASSISTANT:"""
]

inputs = processor(prompts, images=[image1], padding=True, return_tensors="pt").to("cuda")

for k, v in inputs.items():
    print(k, v.shape)
# ## Autoregressively generate completion
#
# Finally, we simply let the model predict the next tokens given the images + prompt. Of course, one can adjust all the [generation parameters](https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/text_generation#transformers.GenerationMixin.generate). By default, greedy decoding is used.

# In[ ]:

output = model.generate(**inputs, max_new_tokens=200)
generated_text = processor.batch_decode(output, skip_special_tokens=True)

for text in generated_text:
    # Keep only the model's answer, dropping the echoed prompt.
    print(text.split("ASSISTANT:")[-1])
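# To illustrate the generation parameters mentioned above (a sketch, not part
# of the original notebook), here is the same call with sampling instead of
# greedy decoding:

# In[ ]:

sampled = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,    # sample from the distribution instead of taking the argmax
    temperature=0.7,   # <1 sharpens the distribution, >1 flattens it
    top_p=0.9,         # nucleus sampling: keep the smallest token set with mass 0.9
)
print(processor.batch_decode(sampled, skip_special_tokens=True)[0].split("ASSISTANT:")[-1])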
# ## Pipeline API
#
# Alternatively, you can leverage the [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) API, which abstracts all of the logic above away for the user. We also provide the quantization config to make sure we leverage 4-bit inference.

# In[ ]:

from transformers import pipeline

pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})

# In[ ]:

max_new_tokens = 200
prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"

outputs = pipe(image1, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})

# In[ ]:

print(outputs[0]["generated_text"])
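# Pipelines also accept a list of images for batched inference. A sketch, under
# the assumption that the `prompt` kwarg is applied to every image in the batch
# (the same image is passed twice here for lack of a second file):

# In[ ]:

batch_outputs = pipe([image1, image1], prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
for out in batch_outputs:
    print(out[0]["generated_text"])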