# for image captioning
import PIL
import torch
from torchvision import transforms
import transformers
transformers.utils.move_cache()  # migrate any old-format transformers cache to the current layout
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
from transformers import BlipProcessor, BlipForConditionalGeneration
from accelerate import Accelerator
def remove_unlikely_words(prompt: str) -> str:
    """
    Removes unlikely words from a prompt.

    Args:
        prompt: The text prompt to be cleaned.

    Returns:
        The cleaned prompt with unlikely words removed.
    """
    unlikely_words = []
    # Year phrases in several forms ("1950s", "1950", "year 1950", "circa 1950") ...
    a1_list = [f'{i}s' for i in range(1900, 2000)]
    a2_list = [f'{i}' for i in range(1900, 2000)]
    a3_list = [f'year {i}' for i in range(1900, 2000)]
    a4_list = [f'circa {i}' for i in range(1900, 2000)]
    # ... plus token-spaced variants of the digits ("1 9 5 0 s", "year 1 9 5 0", ...),
    # which caption models sometimes emit.
    b1_list = [f"{year[0]} {year[1]} {year[2]} {year[3]} s" for year in a1_list]
    b2_list = [f"{year[0]} {year[1]} {year[2]} {year[3]}" for year in a1_list]
    b3_list = [f"year {year[0]} {year[1]} {year[2]} {year[3]}" for year in a1_list]
    b4_list = [f"circa {year[0]} {year[1]} {year[2]} {year[3]}" for year in a1_list]
    words_list = [
        "black and white,", "black and white", "black & white,", "black & white", "circa",
        "balck and white,", "monochrome,", "black-and-white,", "black-and-white photography,",
        "black - and - white photography,", "monochrome bw,", "black white,", "black an white,",
        "grainy footage,", "grainy footage", "grainy photo,", "grainy photo", "b&w photo",
        "back and white", "back and white,", "monochrome contrast", "monochrome", "grainy",
        "grainy photograph,", "grainy photograph", "low contrast,", "low contrast", "b & w",
        "grainy black-and-white photo,", "bw", "bw,", "grainy black-and-white photo",
        "b & w,", "b&w,", "b&w!,", "b&w", "black - and - white,", "bw photo,",
        "black-and-white photo,", "black-and-white photo", "black - and - white photography",
        "b&w photo,", "monochromatic photo,", "grainy monochrome photo,", "monochromatic",
        "blurry photo,", "blurry,", "blurry photography,", "monochromatic photo",
        "black - and - white photograph,", "black - and - white photograph", "black on white,",
        "black on white", "black-and-white", "historical image,", "historical picture,",
        "historical photo,", "historical photograph,", "archival photo,", "taken in the early",
        "taken in the late", "taken in the", "historic photograph,", "restored,", "restored",
        "historical photo", "historical setting,",
        "historic photo,", "historic", "desaturated!!,", "desaturated!,", "desaturated,", "desaturated",
        "taken in", "shot on leica", "shot on leica sl2", "sl2",
        "taken with a leica camera", "leica sl2", "leica", "setting",
        "overcast day", "overcast weather", "slight overcast", "overcast",
        "picture taken in", "photo taken in",
        ", photo", ", photograph",
        ",,", ",,,", ",,,,", " ,",
    ]
    unlikely_words.extend(a1_list)
    unlikely_words.extend(a2_list)
    unlikely_words.extend(a3_list)
    unlikely_words.extend(a4_list)
    unlikely_words.extend(b1_list)
    unlikely_words.extend(b2_list)
    unlikely_words.extend(b3_list)
    unlikely_words.extend(b4_list)
    unlikely_words.extend(words_list)
    # Order matters: longer phrases are listed before their substrings
    # (e.g. "black and white," is replaced before "black and white").
    for word in unlikely_words:
        prompt = prompt.replace(word, "")
    return prompt
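
# Hedged example (illustrative input, not actual model output): blacklisted phrases
# and year tokens are deleted outright, so whitespace and punctuation artifacts can
# remain in the cleaned prompt:
#   remove_unlikely_words("a photography of a city street, black and white, 1950s")
#   -> "a photography of a city street,  " (approximately)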
def blip_image_captioning(image, device, processor=None, generator=None, conditional="a photography of"):
    # Lazily load the processor and model if they were not passed in
    if processor is None:
        processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-large"
        )
    if generator is None:
        generator = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large",
            torch_dtype=torch.float16
        ).to(device)
    # Prepare inputs, casting floating tensors to float16 to match the model's dtype
    inputs = processor(
        image,
        text=conditional,
        return_tensors="pt"
    ).to(device, torch.float16)
    # Generate the caption; max_new_tokens bounds only the newly generated tokens
    out = generator.generate(**inputs, max_new_tokens=20)
    caption = processor.decode(out[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    caption = remove_unlikely_words(caption)
    return caption
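
# Hedged usage sketch: `_demo_caption` is illustrative, not part of the original file.
# Passing processor=None and generator=None exercises the lazy-loading branch above;
# a CUDA device is assumed since the model is loaded in float16.
def _demo_caption(image: PIL.Image.Image, device: str = "cuda") -> str:
    return blip_image_captioning(image, device, processor=None, generator=None)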
def apply_color(image: PIL.Image.Image, color_map: PIL.Image.Image) -> PIL.Image.Image:
    # Convert both images to LAB color space
    image_lab = image.convert('LAB')
    color_map_lab = color_map.convert('LAB')
    # Keep luminance (L) from the input image; take chroma (a, b) from the color map
    l, a, b = image_lab.split()
    _, a_map, b_map = color_map_lab.split()
    merged_lab = PIL.Image.merge('LAB', (l, a_map, b_map))
    # Convert the merged LAB image back to RGB color space
    result_rgb = merged_lab.convert('RGB')
    return result_rgb
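
# Hedged end-to-end sketch: `gray` and `prediction` are placeholder names for a
# grayscale source (converted to RGB) and a same-scene colorized output, e.g. from
# the SDXL ControlNet pipeline imported above. Not part of the original file.
def _demo_recolor(gray: PIL.Image.Image, prediction: PIL.Image.Image) -> PIL.Image.Image:
    prediction = prediction.resize(gray.size)  # L/a/b channels must align pixel-for-pixel
    return apply_color(gray.convert("RGB"), prediction)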