Spaces:
Sleeping
Sleeping
| from PIL import Image | |
| #importing models | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| from langchain_google_genai import ChatGoogleGenerativeAI | |
| from dotenv import load_dotenv | |
| import os | |
| from langchain.output_parsers import StructuredOutputParser, ResponseSchema | |
| import gradio as gr | |
| from diffusers import DiffusionPipeline,StableDiffusion3Pipeline | |
| from huggingface_hub import login | |
# Pull secrets from a local .env file (if present) into the environment.
load_dotenv()

# Access token for the Hugging Face Hub, read from the "HFToken" variable.
Hugging_face_token = os.getenv('HFToken')

# Only authenticate when a token is actually configured: calling login()
# without one falls back to an interactive prompt, which cannot be answered
# when the app runs headless.
if Hugging_face_token:
    login(Hugging_face_token)
else:
    print('HFToken is not set; skipping Hugging Face login.')
# Image-captioning model: the processor and the generator are both loaded
# from the same BLIP checkpoint.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

# Gemini settings used for the LLM calls: the model name, and the API key
# read from the "google_api_key" environment variable.
GEMINI_MODEL = "gemini-2.0-flash"
GEMINI_API_KEY = os.getenv("google_api_key")
class stable_dif:
    """Choose and load a Stable Diffusion pipeline by size tier.

    Supported tiers are 'small', 'medium' and 'large'; each maps to one
    checkpoint on the Hugging Face Hub.
    """

    # Hub checkpoint loaded for each supported size tier.
    _CHECKPOINTS = {
        'small': "stable-diffusion-v1-5/stable-diffusion-v1-5",
        'medium': "stabilityai/stable-diffusion-3.5-medium",
        'large': "stabilityai/stable-diffusion-3.5-large-turbo",
    }

    def __init__(self, sizes):
        """Remember the requested size tier; nothing is loaded yet."""
        self.sizes = sizes

    def model(self):
        """Load and return the pipeline for the configured size tier.

        Returns:
            The pipeline object produced by ``from_pretrained`` for the
            tier's checkpoint.

        Raises:
            ValueError: If ``sizes`` is not one of the supported tiers.
        """
        # Validate before touching any pipeline class so that a typo in the
        # tier name fails with a clear message instead of an unbound local.
        if not isinstance(self.sizes, str) or self.sizes not in self._CHECKPOINTS:
            supported = ", ".join(repr(tier) for tier in self._CHECKPOINTS)
            raise ValueError(
                f"Unsupported size {self.sizes!r}; expected one of: {supported}."
            )

        # The 'small' tier is loaded through the generic DiffusionPipeline;
        # the other tiers go through StableDiffusion3Pipeline.
        if self.sizes == 'small':
            pipeline_cls = DiffusionPipeline
        else:
            pipeline_cls = StableDiffusion3Pipeline
        return pipeline_cls.from_pretrained(self._CHECKPOINTS[self.sizes])
# Build the module-wide image-generation pipeline from the 'small' tier.
stable = stable_dif(sizes='small')
pipe = stable.model()
# Keys the structured-output parser requires in the LLM reply, in chapter order.
_IMAGE_PROMPT_KEYS = ('prompt 1', 'prompt 2', 'prompt 3', 'prompt 4')


def _caption_image(image_path):
    """Return the caption the BLIP model generates for the image at image_path."""
    # convert() reads the pixel data, so the file handle can be released as
    # soon as the with-block ends; RGB gives the processor a uniform mode.
    with Image.open(image_path) as source:
        raw_image = source.convert('RGB')
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs, min_length=20)
    return processor.decode(out[0], skip_special_tokens=True)


def _write_story(llm, caption, requirement):
    """Ask the LLM for a 4-chapter story and return it as a single line of text."""
    # Adjacent literals with explicit trailing spaces: backslash continuations
    # inside a string would splice source indentation into the prompt.
    query = (
        f' Write a 4 chapters story based on {caption} and '
        f'that fits the following requirements: {requirement}. Give a detailed '
        'description of the charaters appearences.'
    )
    result = llm.invoke(query)
    return result.content.replace('\n', ' ')


def _write_image_prompts(llm, story, style):
    """Ask the LLM for one image prompt per chapter and return them in chapter order."""
    schemas = [
        ResponseSchema(name=key, description='the prompt')
        for key in _IMAGE_PROMPT_KEYS
    ]
    parser = StructuredOutputParser.from_response_schemas(schemas)
    request = (
        f' Based on this story: {story}. Create 4 prompts for stable diffusion '
        'that tells of a maximum of 77 tokens '
        'what happens in each chapters. Describe the characters everytime their '
        f'name is mentioned. Each image should be created in the same exact style {style}.'
    )
    query = request + '\n\n' + parser.get_format_instructions()
    result = llm.invoke(query)
    parsed = parser.parse(result.content)
    # Index by the known keys rather than iterating the parsed dict, so the
    # images follow chapter order whatever order the reply used.
    return [parsed[key] for key in _IMAGE_PROMPT_KEYS]


def image_story_generator(image, requirement, style):
    """Turn an uploaded image into an illustrated four-chapter story.

    The image is captioned, the caption and the user's requirements are sent
    to the LLM to write a story, the story is turned into four image prompts,
    and each prompt is rendered with the module-level diffusion pipeline.

    Args:
        image: Path of the uploaded image file.
        requirement: Free-text requirements the story must satisfy.
        style: Visual style requested for every generated image.

    Returns:
        A tuple ``(images, story)``: the list of generated images (one per
        chapter prompt) and the story text with newlines replaced by spaces.

    Raises:
        gr.Error: If no image was provided.
    """
    if not image:
        # Surface a readable message in the UI instead of failing inside PIL.
        raise gr.Error('Please upload an image before generating a story.')

    caption = _caption_image(image)

    # Both LLM calls use the same settings, so one client serves both.
    llm = ChatGoogleGenerativeAI(
        google_api_key=GEMINI_API_KEY, model=GEMINI_MODEL, temperature=0.3
    )
    story = _write_story(llm, caption, requirement)
    prompts = _write_image_prompts(llm, story, style)

    images = [pipe(prompt).images[0] for prompt in prompts]
    return images, story
# Gradio UI: an image plus two text inputs go in; a gallery of generated
# images and the story text come out.
interface = gr.Interface(
    fn=image_story_generator,
    inputs=[
        gr.Image(type='filepath'),
        # The hint text is passed as label=: given positionally it would be
        # the textbox's initial value and be submitted as real user input.
        gr.Textbox(label='enter story requirements'),
        gr.Textbox(label='pick a style for the images'),
    ],
    outputs=[
        gr.Gallery(),
        gr.Textbox(label='story'),
    ],
    description='Upload an image to start the story generation process.',
)
interface.launch()