Spaces:
Runtime error
Runtime error
| import whisper | |
| import gradio as gr | |
| from keybert import KeyBERT | |
| import random as r | |
| from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler | |
| import torch | |
| from PIL import Image | |
| import time | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import PIL | |
# Speech-to-text model, loaded once at import time and shared by transcribe().
model = whisper.load_model("base")

# Checkpoint used for image generation. "stabilityai/stable-diffusion-2" and
# "TaiMingLu/diffusion-architecture" were earlier candidates.
model_id = 'prompthero/midjourney-v4-diffusion'

# NOTE: this scheduler is loaded but not passed to the pipeline below, so it
# does not influence generation. It is kept so the module-level name stays
# available; wire it in with `scheduler=scheduler` if Euler sampling is wanted.
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")

# float16 weights are meant for GPU inference. Fall back to float32 on CPU so
# the app can still start on hosts without CUDA instead of failing at import.
_device = "cuda" if torch.cuda.is_available() else "cpu"
_dtype = torch.float16 if _device == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=_dtype)
pipe = pipe.to(_device)
def transcribe(audio, prompt_num, user_keywords):
    """Transcribe a recorded audio file with the module-level Whisper model.

    Args:
        audio: Path of the audio file to transcribe.
        prompt_num: Requested number of prompts/images; coerced with ``int``.
        user_keywords: Extra comma-delimited keywords, passed through as-is.

    Returns:
        A ``(text, prompt_count, user_keywords)`` tuple, shaped so it can be
        fed straight into ``keywords``.

    Raises:
        ValueError: If no audio path was supplied.
    """
    if not audio:
        # Fail early with a readable message instead of an error raised from
        # deep inside the audio loader.
        raise ValueError("No audio was provided; record something before submitting.")

    # A single transcribe() call covers the whole recording and also reports
    # the detected language, so the separate 30-second decode pass (which was
    # only used for a debug print) and the second audio load are not needed.
    final_result = model.transcribe(audio)
    print(f"Detected language: {final_result['language']}")
    print(final_result["text"])
    return final_result["text"], int(prompt_num), user_keywords
# Prefix added once to every prompt. Presumably the trigger phrase for the
# checkpoint configured in model_id -- confirm against the model card.
_STYLE_TOKEN = "mdjrny-v4 style"

# Style suffixes; one is picked at random to close each prompt.
_STYLE_PROMPTS = (
    "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting",
    "detailed, colourful, unreal engine, octane render, blender effect",
    "70mm, Canon EOS 6D Mark II, 4k, 35mm (FX, Full-Frame), f/2.5, extremely detailed, very high details, photorealistic, hi res, hdr, UHD, hyper-detailed, ultra-realistic, vibrant, centered, vivid colors, Wide angle, zoom out",
    "detailed, soft ambiance, japanese influence, unreal engine 5, octane render",
    "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting --v 4",
)

# KeyBERT instance, created on first use and then reused across requests.
_KW_MODEL = None


def _keyword_model():
    """Return the shared KeyBERT instance, creating it on first use."""
    global _KW_MODEL
    if _KW_MODEL is None:
        _KW_MODEL = KeyBERT()
    return _KW_MODEL


def _extract_keyword_sets(text):
    """Return four keyphrase lists, one per KeyBERT extraction strategy."""
    if not text or not text.strip():
        # Nothing to extract from blank text, so skip loading KeyBERT.
        return [[], [], [], []]

    strategies = (
        {"stop_words": None},
        {"stop_words": 'english', "use_maxsum": True, "nr_candidates": 20, "top_n": 5},
        {"stop_words": 'english', "use_mmr": True, "diversity": 0.7},
        {"stop_words": 'english', "use_mmr": True, "diversity": 0.2},
    )
    kw_model = _keyword_model()
    return [
        # extract_keywords yields (phrase, score) pairs; only the phrase is kept.
        [pair[0] for pair in kw_model.extract_keywords(
            text, keyphrase_ngram_range=(1, 3), **options)]
        for options in strategies
    ]


def _build_prompt(keyword_sets, user_terms):
    """Assemble one comma-delimited prompt from user terms and random keyphrases."""
    parts = [_STYLE_TOKEN, *user_terms]

    # One randomly favoured set contributes two phrases, every other set one.
    favoured = r.randrange(len(keyword_sets))
    order = [favoured, favoured]
    order.extend(i for i in range(len(keyword_sets)) if i != favoured)

    # Skip sets that came back empty so r.choice never sees an empty list.
    parts.extend(r.choice(keyword_sets[i]) for i in order if keyword_sets[i])

    parts.append(r.choice(_STYLE_PROMPTS))
    return ', '.join(parts)


def keywords(text, prompt_num, user_keywords):
    """Turn a transcription into image prompts and render one image per prompt.

    Args:
        text: Transcribed speech to mine for keyphrases.
        prompt_num: Number of prompts (and images) to generate. Values that
            are missing, zero or negative produce no prompts.
        user_keywords: Comma-delimited extra terms added to every prompt;
            blank entries and surrounding whitespace are dropped.

    Returns:
        A ``(images, transcription, keyword_pool, generated_prompts)`` tuple:
        the generated images, the unchanged input text, the flat list of all
        extracted keyphrases, and the prompts the images were generated from.
    """
    transcription = text

    keyword_sets = _extract_keyword_sets(text)
    keyword_pool = [phrase for phrases in keyword_sets for phrase in phrases]
    print("keywords: ", keyword_pool, "length: ", len(keyword_pool))

    user_terms = [term.strip() for term in (user_keywords or "").split(',') if term.strip()]
    print(user_terms)

    # Clamp to zero: the previous `while count != n` loop never terminated
    # when a negative count was supplied.
    requested = max(0, int(prompt_num or 0))
    generated_prompts = [_build_prompt(keyword_sets, user_terms) for _ in range(requested)]
    print("no. of prompts: ", len(generated_prompts))
    print("generated prompts: ", generated_prompts)

    images = []
    for prompt in generated_prompts:
        print(prompt)
        # Free cached GPU memory before each generation.
        torch.cuda.empty_cache()
        image = pipe(prompt, height=768, width=768, guidance_scale=10).images[0]
        images.append(image)

    return images, transcription, keyword_pool, generated_prompts
# Stage 1: microphone recording -> transcription text. The requested image
# count and the user's extra keywords are passed through to stage 2.
speech_text = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Number(label = "Number of Images to be generated (int): "),
        gr.Textbox(label = "Additional keywords (comma delimitied): "),
    ],
    outputs=["text", "number", "text"],
    theme="darkhuggingface",
    title='Speech-to-Image-Generator',
    enable_queue=True,
)

# Stage 2: transcription -> keyphrases -> prompts -> generated images.
text_prompts = gr.Interface(
    fn=keywords,
    inputs=["text", "number", "text"],
    outputs=[
        gr.Gallery(label="Generated image(s)", show_label=True, elem_id="gallery").style(grid=[2], height="auto"),
        gr.TextArea(label="Transcription"),
        gr.TextArea(label="Keywords"),
        gr.TextArea(label="Generated Prompts"),
    ],
    theme="darkhuggingface",
    title='Speech-to-Image-Generator',
    enable_queue=True,
)

# NOTE(review): an earlier revision launched with basic auth using credentials
# written directly in the source. If auth is re-enabled, read the credentials
# from the environment instead of hard-coding them.

# Chain the two stages. queue() must be called on the app before launch():
# previously it was chained onto launch()'s return value, which is not the
# app object and is only reached after the blocking launch() call returns.
demo = gr.Series(speech_text, text_prompts)
demo.queue()
demo.launch(enable_queue=True, share=False)