# -*- coding: utf-8 -*-
"""Multimodal-AI-Assistant-Llava7B.ipynb"""
import torch
from transformers import BitsAndBytesConfig, pipeline

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text",
                model=model_id,
                model_kwargs={"quantization_config": quantization_config})
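# Note: 4-bit quantization keeps the 7B LLaVA weights at roughly 4 GB of VRAM
# (versus ~14 GB in float16), which is what lets this notebook run on a single
# consumer GPU such as a Colab T4.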
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image

image_path = "img.jpg"
image = Image.open(image_path)
image  # display the test image inline in the notebook
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
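# On newer NLTK releases, sent_tokenize may additionally require the
# 'punkt_tab' data package: nltk.download('punkt_tab')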
max_new_tokens = 200

prompt_instructions = """
Describe the image using as much detail as possible:
is it a painting or a photograph, what colors are predominant,
what is happening in the image, and what is the image about?
"""
prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
outputs  # inspect the raw pipeline output in the notebook

for sent in sent_tokenize(outputs[0]["generated_text"]):
    print(sent)
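# The pipeline returns a list of dicts; outputs[0]["generated_text"] holds the
# full conversation, prompt included, which is why the reply is later extracted
# in img2txt with a regex anchored on "ASSISTANT:".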
warnings.filterwarnings("ignore")
import numpy as np

torch.cuda.is_available()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")
model = whisper.load_model("medium", device=DEVICE)  # available sizes: tiny, base, small, medium, large
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)
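# Whisper "medium" has roughly 769M parameters; the smaller checkpoints trade
# accuracy for speed and VRAM, which matters here since the GPU is shared with
# the 4-bit LLaVA model.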
import re
import datetime

## Logger file
tstamp = datetime.datetime.now()
tstamp = str(tstamp).replace(' ', '_')
logfile = f'{tstamp}_log.txt'

def writehistory(text):
    # Append one entry per line; the context manager closes the file
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')
def img2txt(input_text, input_image):
    # Load the image
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)}")
    if not input_text:
        # No spoken question was transcribed, so fall back to a generic description
        prompt_instructions = """
        Describe the image using as much detail as possible: is it a painting or a photograph, what colors are predominant, what is happening in the image, and what is the image about?
        """
    else:
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis. Using as much detail as possible from the image, respond to the following prompt:
        """ + input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})

    # Extract only the reply that follows "ASSISTANT:" in the generated text
    if outputs is not None and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        if match:
            reply = match.group(1)
        else:
            reply = "No response found."
    else:
        reply = "No response generated."

    return reply
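# Example call (assumes the img.jpg loaded in the earlier cell is present):
#   img2txt("What objects stand out in this picture?", "img.jpg")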
def transcribe(audio):
    # Return an empty transcript when no audio was recorded
    if audio is None or audio == '':
        return ''

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)  # language probabilities (unused here)

    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    return result_text
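# Note: whisper.pad_or_trim() fixes the input at Whisper's 30-second context
# window, so longer recordings are truncated here; model.transcribe() would be
# needed for chunked long-form audio.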
def text_to_speech(text, file_path):
    language = 'en'
    audioobj = gTTS(text=text,
                    lang=language,
                    slow=False)
    audioobj.save(file_path)
    return file_path
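# gTTS synthesizes speech through Google Translate's TTS endpoint, so this step
# requires network access; the saved MP3 path is handed straight to Gradio.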
import locale
print(locale.getlocale())
# Force UTF-8 as the preferred encoding; a common workaround for locale-related
# encoding errors in Colab subprocess calls
locale.getpreferredencoding = lambda: "UTF-8"

# Creates a 10-second silent mono MP3 (Temp.mp3):
# ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3
# Handle the combined audio and image inputs from the Gradio UI
def process_inputs(audio_path, image_path):
    # Transcribe the spoken question with Whisper
    speech_to_text_output = transcribe(audio_path)

    # Ask LLaVA about the image, if one was provided
    if image_path:
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    # Convert the model's reply to speech for the audio output
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")

    return speech_to_text_output, chatgpt_output, processed_audio_path
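# The three return values map positionally onto the three output components
# declared in the gr.Interface below: transcript textbox, reply textbox, and
# audio player.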
# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="AI Output"),
        gr.Audio(label="AI Voice Response")  # plays the MP3 path returned by process_inputs
    ],
    title="Multi Modal AI Assistant Using Whisper and Llava",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True, inline=False)
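# In Colab, launching with share=True would additionally expose a temporary
# public gradio.live URL; debug=True keeps the cell running and surfaces
# callback errors in the notebook output.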