Spaces:
Runtime error
Runtime error
File size: 5,522 Bytes
890b142 cd70edb 890b142 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
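# -*- coding: utf-8 -*-
"""Multimodal-AI-Assistant-Llava7B.ipynb

Voice-and-image assistant exported from a notebook: Whisper transcribes a
spoken question, the LLaVA 1.5 7B pipeline answers it about an uploaded image,
gTTS turns the answer into speech, and Gradio provides the web interface.
"""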
import torch
from transformers import BitsAndBytesConfig, pipeline
# Checkpoint loaded into the Hugging Face "image-to-text" pipeline.
model_id = "llava-hf/llava-1.5-7b-hf"

# Quantization settings: 4-bit loading with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Shared pipeline object; the functions further down call it as ``pipe``.
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
)
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image
# Sample picture for the one-off inference run further down the script.
image_path = "img.jpg"
image = Image.open(image_path)
image  # bare expression left over from the notebook; it does nothing in a script
import nltk

# sent_tokenize() needs the Punkt sentence models. NLTK 3.8.2 and later look
# them up under the "punkt_tab" resource, older releases under "punkt", so
# request both. A resource id the installed release does not know is reported
# as an error message by nltk.download() rather than raised as an exception.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource)

from nltk import sent_tokenize
# Generation budget for the sample run below.
max_new_tokens = 200

prompt_instructions = """
Describe the image using as much detail as possible,
is it a painting, a photograph, what colors are predominant, what's happening in the image
what is the image about?
"""

# The instructions are wrapped in USER/ASSISTANT turns with an <image>
# placeholder in the user turn.
prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

# Use the constant defined above instead of repeating the literal 200.
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
outputs  # bare expression left over from the notebook; it does nothing in a script

# Print the generated text one sentence per line.
for sent in sent_tokenize(outputs[0]["generated_text"]):
    print(sent)
# Silence every warning from here on; ``warnings`` is already imported near
# the top of the file, so the repeated import below is harmless.
warnings.filterwarnings("ignore")
import warnings
from gtts import gTTS
import numpy as np

# Pick the GPU when torch can see one, otherwise fall back to the CPU.
torch.cuda.is_available()  # notebook leftover: the result is not used
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

import whisper

# Any Whisper size can be used here: tiny, base, small, medium or large.
model = whisper.load_model("medium", device=DEVICE)

# Report whether the checkpoint is multilingual and how many parameters it has.
_language_scope = "multilingual" if model.is_multilingual else "English-only"
_parameter_total = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {_language_scope} and has {_parameter_total:,} parameters.")
import re
import datetime
import os
## Logger file
# One log file per run, named after the start time with the space between the
# date and the time replaced by "_".
# NOTE(review): the name still contains ":" from the time part, which Windows
# does not accept in file names - confirm the target platforms.
tstamp = datetime.datetime.now()
tstamp = str(tstamp).replace(' ', '_')
logfile = f'{tstamp}_log.txt'


def writehistory(text: str) -> None:
    """Append ``text`` followed by a newline to this run's log file.

    Args:
        text: The string to record; it is written unchanged, UTF-8 encoded.
    """
    # The ``with`` block closes the file on exit, so no explicit close() call
    # is needed.
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text + '\n')
def img2txt(input_text, input_image, max_new_tokens=200):
    """Ask the LLaVA pipeline about an image and return the reply text.

    Args:
        input_text: The user's question. Anything that is not a non-blank
            string (for example the placeholder tuple ``transcribe`` returns
            when no audio was recorded) selects a generic "describe the
            image" prompt instead.
        input_image: Path or file object accepted by ``PIL.Image.open``.
        max_new_tokens: Upper bound on generated tokens; the default keeps
            the previously hard-coded value of 200.

    Returns:
        The text that follows ``ASSISTANT:`` in the generated output, or a
        fallback message when nothing usable was generated.
    """
    # load the image
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")

    if isinstance(input_text, str) and input_text.strip():
        prompt_instructions = (
            "\nAct as an expert in imagery descriptive analysis, using as much detail "
            "as possible from the image, respond to the following prompt:\n"
        ) + input_text
    else:
        # No usable question: fall back to a general description request.
        prompt_instructions = (
            "\nDescribe the image using as much detail as possible, is it a painting, "
            "a photograph, what colors are predominant, what's happening in the image, "
            "what is the image about?\n"
        )

    writehistory(f"prompt_instructions: {prompt_instructions}")

    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})

    # Guard against both ``None`` and an empty result list before indexing.
    generated_text = outputs[0]["generated_text"] if outputs else ""
    if not generated_text:
        return "No response generated."

    # Extract the text after "ASSISTANT:". DOTALL lets "." cross newlines so
    # a multi-line answer is kept whole instead of being cut at its first
    # line break.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    reply = match.group(1).strip() if match else ""

    # An empty reply is replaced by the fallback so callers always get text.
    return reply or "No response found."
def transcribe(audio):
    """Transcribe a recorded audio file with the module-level Whisper model.

    Args:
        audio: Path to the audio file, or ``None``/``''`` when nothing was
            recorded.

    Returns:
        The decoded text as a string. When no audio is supplied, the
        placeholder tuple ``('', '', None)`` is returned instead; ``img2txt``
        checks for a tuple to detect that case, so the shape is kept.
    """
    # Check if the audio input is None or empty
    if audio is None or audio == '':
        return ('', '', None)  # Placeholder: empty strings and no audio file

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Use the mel-bin count of the loaded checkpoint rather than the library
    # default, so checkpoints with a different value keep working.
    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

    # The separate detect_language() call was dropped: its result was never
    # used, and decode() detects the language itself when none is specified.
    # Half precision is only requested on CUDA.
    # NOTE(review): fp16 decoding on CPU is a known source of runtime errors
    # with Whisper - confirm against the installed version.
    options = whisper.DecodingOptions(fp16=model.device.type == "cuda")
    result = whisper.decode(model, mel, options)
    return result.text
def text_to_speech(text, file_path, language='en', slow=False):
    """Synthesize ``text`` with gTTS and save the audio to ``file_path``.

    Args:
        text: The text to speak.
        file_path: Destination path for the generated audio file.
        language: Language code passed to gTTS as ``lang``; defaults to the
            previously hard-coded ``'en'``.
        slow: Passed to gTTS as ``slow``; defaults to the previously
            hard-coded ``False``.

    Returns:
        ``file_path``, unchanged, so the caller can hand it on directly.
    """
    audioobj = gTTS(text=text, lang=language, slow=slow)
    audioobj.save(file_path)
    return file_path
import locale

print(locale.getlocale())  # Before running the pipeline
# Run the pipeline
print(locale.getlocale())  # After running the pipeline


def _utf8_preferred_encoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    It accepts the same optional ``do_setlocale`` argument as the function it
    replaces, so callers using ``getpreferredencoding(False)`` keep working.
    """
    return "UTF-8"


# Make the process report UTF-8 as its preferred encoding.
# NOTE(review): the original comment was cut off ("required to"); presumably
# this works around a non-UTF-8 locale in the notebook runtime - confirm.
locale.getpreferredencoding = _utf8_preferred_encoding
# ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3
import gradio as gr
import base64
import os
# A function to handle audio and image inputs
def process_inputs(audio_path, image_path):
    """Run one assistant turn: transcribe, query the image model, speak.

    Args:
        audio_path: Path of the recorded question, or ``None`` when nothing
            was recorded.
        image_path: Path of the uploaded image, or a falsy value when no
            image was provided.

    Returns:
        A 3-tuple ``(speech_to_text, ai_reply, reply_audio_path)`` matching
        the three output components of the interface.
    """
    # Speech -> text. The raw result is passed on to img2txt untouched
    # because img2txt uses its type to detect "no audio recorded".
    transcription = transcribe(audio_path)

    # Handle the image input
    if image_path:
        chatgpt_output = img2txt(transcription, image_path)
    else:
        chatgpt_output = "No image provided."

    # Synthesize the reply; the audio comes from the AI answer, not from the
    # transcription step.
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")

    # transcribe() returns a placeholder tuple when there was no audio; show
    # an empty string in the textbox instead of the tuple.
    speech_to_text_output = transcription if isinstance(transcription, str) else ""

    return speech_to_text_output, chatgpt_output, processed_audio_path
# Create the interface
# The output player used to be pre-loaded with "Temp.mp3" unconditionally.
# Only do so when the file is actually present, so a missing placeholder clip
# cannot break interface construction.
# NOTE(review): Gradio is expected to reject an initial value that points to a
# missing file - confirm against the installed version.
_placeholder_audio = "Temp.mp3" if os.path.isfile("Temp.mp3") else None

iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="AI Output"),
        gr.Audio(_placeholder_audio),
    ],
    title="Multi Modal AI Assistant Using Whisper and Llava",
    description="Upload an image and interact via voice input and audio response.",
)

# Launch the interface
iface.launch(debug=True, inline=False)
|