# -*- coding: utf-8 -*-
"""Multimodal-AI-Assistant-Llava7B.ipynb"""
import torch
from transformers import BitsAndBytesConfig, pipeline

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text",
                model=model_id,
                model_kwargs={"quantization_config": quantization_config})
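# Note: 4-bit quantization keeps the 7B LLaVA weights at roughly 4 GB of VRAM
# (versus ~14 GB in float16), which is what lets this notebook run on a single
# consumer GPU such as a Colab T4.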
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image

image_path = "img.jpg"
image = Image.open(image_path)
image  # display the test image inline in the notebook
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
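# On newer NLTK releases, sent_tokenize may additionally require the
# 'punkt_tab' data package: nltk.download('punkt_tab')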
max_new_tokens = 200

prompt_instructions = """
Describe the image using as much detail as possible:
is it a painting or a photograph, what colors are predominant,
what is happening in the image, and what is the image about?
"""
prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
outputs  # inspect the raw pipeline output in the notebook

for sent in sent_tokenize(outputs[0]["generated_text"]):
    print(sent)
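# The pipeline returns a list of dicts; outputs[0]["generated_text"] holds the
# full conversation, prompt included, which is why the reply is later extracted
# in img2txt with a regex anchored on "ASSISTANT:".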
warnings.filterwarnings("ignore")
import numpy as np

torch.cuda.is_available()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")
model = whisper.load_model("medium", device=DEVICE)  # available sizes: tiny, base, small, medium, large
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)
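# Whisper "medium" has roughly 769M parameters; the smaller checkpoints trade
# accuracy for speed and VRAM, which matters here since the GPU is shared with
# the 4-bit LLaVA model.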
import re
import datetime

## Logger file
tstamp = datetime.datetime.now()
tstamp = str(tstamp).replace(' ', '_')
logfile = f'{tstamp}_log.txt'

def writehistory(text):
    # Append one entry per line; the context manager closes the file
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')
def img2txt(input_text, input_image):
    # Load the image
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)}")
    if not input_text:
        # No spoken question was transcribed, so fall back to a generic description
        prompt_instructions = """
        Describe the image using as much detail as possible: is it a painting or a photograph, what colors are predominant, what is happening in the image, and what is the image about?
        """
    else:
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis. Using as much detail as possible from the image, respond to the following prompt:
        """ + input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})

    # Extract only the reply that follows "ASSISTANT:" in the generated text
    if outputs is not None and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        if match:
            reply = match.group(1)
        else:
            reply = "No response found."
    else:
        reply = "No response generated."

    return reply
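# Example call (assumes the img.jpg loaded in the earlier cell is present):
#   img2txt("What objects stand out in this picture?", "img.jpg")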
def transcribe(audio):
    # Return an empty transcript when no audio was recorded
    if audio is None or audio == '':
        return ''

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)  # language probabilities (unused here)

    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    return result_text
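# Note: whisper.pad_or_trim() fixes the input at Whisper's 30-second context
# window, so longer recordings are truncated here; model.transcribe() would be
# needed for chunked long-form audio.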
def text_to_speech(text, file_path):
    language = 'en'
    audioobj = gTTS(text=text,
                    lang=language,
                    slow=False)
    audioobj.save(file_path)
    return file_path
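# gTTS synthesizes speech through Google Translate's TTS endpoint, so this step
# requires network access; the saved MP3 path is handed straight to Gradio.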
import locale
print(locale.getlocale())
# Force UTF-8 as the preferred encoding; a common workaround for locale-related
# encoding errors in Colab subprocess calls
locale.getpreferredencoding = lambda: "UTF-8"

# Creates a 10-second silent mono MP3 (Temp.mp3):
# ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3
# Handle the combined audio and image inputs from the Gradio UI
def process_inputs(audio_path, image_path):
    # Transcribe the spoken question with Whisper
    speech_to_text_output = transcribe(audio_path)

    # Ask LLaVA about the image, if one was provided
    if image_path:
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    # Convert the model's reply to speech for the audio output
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")

    return speech_to_text_output, chatgpt_output, processed_audio_path
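# The three return values map positionally onto the three output components
# declared in the gr.Interface below: transcript textbox, reply textbox, and
# audio player.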
# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="AI Output"),
        gr.Audio(label="AI Voice Response")  # plays the MP3 path returned by process_inputs
    ],
    title="Multi Modal AI Assistant Using Whisper and Llava",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True, inline=False)
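# In Colab, launching with share=True would additionally expose a temporary
# public gradio.live URL; debug=True keeps the cell running and surfaces
# callback errors in the notebook output.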