Spaces:
Runtime error
Runtime error
'''
This script calls the model from openai api to predict the next few words in a conversation.
'''
import os
import sys
import openai
import gradio as gr
# NOTE(review): installing whisper at import time is a Hugging Face Spaces
# workaround — normally this dependency belongs in requirements.txt.
os.system("pip install git+https://github.com/openai/whisper.git")
import whisper
from transformers import pipeline
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import time
import pandas as pd
# Few-shot prompt for the completion model: six rules plus four worked
# examples. The running conversation is appended after the trailing
# "Transcript: " stub before each API call.
EXAMPLE_PROMPT = """This is a tool for helping someone with memory issues remember the next word.
The predictions follow a few rules:
1) The predictions are suggestions of ways to continue the transcript as if someone forgot what the next word was.
2) The predictions do not repeat themselves.
3) The predictions focus on suggesting nouns, adjectives, and verbs.
4) The predictions are related to the context in the transcript.
5) The predictions are ordered from most likely to least likely.
6) Five unique predictions are made per transcript.
EXAMPLES:
Transcript: Tomorrow night we're going out to
Prediction: The Movies, A Restaurant, A Baseball Game, The Theater, A Party for a friend
Transcript: I would like to order a cheeseburger with a side of
Prediction: French fries, Milkshake, Apple slices, Side salad, Extra catsup
Transcript: My friend Savanah is
Prediction: An electrical engineer, A marine biologist, A classical musician, A developer, A product manager
Transcript: I need to buy a birthday
Prediction: Present, Gift, Cake, Card, balloon
Transcript: """
# whisper model specification — "tiny" keeps ASR latency low on CPU Spaces.
asr_model = whisper.load_model("tiny")
# Read the OpenAI key from the environment; raises KeyError if it is unset.
openai.api_key = os.environ["Openai_APIkey"]
# Transcribe function
def transcribe(audio_file):
    """Run Whisper ASR on the audio file at *audio_file* and return the text."""
    print("Transcribing")
    result = asr_model.transcribe(audio_file)
    return result["text"]
def inference(audio, latest):
    """Transcribe `audio`, extend the running conversation, and fetch five
    next-word suggestions from the completion model.

    Parameters:
        audio:  filepath to the recorded clip (from gr.Audio).
        latest: mutable list holding the conversation so far (gr.State).

    Returns a 7-tuple matching the Gradio outputs:
        (transcript, op1, op2, op3, op4, op5, convoState)
    """
    # Transcribe with Whisper
    print("The audio is:", audio)
    transcript = transcribe(audio)
    if transcript is not None:
        latest.append(transcript)
        # The whole stringified conversation list is used as model context.
        tscript = EXAMPLE_PROMPT + str(latest) + "\nPrediction: "
    else:
        tscript = EXAMPLE_PROMPT
    print("tscript ------- ", tscript)
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=tscript,
        temperature=0.8,
        max_tokens=18,
        n=5)
    # Strip newlines from each of the five returned completions.
    inferred = [choice['text'].replace("\n", "") for choice in response['choices'][:5]]
    # Each completion is itself a comma-separated list of suggestions;
    # expand into a DataFrame (rows = completions, columns = split parts).
    infers = pd.Series(inferred)
    infersNew = infers.str.split(",", n=-1, expand=True)
    print("USAGE: ", response['usage']['completion_tokens'])
    # Buttons are filled from the parts of the first completion; when it had
    # fewer than four/five comma-separated parts the column is missing, so
    # fall back to the first part of a later completion (row 1).
    op1 = infersNew[0][0]
    op2 = infersNew[1][0]
    op3 = infersNew[2][0]
    try:
        op4 = infersNew[3][0]
    except KeyError:
        op4 = infersNew[0][1]
    try:
        op5 = infersNew[4][0]
    except KeyError:
        op5 = infersNew[1][1]
    convoState = latest
    return transcript, op1, op2, op3, op4, op5, convoState
def appendPrediction(val, convoState):
    """Append the clicked suggestion to the shared conversation state.

    The list is mutated in place (it is a gr.State value shared across
    callbacks) and the same object is returned so the UI textbox updates.
    """
    convoState += [val]  # in-place extend of the shared list
    return convoState
# get audio from microphone
# UI layout: left column records audio; right column shows the transcript,
# five suggestion buttons, and the running conversation.
with gr.Blocks() as face:
    with gr.Row():
        # Conversation history shared across callback invocations.
        convoState = gr.State([""])
        with gr.Column():
            audio = gr.Audio(source="microphone", type="filepath")
            #promptText = gr.Textbox(lines=15, placeholder="Enter a prompt here")
            #dropChoice = gr.Dropdown(choices=["text-ada-001", "text-davinci-002", "text-davinci-003", "gpt-3.5-turbo"], label="Model")
            #sliderChoice = gr.Slider(minimum=0.0, maximum=1.0, default=0.8, step=0.1, label="Temperature")
            transcribe_btn = gr.Button(value="Transcribe")
        with gr.Column():
            script = gr.Textbox(label="Transcribed text")
            #options = gr.Textbox(label="Predictions")
            # Five buttons whose labels are filled with the model's suggestions.
            option1 = gr.Button(value=" ")
            option2 = gr.Button(value=" ")
            option3 = gr.Button(value=" ")
            option4 = gr.Button(value=" ")
            option5 = gr.Button(value=" ")
            #options = gr.Dataset(components=[gr.Radio], samples=["One", "Two", "Three", "Four", "Five"])
            '''options = gr.Dataset(components=[gr.Textbox(visible=False)],
            label="Text Dataset",
            samples=[
            ["One"],
            ["Two"],
            ["Three"],
            ["Four"],
            ["Five"],
            ],
            )'''
            #options = gr.Radio(choices=["One", "Two", "Three", "Four", "Five"])
            latestConvo = gr.Textbox(label="Running conversation")
    #transcribe_btn.click(inference)
    # Transcribe click: run ASR + prediction; fill the transcript box, the
    # five suggestion buttons, and the running-conversation textbox.
    transcribe_btn.click(fn=inference, inputs=[audio, convoState], outputs=[script, option1, option2, option3, option4, option5, latestConvo])
    # Clicking a suggestion button appends that button's label (the button is
    # passed as an input, so its value is the clicked text) to the conversation.
    option1.click(fn=appendPrediction, inputs=[option1, convoState], outputs=[latestConvo])
    option2.click(fn=appendPrediction, inputs=[option2, convoState], outputs=[latestConvo])
    option3.click(fn=appendPrediction, inputs=[option3, convoState], outputs=[latestConvo])
    option4.click(fn=appendPrediction, inputs=[option4, convoState], outputs=[latestConvo])
    option5.click(fn=appendPrediction, inputs=[option5, convoState], outputs=[latestConvo])
    #examples = gr.Examples(examples=["Sedan, Truck, SUV", "Dalmaion, Shepherd, Lab, Mutt"], inputs=[options])
face.launch()