Spaces:

jcheng5
/

multimodal

Runtime error

App Files Files Community

multimodal / videoinput /query.py

jcheng5

Try open source model

7a5b32d over 1 year ago

raw

history blame contribute delete

4.29 kB

	from typing import Callable, Optional

	import dotenv
	from openai import AsyncOpenAI
	import requests
	import torch
	from PIL import Image
	from io import BytesIO

	from transformers import AutoProcessor, AutoModelForVision2Seq
	from transformers.image_utils import load_image

	# Hugging Face Hub id shared by the processor and the model below.
	_MODEL_ID = "HuggingFaceM4/idefics2-8b"

	# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
	DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	# NOTE(review): both objects are created when this module is imported, so the
	# import itself fetches/loads the checkpoint. The weights are loaded with the
	# library's default dtype -- confirm the host has enough memory for that.
	processor = AutoProcessor.from_pretrained(_MODEL_ID)
	model = AutoModelForVision2Seq.from_pretrained(_MODEL_ID)
	model = model.to(DEVICE)

	from .input import decode_input
	from .utils import NamedTemporaryFile, file_to_data_uri, timed

	# Load environment variables from a .env file into the process environment.
	# This is how the OpenAI API key is expected to be supplied (presumably as
	# OPENAI_API_KEY, read by the AsyncOpenAI client created by the caller --
	# TODO confirm).
	dotenv.load_dotenv()

	# Optional system-prompt fragment that pushes the model toward the shortest
	# possible answers. Not currently included in SYSTEM_PROMPT.
	TERSE_PROMPT = """
	The user you're responding to is EXTREMELY busy and cannot waste a single
	second. Above all, answers must be as concise as possible. Every wasted word
	will result in a huge deduction of points. In fact, use the absolute minimum
	number of words while still technically answering the question. Avoid
	adjectives, adverbs, fill words, and qualifiers.
	"""

	# Optional system-prompt fragment, stricter than TERSE_PROMPT: no restating of
	# the question and no complete sentences. Not currently included in
	# SYSTEM_PROMPT.
	EXTRA_TERSE_PROMPT = """
	Definitely don't restate any part of the question in your answer, if it can
	possibly be avoided. Don't speak in complete sentences. Just get to the point as
	quickly as possible.
	"""

	# System-prompt fragment that tells the model how to resolve pronouns in the
	# transcribed question: "I"/"me" is the most prominent person in the video,
	# "you" is the model itself.
	SUBJECT_PROMPT = """
	If the user refers to "I" or "me" in the text input, you should assume that's
	referring to the most prominent person in the video.

	If the user refers to "you" in the text input, you should assume that's
	referring to you, the AI model.
	"""

	# System-prompt fragment explaining that the attached images are video frames
	# and that the answer should talk about "the video" rather than the frames.
	# The "2 frames per second" figure is hard-coded here and must be kept in sync
	# with the fps value passed to decode_input() in process_video().
	VIDEO_PROMPT = """
	The images are frames of a video at 2 frames per second. The user doesn't know
	the video is split into frames, so make sure your answer refers to these images
	collectively as "the video", not "the images" or "the video frames".
	"""

	# System-prompt fragment asking for a casual, spoken-style answer; the reply
	# is converted to speech at the end of process_video().
	SPEAKING_PROMPT = """
	The user is asking you to speak the answer. Make sure your response is in the
	form of a friendly, casual spoken answer, not a formal written one.
	"""

	# Full system prompt: the fragments below concatenated in order.
	# TERSE_PROMPT and EXTRA_TERSE_PROMPT are currently left out; append them to
	# the tuple to turn terse answers back on.
	SYSTEM_PROMPT = "".join(
	    (
	        VIDEO_PROMPT,
	        SUBJECT_PROMPT,
	        SPEAKING_PROMPT,
	    )
	)


	async def process_video(
	    client: AsyncOpenAI,
	    filepath: str,
	    callback: Optional[Callable[[str], None]] = None,
	) -> str:
	    """Answer the question spoken in a video and return the reply as audio.

	    The recording at ``filepath`` is decoded into an audio track and still
	    frames, the audio is transcribed with ``whisper-1``, the transcription and
	    frames are passed to the module-level vision-language ``model``, and the
	    generated answer is converted to speech with ``tts-1``.

	    Args:
	        client: OpenAI client used for transcription and text-to-speech.
	        filepath: Path of the video, handed to ``decode_input``.
	        callback: Optional progress hook, called with a short status message
	            before each stage. ``None`` disables progress reporting.

	    Returns:
	        The value produced by ``file_to_data_uri`` for the synthesized MP3
	        (MIME type ``audio/mpeg``).
	    """
	    notify = callback if callback is not None else (lambda _: None)

	    notify("Decoding input")
	    # fps=2 must match the frame rate stated in VIDEO_PROMPT.
	    decoded = decode_input(filepath, fps=2)

	    with decoded:
	        notify("Decoding speech")
	        with open(str(decoded.audio), "rb") as audio_file:
	            transcription = await client.audio.transcriptions.create(
	                model="whisper-1", file=audio_file
	            )

	        notify("Processing video")
	        # Frames are turned into in-memory data URIs here, so nothing below
	        # needs the decoded input any more (presumably it cleans up its
	        # temporary files on exit -- confirm against decode_input).
	        images = [
	            file_to_data_uri(filename, "image/jpeg") for filename in decoded.images
	        ]

	    notify("Querying")

	    # One {"type": "image"} placeholder per frame; the pixel data itself is
	    # supplied separately through the processor's ``images`` argument.
	    # NOTE(review): the system message follows the user message, as in the
	    # original code -- confirm this ordering is intended for this model.
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": transcription.text},
	                *[{"type": "image"} for _ in images],
	            ],
	        },
	        {
	            "role": "system",
	            "content": [
	                {
	                    "type": "text",
	                    "text": SYSTEM_PROMPT,
	                }
	            ],
	        },
	    ]
	    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
	    inputs = processor(
	        text=prompt,
	        images=[load_image(image) for image in images],
	        return_tensors="pt",
	    )
	    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

	    # NOTE(review): generate() is a synchronous call; while it runs, this
	    # coroutine does not yield to the event loop.
	    generated_ids = model.generate(**inputs, max_new_tokens=500)

	    # generate() returns the prompt tokens followed by the continuation. Drop
	    # the prompt so that only the model's answer is decoded; otherwise the
	    # transcription and the whole system prompt would be spoken back.
	    prompt_length = inputs["input_ids"].shape[1]
	    answer_ids = generated_ids[:, prompt_length:]
	    answer = "".join(
	        processor.batch_decode(answer_ids, skip_special_tokens=True)
	    ).strip()
	    print(answer)

	    notify("Converting to speech")
	    audio = await client.audio.speech.create(
	        model="tts-1",
	        voice="nova",
	        input=answer,
	        response_format="mp3",
	    )

	    notify("Encoding audio")
	    with NamedTemporaryFile(suffix=".mp3", delete_on_close=False) as file:
	        # Close our handle first so the audio can be written by name, then
	        # read the file back as a data URI before the context removes it.
	        file.close()
	        audio.write_to_file(file.name)
	        return file_to_data_uri(file.name, "audio/mpeg")