Spaces:
Runtime error
Runtime error
| from typing import Callable, Optional | |
| import dotenv | |
| from openai import AsyncOpenAI | |
| import requests | |
| import torch | |
| from PIL import Image | |
| from io import BytesIO | |
| from transformers import AutoProcessor, AutoModelForVision2Seq | |
| from transformers.image_utils import load_image | |
# Run inference on the GPU when torch can see one; otherwise fall back to CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The processor and the model are created once, at import time, from the same
# Idefics2 checkpoint, then the model weights are moved onto DEVICE.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b")
model = model.to(DEVICE)
| from .input import decode_input | |
| from .utils import NamedTemporaryFile, file_to_data_uri, timed | |
# Load settings from a .env file into the environment at import time. The
# OpenAI API key is expected to be provided this way.
dotenv.load_dotenv()
# Optional prompt fragment that pushes the model toward minimal-length answers.
# NOTE: currently not part of SYSTEM_PROMPT (its line there is commented out).
TERSE_PROMPT = """
The user you're responding to is EXTREMELY busy and cannot waste a single
second. Above all, answers must be as concise as possible. Every wasted word
will result in a huge deduction of points. In fact, use the absolute minimum
number of words while still technically answering the question. Avoid
adjectives, adverbs, fill words, and qualifiers.
"""

# Optional, even stricter companion to TERSE_PROMPT: no restating the question,
# no complete sentences. Also currently left out of SYSTEM_PROMPT.
EXTRA_TERSE_PROMPT = """
Definitely don't restate any part of the question in your answer, if it can
possibly be avoided. Don't speak in complete sentences. Just get to the point as
quickly as possible.
"""
# Prompt fragment that tells the model how to resolve pronouns in the user's
# text: "I"/"me" is the most prominent person in the video, "you" is the model.
SUBJECT_PROMPT = """
If the user refers to "I" or "me" in the text input, you should assume that's
referring to the most prominent person in the video.
If the user refers to "you" in the text input, you should assume that's
referring to you, the AI model.
"""
# Prompt fragment explaining that the attached images are video frames and that
# the answer should talk about "the video" rather than individual images.
# NOTE: the frame rate stated here must stay in sync with the `fps` value that
# process_video passes to decode_input.
VIDEO_PROMPT = """
The images are frames of a video at 2 frames per second. The user doesn't know
the video is split into frames, so make sure your answer refers to these images
collectively as "the video", not "the images" or "the video frames".
"""
# Prompt fragment asking for a friendly, casual spoken-style answer rather
# than a formal written one.
SPEAKING_PROMPT = """
The user is asking you to speak the answer. Make sure your response is in the
form of a friendly, casual spoken answer, not a formal written one.
"""
# The system prompt is the concatenation of the enabled prompt fragments, in
# this order. The two terse fragments are kept here, commented out, as
# switches that can be re-enabled by uncommenting them.
SYSTEM_PROMPT = "".join(
    (
        VIDEO_PROMPT,
        SUBJECT_PROMPT,
        SPEAKING_PROMPT,
        # TERSE_PROMPT,
        # EXTRA_TERSE_PROMPT,
    )
)
async def process_video(
    client: AsyncOpenAI,
    filepath: str,
    callback: Optional[Callable[[str], None]] = None,
) -> str:
    """Answer the question spoken in a video and return the answer as speech.

    Pipeline:
      1. Decode ``filepath`` with ``decode_input`` at 2 frames per second.
      2. Transcribe the audio track with OpenAI ``whisper-1``.
      3. Encode every frame as a JPEG data URI.
      4. Build a chat prompt from the transcription, one image placeholder per
         frame and ``SYSTEM_PROMPT``, then generate up to 500 new tokens with
         the module-level model.
      5. Convert the generated text to speech with OpenAI ``tts-1``.

    Args:
        client: OpenAI client used for transcription and text-to-speech.
        filepath: Path of the video file to process.
        callback: Optional progress hook; called with a short status string
            before each stage. ``None`` disables progress reporting.

    Returns:
        The value of ``file_to_data_uri(..., "audio/mpeg")`` for the
        synthesized MP3 answer.
    """
    if callback is None:
        callback = lambda _: None

    callback("Decoding input")
    decoded = decode_input(filepath, fps=2)

    with decoded:
        callback("Decoding speech")
        with open(str(decoded.audio), "rb") as audio_file:
            transcription = await client.audio.transcriptions.create(
                model="whisper-1", file=audio_file
            )

        callback("Processing video")
        # Frames are embedded into data URIs here, so nothing below needs the
        # decoded files any more once this block exits.
        images = [
            file_to_data_uri(filename, "image/jpeg") for filename in decoded.images
        ]

    callback("Querying")
    prompt = processor.apply_chat_template(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": transcription.text},
                    # One placeholder per frame; the pixel data is supplied
                    # separately through the processor's `images` argument.
                    *[{"type": "image"} for _ in images],
                ],
            },
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": SYSTEM_PROMPT,
                    }
                ],
            },
        ],
        add_generation_prompt=True,
    )
    inputs = processor(
        text=prompt,
        images=[load_image(image) for image in images],
        return_tensors="pt",
    )
    inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}

    # NOTE: generate() is a synchronous call, so the event loop is blocked
    # for the duration of inference.
    generated_ids = model.generate(**inputs, max_new_tokens=500)

    # generate() returns the prompt tokens followed by the new tokens. Drop the
    # prompt part so the transcription and system prompt are not echoed back
    # (and subsequently read aloud by the text-to-speech step).
    prompt_length = inputs["input_ids"].shape[1]
    generated_texts = processor.batch_decode(
        generated_ids[:, prompt_length:], skip_special_tokens=True
    )
    answer = "".join(generated_texts).strip()
    print(answer)

    callback("Converting to speech")
    audio = await client.audio.speech.create(
        model="tts-1",
        voice="nova",
        input=answer,
        response_format="mp3",
    )

    callback("Encoding audio")
    with NamedTemporaryFile(suffix=".mp3", delete_on_close=False) as tmp:
        # Close our handle first so the audio can be written to the same path
        # by name; delete_on_close=False keeps the file until the `with` exits.
        tmp.close()
        audio.write_to_file(tmp.name)
        return file_to_data_uri(tmp.name, "audio/mpeg")