Spaces:
Runtime error
Runtime error
Try open source model
Browse files
- requirements.txt +1 -0
- videoinput/query.py +28 -11
requirements.txt
CHANGED
|
@@ -11,3 +11,4 @@ requests
|
|
| 11 |
torch
|
| 12 |
transformers
|
| 13 |
pillow
|
|
|
|
|
|
| 11 |
torch
|
| 12 |
transformers
|
| 13 |
pillow
|
| 14 |
+
requests
|
videoinput/query.py
CHANGED
|
@@ -2,6 +2,17 @@ from typing import Callable, Optional
|
|
| 2 |
|
| 3 |
import dotenv
|
| 4 |
from openai import AsyncOpenAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from .input import decode_input
|
| 7 |
from .utils import NamedTemporaryFile, file_to_data_uri, timed
|
|
@@ -71,20 +82,14 @@ async def process_video(
|
|
| 71 |
images = [file_to_data_uri(filename, "image/jpeg") for filename in input.images]
|
| 72 |
|
| 73 |
callback("Querying")
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
{
|
| 78 |
"role": "user",
|
| 79 |
"content": [
|
| 80 |
{"type": "text", "text": transcription.text},
|
| 81 |
-
*[
|
| 82 |
-
{
|
| 83 |
-
"type": "image_url",
|
| 84 |
-
"image_url": {"url": image, "detail": "auto"},
|
| 85 |
-
}
|
| 86 |
-
for image in images
|
| 87 |
-
],
|
| 88 |
],
|
| 89 |
},
|
| 90 |
{
|
|
@@ -97,13 +102,25 @@ async def process_video(
|
|
| 97 |
],
|
| 98 |
},
|
| 99 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
)
|
|
|
|
| 101 |
|
| 102 |
callback("Converting to speech")
|
| 103 |
audio = await client.audio.speech.create(
|
| 104 |
model="tts-1",
|
| 105 |
voice="nova",
|
| 106 |
-
input=
|
| 107 |
response_format="mp3",
|
| 108 |
)
|
| 109 |
|
|
|
|
| 2 |
|
| 3 |
import dotenv
|
| 4 |
from openai import AsyncOpenAI
|
| 5 |
+
import requests
|
| 6 |
+
import torch
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from io import BytesIO
|
| 9 |
+
|
| 10 |
+
from transformers import AutoProcessor, AutoModelForVision2Seq
|
| 11 |
+
from transformers.image_utils import load_image
|
| 12 |
+
|
| 13 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 14 |
+
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
|
| 15 |
+
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b").to(DEVICE)
|
| 16 |
|
| 17 |
from .input import decode_input
|
| 18 |
from .utils import NamedTemporaryFile, file_to_data_uri, timed
|
|
|
|
| 82 |
images = [file_to_data_uri(filename, "image/jpeg") for filename in input.images]
|
| 83 |
|
| 84 |
callback("Querying")
|
| 85 |
+
|
| 86 |
+
prompt = processor.apply_chat_template(
|
| 87 |
+
[
|
| 88 |
{
|
| 89 |
"role": "user",
|
| 90 |
"content": [
|
| 91 |
{"type": "text", "text": transcription.text},
|
| 92 |
+
*[{"type": "image"} for image in images],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
],
|
| 94 |
},
|
| 95 |
{
|
|
|
|
| 102 |
],
|
| 103 |
},
|
| 104 |
],
|
| 105 |
+
add_generation_prompt=True,
|
| 106 |
+
)
|
| 107 |
+
inputs = processor(
|
| 108 |
+
text=prompt,
|
| 109 |
+
images=[load_image(image) for image in images],
|
| 110 |
+
return_tensors="pt",
|
| 111 |
+
)
|
| 112 |
+
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
|
| 113 |
+
generated_ids = model.generate(**inputs, max_new_tokens=500)
|
| 114 |
+
generated_texts = processor.batch_decode(
|
| 115 |
+
generated_ids, skip_special_tokens=True
|
| 116 |
)
|
| 117 |
+
print("".join(generated_texts))
|
| 118 |
|
| 119 |
callback("Converting to speech")
|
| 120 |
audio = await client.audio.speech.create(
|
| 121 |
model="tts-1",
|
| 122 |
voice="nova",
|
| 123 |
+
input="".join(generated_texts),
|
| 124 |
response_format="mp3",
|
| 125 |
)
|
| 126 |
|