Spaces:

KasKniesmeijer
/

FAAM-demo

Sleeping

App Files Files Community

KasKniesmeijer committed on Dec 16, 2024

Commit

a259df9

1 Parent(s): 7373a84

improved gradio interface

Browse files

Files changed (4) hide show

app.py +47 -8
index.html +0 -25
requirements.txt +4 -1
src/main.js +0 -83

app.py CHANGED Viewed

@@ -1,12 +1,19 @@
 import torch
 from PIL import Image
-from transformers import AutoProcessor, AutoModelForVision2Seq
-from transformers.image_utils import load_image
 import numpy as np
 import gradio as gr
 # Set the device (GPU or CPU)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # Initialize processor and model
 try:
@@ -16,13 +23,36 @@ try:
         torch_dtype=torch.bfloat16,
         _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
     ).to(DEVICE)
 except Exception as e:
     print(f"Error loading model or processor: {str(e)}")
     exit(1)
 # Define the function to answer questions
-def answer_question(image, question):
     # Check if the image is provided
     if image is None:
         return "Error: Please upload an image."
@@ -65,17 +95,26 @@ def answer_question(image, question):
         return f"Error: Failed to generate answer. {str(e)}"
-# Create Gradio interface
 iface = gr.Interface(
     fn=answer_question,
     inputs=[
-        gr.Image(type="numpy"),
         gr.Textbox(lines=2, placeholder="Enter your question here..."),
     ],
     outputs="text",
     title="FAAM-demo | Vision Language Model | SmolVLM",
-    description="Upload an image and ask a question about it.",
 )
-if __name__ == "__main__":
-    iface.launch()

 import torch
 from PIL import Image
+from transformers import (
+    AutoProcessor,
+    AutoModelForVision2Seq,
+    Wav2Vec2ForCTC,
+    Wav2Vec2Processor,
+)
 import numpy as np
 import gradio as gr
+import librosa
+from gradio.themes import Citrus
 # Set the device (GPU or CPU)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {DEVICE}")
 # Initialize processor and model
 try:
         torch_dtype=torch.bfloat16,
         _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
     ).to(DEVICE)
+    stt_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+    stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(DEVICE)
 except Exception as e:
     print(f"Error loading model or processor: {str(e)}")
     exit(1)
+# Define the function to convert speech to text
+def speech_to_text(audio):
+    """Transcribe a recorded question into text with the Wav2Vec2 model.
+
+    Args:
+        audio: Path to the audio file to transcribe (it is handed straight
+            to ``librosa.load``).
+
+    Returns:
+        The decoded transcription, or a string starting with "Error:" if
+        loading or transcribing the audio fails.
+    """
+    try:
+        # Resample on load to the same 16 kHz rate passed to the processor
+        waveform, _ = librosa.load(audio, sr=16000)
+        input_values = stt_processor(
+            waveform, return_tensors="pt", sampling_rate=16000
+        ).input_values.to(DEVICE)
+        # Inference only: skip autograd bookkeeping to save memory
+        with torch.no_grad():
+            logits = stt_model(input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = stt_processor.decode(predicted_ids[0])
+        print(f"Detected text: {transcription}")
+        return transcription
+    except Exception as e:
+        return f"Error: Unable to process the audio. {str(e)}"
 # Define the function to answer questions
+def answer_question(image, question, audio):
+    # Convert speech to text if audio is provided
+    if audio is not None:
+        question = speech_to_text(audio)
     # Check if the image is provided
     if image is None:
         return "Error: Please upload an image."
         return f"Error: Failed to generate answer. {str(e)}"
+# Customize the Citrus theme with a specific neutral_hue
+custom_citrus = Citrus(neutral_hue="slate")
+# Define your Gradio interface
 iface = gr.Interface(
     fn=answer_question,
     inputs=[
+        gr.Image(type="numpy", value="faam_to_the_future.jpg"),
         gr.Textbox(lines=2, placeholder="Enter your question here..."),
+        gr.Audio(
+            type="filepath",
+            # Offer both sources so the "Upload a recording" option promised
+            # by the label actually exists; microphone is listed first so it
+            # stays the default source.
+            sources=["microphone", "upload"],
+            label="Upload a recording or record a question",
+        ),
     ],
     outputs="text",
     title="FAAM-demo | Vision Language Model | SmolVLM",
+    description="Welcome to the FAAM-demo!",
+    theme=custom_citrus,
 )
+# Launch the interface only when run as a script, so importing this module
+# (e.g. from a test) does not start the web server
+if __name__ == "__main__":
+    iface.launch()

index.html DELETED Viewed

@@ -1,25 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-	<meta charset="UTF-8">
-	<meta name="viewport" content="width=device-width, initial-scale=1.0">
-	<title>SmolVLM WebGPU</title>
-	<link rel="stylesheet" href="styles.css">
-</head>
-<body>
-	<h1>SmolVLM - Vision-Language Model</h1>
-	<div id="app">
-		<canvas id="webgpu-canvas"></canvas>
-		<div id="controls">
-			<input type="file" id="image-upload" accept="image/*">
-			<input type="text" id="question" placeholder="Ask a question about the image">
-			<button id="submit-btn">Submit</button>
-		</div>
-		<div id="answer">Answer will appear here</div>
-	</div>
-	<script type="module" src="./src/main.js"></script>
-</body>
-</html>

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
 torch
 transformers
-gradio

 torch
 transformers
+gradio
+pillow
+numpy
+librosa

src/main.js DELETED Viewed

@@ -1,83 +0,0 @@
-async function initializeWebGPU() {
-    const canvas = document.getElementById("webgpu-canvas");
-    if (!navigator.gpu) {
-        document.body.innerHTML = "<p>Your browser does not support WebGPU.</p>";
-        return;
-    }
-    console.log("WebGPU is supported.");
-    const adapter = await navigator.gpu.requestAdapter();
-    if (!adapter) {
-        console.error("Failed to get GPU adapter.");
-        return;
-    }
-    console.log("GPU adapter obtained.");
-    const device = await adapter.requestDevice();
-    if (!device) {
-        console.error("Failed to get GPU device.");
-        return;
-    }
-    console.log("GPU device obtained.");
-    const context = canvas.getContext("webgpu");
-    if (!context) {
-        console.error("Failed to get WebGPU context.");
-        return;
-    }
-    console.log("WebGPU context obtained.");
-    context.configure({
-        device: device,
-        format: navigator.gpu.getPreferredCanvasFormat(),
-        alphaMode: "opaque",
-    });
-    console.log("WebGPU initialized and canvas configured.");
-}
-// Call the initializeWebGPU function to ensure it runs
-initializeWebGPU();
-async function submitQuestion(imageFile, question) {
-    const formData = new FormData();
-    formData.append("image", imageFile);
-    formData.append("text", question);
-    try {
-        const response = await fetch("/predict", {
-            method: "POST",
-            body: formData,
-        });
-        if (!response.ok) {
-            const errorText = await response.text();
-            console.error("Failed to get a response:", response.status, response.statusText, errorText);
-            return `Error: Unable to fetch the answer. Status: ${response.status}, ${response.statusText}`;
-        }
-        const result = await response.json();
-        return result.data[0];
-    } catch (error) {
-        console.error("Fetch error:", error);
-        return `Error: Unable to fetch the answer. ${error.message}`;
-    }
-}
-// Handle user interactions
-document.getElementById("submit-btn").addEventListener("click", async () => {
-    const imageFile = document.getElementById("image-upload").files[0];
-    if (!imageFile) {
-        alert("Please upload an image.");
-        return;
-    }
-    const question = document.getElementById("question").value;
-    const answer = await submitQuestion(imageFile, question);
-    document.getElementById("answer").innerText = `Answer: ${answer}`;
-});
-// Initialize WebGPU when the page loads
-initializeWebGPU();