Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
|
| 3 |
from diffusers import StableDiffusionPipeline
|
| 4 |
import torch
|
| 5 |
-
|
| 6 |
# Step 1: Prompt-to-Prompt Generation using BART (or any LLM except GPT or DeepSeek)
|
| 7 |
prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")
|
| 8 |
|
|
@@ -24,25 +24,13 @@ def generate_image(prompt: str):
|
|
| 24 |
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
|
| 25 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
|
| 26 |
|
| 27 |
-
import librosa # For handling audio files
|
| 28 |
-
|
| 29 |
def transcribe_audio(audio):
    """Transcribe microphone or uploaded audio to text with Whisper.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | np.ndarray | str
        Either the ``(sample_rate, data)`` tuple produced by
        ``gr.Audio(type="numpy")``, a bare waveform array, or a file path
        for an uploaded clip.

    Returns
    -------
    str
        The decoded transcription.
    """
    # gr.Audio(type="numpy") yields (sample_rate, data); the original
    # isinstance(np.ndarray) check misrouted that tuple into librosa.load.
    if isinstance(audio, tuple):
        sample_rate, audio = audio
    else:
        sample_rate = 16000  # assumed when no rate is provided — TODO confirm upstream

    if isinstance(audio, np.ndarray):
        waveform = np.asarray(audio, dtype=np.float32)
        if waveform.ndim > 1:
            # Mix stereo down to mono; Whisper expects a 1-D waveform.
            waveform = waveform.mean(axis=1)
        if np.abs(waveform).max() > 1.0:
            # Gradio records int16 PCM; normalize to the [-1, 1] float
            # range the Whisper feature extractor expects.
            waveform = waveform / 32768.0
        if sample_rate != 16000:
            # Whisper is trained on 16 kHz audio. Lightweight linear
            # resample to avoid an extra dependency here — swap in a
            # proper resampler if transcription quality matters.
            target_len = int(len(waveform) * 16000 / sample_rate)
            waveform = np.interp(
                np.linspace(0.0, len(waveform) - 1, num=target_len),
                np.arange(len(waveform)),
                waveform,
            )
            sample_rate = 16000
    else:
        # File path (upload): librosa loads and resamples to 16 kHz floats.
        waveform, sample_rate = librosa.load(audio, sr=16000)

    # Pass sampling_rate explicitly so the feature extractor can validate
    # the rate instead of silently assuming 16 kHz.
    audio_input = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # Transcribe the audio.
    predicted_ids = model.generate(audio_input)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
|
| 44 |
|
| 45 |
-
|
| 46 |
# Step 3: Gradio Interface with Multiple Controllers (Textbox, Slider, Checkbox, Audio)
|
| 47 |
def process_input(description: str, creativity: float, include_background: bool):
|
| 48 |
# Generate a detailed prompt
|
|
@@ -72,7 +60,6 @@ background_checkbox = gr.Checkbox(label="Include Background", value=True)
|
|
| 72 |
|
| 73 |
audio_input = gr.Audio(type="numpy", label="Speak your Description")
|
| 74 |
|
| 75 |
-
|
| 76 |
# Create interface with both text and audio inputs
|
| 77 |
interface = gr.Interface(
|
| 78 |
fn=process_input,
|
|
|
|
| 2 |
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
|
| 3 |
from diffusers import StableDiffusionPipeline
|
| 4 |
import torch
|
| 5 |
+
|
| 6 |
# Step 1: Prompt-to-Prompt Generation using BART (or any LLM except GPT or DeepSeek)
|
| 7 |
prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")
|
| 8 |
|
|
|
|
| 24 |
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
|
| 25 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
|
| 26 |
|
|
|
|
|
|
|
| 27 |
def transcribe_audio(audio):
    """Transcribe spoken audio to text with Whisper.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | np.ndarray
        Audio from a ``gr.Audio(type="numpy")`` component, which yields a
        ``(sample_rate, data)`` tuple, or a bare waveform array.

    Returns
    -------
    str
        The decoded transcription.
    """
    # gr.Audio(type="numpy") delivers (sample_rate, data); the previous
    # code passed the whole tuple straight into the processor.
    if isinstance(audio, tuple):
        sample_rate, waveform = audio
    else:
        sample_rate, waveform = 16000, audio  # assumed rate — TODO confirm

    waveform = np.asarray(waveform, dtype=np.float32)
    if waveform.ndim > 1:
        # Mix stereo down to mono; Whisper expects a 1-D waveform.
        waveform = waveform.mean(axis=1)
    if np.abs(waveform).max() > 1.0:
        # Gradio records int16 PCM; normalize to the [-1, 1] float range
        # the Whisper feature extractor expects.
        waveform = waveform / 32768.0
    if sample_rate != 16000:
        # Whisper is trained on 16 kHz audio. Lightweight linear resample
        # to avoid an extra dependency — swap in a proper resampler if
        # transcription quality matters.
        target_len = int(len(waveform) * 16000 / sample_rate)
        waveform = np.interp(
            np.linspace(0.0, len(waveform) - 1, num=target_len),
            np.arange(len(waveform)),
            waveform,
        )
        sample_rate = 16000

    # Pass sampling_rate explicitly so the feature extractor can validate
    # the rate instead of silently assuming 16 kHz.
    audio_input = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    predicted_ids = model.generate(audio_input)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
|
| 33 |
|
|
|
|
| 34 |
# Step 3: Gradio Interface with Multiple Controllers (Textbox, Slider, Checkbox, Audio)
|
| 35 |
def process_input(description: str, creativity: float, include_background: bool):
|
| 36 |
# Generate a detailed prompt
|
|
|
|
| 60 |
|
| 61 |
audio_input = gr.Audio(type="numpy", label="Speak your Description")
|
| 62 |
|
|
|
|
| 63 |
# Create interface with both text and audio inputs
|
| 64 |
interface = gr.Interface(
|
| 65 |
fn=process_input,
|