MaykaGR committed on
Commit
fd8a558
verified
1 Parent(s): 04c36a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -24
app.py CHANGED
@@ -3,14 +3,19 @@ import gradio as gr
3
  import requests
4
  from PIL import Image
5
  from transformers import BlipProcessor, BlipForConditionalGeneration
6
- import torch
7
- import soundfile as sf
8
- from diffusers import StableAudioPipeline
9
- import os
10
- from huggingface_hub import login
 
 
 
 
 
 
11
 
12
 
13
- login(token=os.environ["HF_TOKEN"])
14
 
15
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
16
  model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cpu")
@@ -41,25 +46,17 @@ with gr.Blocks(theme=gr.themes.Ocean(primary_hue="pink", neutral_hue="indigo", f
41
  return processor.decode(out[0], skip_special_tokens=True)
42
 
43
  def leer(texto):
44
- prompt = texto
45
- negative_prompt = "Low quality."
46
-
47
- generator = torch.Generator("cpu").manual_seed(0)
48
-
49
- audio = pipe(
50
- prompt,
51
- negative_prompt=negative_prompt,
52
- num_inference_steps=200,
53
- audio_end_in_s=10.0,
54
- num_waveforms_per_prompt=3,
55
- generator=generator,
56
- ).audios
57
 
58
- salida = audio[0].T.float().cpu().numpy()
59
- print("Fijar audio")
60
- sf.write("demo.wav", salida, pipe.vae.sampling_rate)
61
- print("sf writed")
62
- return sf.read("demo.wav")
63
 
64
 
65
  button.click(describir, [textbox], output)
 
3
  import requests
4
  from PIL import Image
5
  from transformers import BlipProcessor, BlipForConditionalGeneration
6
+ from outetts.v0_1.interface import InterfaceHF, InterfaceGGUF
7
+
8
+ # Initialize the interface with the Hugging Face model
9
+ interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
10
+
11
+ # Or initialize the interface with a GGUF model
12
+ # interface = InterfaceGGUF("path/to/model.gguf")
13
+
14
+
15
+
16
+
17
 
18
 
 
19
 
20
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
21
  model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cpu")
 
46
  return processor.decode(out[0], skip_special_tokens=True)
47
 
48
def leer(texto):
    """Synthesize speech for *texto* with the module-level OuteTTS interface.

    Returns the raw generation output object produced by
    ``interface.generate`` (not a decoded waveform).
    """
    # Low temperature and a mild repetition penalty keep the synthesized
    # speech stable and non-repetitive; 4096 caps the token budget.
    resultado = interface.generate(
        text=texto,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4096,
    )
    # NOTE(review): the output object also supports .play() / .save("output.wav")
    # if a file or direct playback is ever needed here.
    return resultado
 
 
 
 
60
 
61
 
62
  button.click(describir, [textbox], output)