mi55th committed on
Commit
7edcdca
·
verified ·
1 Parent(s): d563ad4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -75
app.py CHANGED
@@ -20,29 +20,6 @@ from sentence_transformers import SentenceTransformer
20
  # Инициализация моделей (ленивая загрузка)
21
  models = {}
22
 
23
- def load_audio_model(model_name):
24
- if model_name not in models:
25
- if model_name == "whisper":
26
- models[model_name] = pipeline(
27
- "automatic-speech-recognition",
28
- model="openai/whisper-small"
29
- )
30
- elif model_name == "wav2vec2":
31
- models[model_name] = pipeline(
32
- "automatic-speech-recognition",
33
- model="bond005/wav2vec2-large-ru-golos"
34
- )
35
- elif model_name == "audio_classifier":
36
- models[model_name] = pipeline(
37
- "audio-classification",
38
- model="MIT/ast-finetuned-audioset-10-10-0.4593"
39
- )
40
- elif model_name == "emotion_classifier":
41
- models[model_name] = pipeline(
42
- "audio-classification",
43
- model="superb/hubert-large-superb-er"
44
- )
45
- return models[model_name]
46
 
47
  def load_image_model(model_name):
48
  if model_name not in models:
@@ -59,58 +36,6 @@ def load_image_model(model_name):
59
  models[f"{model_name}_processor"] = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
60
  return models[model_name]
61
 
62
- # Функции для обработки аудио
63
- def audio_classification(audio_file, model_type):
64
- classifier = load_audio_model(model_type)
65
- results = classifier(audio_file)
66
-
67
- output = "Топ-5 предсказаний:\n"
68
- for i, result in enumerate(results[:5]):
69
- output += f"{i+1}. {result['label']}: {result['score']:.4f}\n"
70
-
71
- return output
72
-
73
- def speech_recognition(audio_file, model_type):
74
- asr_pipeline = load_audio_model(model_type)
75
-
76
- if model_type == "whisper":
77
- result = asr_pipeline(audio_file, generate_kwargs={"language": "russian"})
78
- else:
79
- result = asr_pipeline(audio_file)
80
-
81
- return result['text']
82
-
83
- def text_to_speech(text, model_type):
84
- if model_type == "silero":
85
- # Silero TTS
86
- model, _ = torch.hub.load(repo_or_dir='snakers4/silero-models',
87
- model='silero_tts',
88
- language='ru',
89
- speaker='ru_v3')
90
-
91
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
92
- model.save_wav(text=text, speaker='aidar', sample_rate=48000, audio_path=f.name)
93
- return f.name
94
-
95
- elif model_type == "gtts":
96
- # Google TTS
97
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
98
- tts = gTTS(text=text, lang='ru')
99
- tts.save(f.name)
100
- return f.name
101
-
102
- elif model_type == "mms":
103
- # Facebook MMS TTS
104
- model = VitsModel.from_pretrained("facebook/mms-tts-rus")
105
- tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
106
-
107
- inputs = tokenizer(text, return_tensors="pt")
108
- with torch.no_grad():
109
- output = model(**inputs).waveform
110
-
111
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
112
- sf.write(f.name, output.numpy().squeeze(), model.config.sampling_rate)
113
- return f.name
114
 
115
  # Функции для обработки изображений
116
  def object_detection(image):
 
20
  # Инициализация моделей (ленивая загрузка)
21
  models = {}
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def load_image_model(model_name):
25
  if model_name not in models:
 
36
  models[f"{model_name}_processor"] = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
37
  return models[model_name]
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # Функции для обработки изображений
41
  def object_detection(image):