Spaces:

aiivar
/

Transformers

Sleeping

App Files Files Community

MinAA committed on Jan 9

Commit

3228848

1 Parent(s): 05f7dda

cleanup

Browse files

Files changed (1) hide show

app.py +159 -13

app.py CHANGED Viewed

@@ -302,11 +302,28 @@ def audio_zero_shot_classifier(audio, candidate_labels, model_name):
         labels = [label.strip() for label in candidate_labels.split(",")]
         result = classifier(audio, candidate_labels=labels)
         output = "Результаты классификации:\n"
-        for label, score in zip(result['labels'], result['scores']):
-            output += f"{label}: {score:.4f}\n"
         return output
     except Exception as e:
-        return f"Ошибка: {str(e)}"
 @measure_time_and_save("Распознавание речи")
 def speech_recognition(audio, model_name):
@@ -331,7 +348,57 @@ def speech_synthesis(text, model_name):
         if not text or not text.strip():
             raise ValueError("Текст для синтеза не может быть пустым")
-        # Используем стандартный pipeline
         tts = get_pipeline("text-to-speech", model_name)
         result = tts(text)
@@ -389,9 +456,18 @@ def speech_synthesis(text, model_name):
             return (sample_rate, audio_data)
         else:
-            raise ValueError(f"Неожиданный формат результата от pipeline: {type(result)}")
     except Exception as e:
-        raise Exception(f"Ошибка синтеза речи: {str(e)}")
 # ==================== ЗАДАЧИ С ИЗОБРАЖЕНИЯМИ ====================
@@ -696,15 +772,85 @@ def visual_qa(image, question, model_name):
 def image_zero_shot_classification(image, candidate_labels, model_name):
     """Zero-shot классификация изображений"""
     try:
-        classifier = get_pipeline("zero-shot-image-classification", model_name)
         labels = [label.strip() for label in candidate_labels.split(",")]
-        result = classifier(image, candidate_labels=labels)
-        output = "Результаты классификации:\n"
-        for label, score in zip(result['labels'], result['scores']):
-            output += f"{label}: {score:.4f}\n"
-        return output
     except Exception as e:
-        return f"Ошибка: {str(e)}"
 # ==================== ФУНКЦИИ ДЛЯ ИСТОРИИ ====================

         labels = [label.strip() for label in candidate_labels.split(",")]
         result = classifier(audio, candidate_labels=labels)
         output = "Результаты классификации:\n"
+        # Обрабатываем разные форматы результатов
+        if isinstance(result, dict) and 'labels' in result and 'scores' in result:
+            # Формат: {'labels': [...], 'scores': [...]}
+            for label, score in zip(result['labels'], result['scores']):
+                output += f"{label}: {score:.4f}\n"
+        elif isinstance(result, list):
+            # Формат: [{'label': '...', 'score': ...}, ...]
+            for item in result:
+                if isinstance(item, dict):
+                    label = item.get('label', '')
+                    score = item.get('score', 0.0)
+                    output += f"{label}: {score:.4f}\n"
+        else:
+            return f"Ошибка: Неожиданный формат результата от pipeline: {type(result)}. Ожидался словарь с ключами 'labels' и 'scores' или список словарей."
         return output
     except Exception as e:
+        error_msg = str(e)
+        if "Could not load model" in error_msg or "Unrecognized" in error_msg:
+            return f"Ошибка: Модель '{model_name}' не поддерживается для zero-shot классификации аудио. Попробуйте другую модель, например 'laion/clap-htsat-unfused'."
+        return f"Ошибка: {error_msg}"
 @measure_time_and_save("Распознавание речи")
 def speech_recognition(audio, model_name):
         if not text or not text.strip():
             raise ValueError("Текст для синтеза не может быть пустым")
+        # Проверяем, является ли модель SpeechT5
+        if "speecht5" in model_name.lower():
+            try:
+                # Для SpeechT5 нужны speaker_embeddings
+                from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+                cache_key = f"tts_speecht5_{model_name}"
+                cached = model_cache.get(cache_key)
+                if cached is None:
+                    processor = SpeechT5Processor.from_pretrained(model_name)
+                    model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
+                    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+                    # Генерируем speaker embeddings используя модель напрямую
+                    # Используем размерность speaker embeddings из конфигурации модели
+                    speaker_embedding_dim = model.config.speaker_embedding_dim
+                    # Создаем случайный speaker embedding (можно заменить на предобученный)
+                    # Для более стабильного результата используем нормализованный случайный вектор
+                    speaker_embeddings = torch.randn(1, speaker_embedding_dim)
+                    speaker_embeddings = speaker_embeddings / torch.norm(speaker_embeddings, dim=1, keepdim=True)
+                    cached = (processor, model, vocoder, speaker_embeddings)
+                    model_cache.put(cache_key, cached)
+                processor, model, vocoder, speaker_embeddings = cached
+                inputs = processor(text=text, return_tensors="pt")
+                with torch.no_grad():
+                    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+                # Конвертируем в numpy и нормализуем
+                audio_data = speech.numpy()
+                # Убеждаемся, что это 1D массив
+                if len(audio_data.shape) > 1:
+                    audio_data = audio_data.flatten()
+                # Нормализуем в диапазон [-1, 1] если нужно
+                if audio_data.dtype != np.float32:
+                    audio_data = audio_data.astype(np.float32)
+                # Нормализуем если значения выходят за пределы [-1, 1]
+                max_val = np.abs(audio_data).max()
+                if max_val > 1.0:
+                    audio_data = audio_data / max_val
+                sample_rate = 16000
+                return (sample_rate, audio_data)
+            except Exception as e:
+                error_msg = str(e)
+                if "ImportError" in str(type(e)) or "ModuleNotFoundError" in str(type(e)):
+                    raise Exception(f"Ошибка: Не удалось импортировать необходимые модули для SpeechT5. Убедитесь, что transformers установлен: {error_msg}")
+                raise Exception(f"Ошибка синтеза речи с SpeechT5: {error_msg}")
+        # Используем стандартный pipeline для других моделей
         tts = get_pipeline("text-to-speech", model_name)
         result = tts(text)
             return (sample_rate, audio_data)
         else:
+            raise ValueError(f"Неожиданный формат результата от pipeline: {type(result)}. Ожидался словарь с ключами 'audio' и 'sampling_rate' или кортеж (sample_rate, audio_data).")
     except Exception as e:
+        error_msg = str(e)
+        if "speaker_embeddings" in error_msg.lower():
+            if "speecht5" in model_name.lower():
+                return f"Ошибка: Модель SpeechT5 требует speaker_embeddings. Они должны генерироваться автоматически, но произошла ошибка: {error_msg}"
+            return f"Ошибка: Модель '{model_name}' требует speaker_embeddings. Для SpeechT5 они генерируются автоматически, но для других моделей может потребоваться дополнительная настройка."
+        if "does not appear to have a file named" in error_msg or "Unrecognized model" in error_msg:
+            return f"Ошибка: Модель '{model_name}' не поддерживается библиотекой transformers для синтеза речи. Попробуйте использовать модель 'microsoft/speecht5_tts'."
+        if "negative output size" in error_msg.lower() or "input size 0" in error_msg.lower():
+            return f"Ошибка: Проблема с обработкой текста моделью '{model_name}'. Возможные причины: неподдерживаемый язык, пустой текст после обработки, или проблема с токенизацией. Попробуйте использовать другой текст или модель."
+        raise Exception(f"Ошибка синтеза речи: {error_msg}")
 # ==================== ЗАДАЧИ С ИЗОБРАЖЕНИЯМИ ====================
 def image_zero_shot_classification(image, candidate_labels, model_name):
     """Zero-shot классификация изображений"""
     try:
         labels = [label.strip() for label in candidate_labels.split(",")]
+        # Проверяем, является ли модель LAION
+        if "laion/" in model_name.lower() or "laion5b" in model_name.lower():
+            # Используем OpenCLIP для LAION моделей
+            import open_clip
+            cache_key = f"clip_laion_{model_name}"
+            cached = model_cache.get(cache_key)
+            if cached is None:
+                # Определяем имя модели и веса для OpenCLIP
+                if "xlm-roberta-base-ViT-B-32" in model_name or "xlm-roberta-base" in model_name:
+                    clip_model_name = "xlm-roberta-base-ViT-B-32"
+                    pretrained = "laion5b_s13b_b90k"
+                else:
+                    # Пытаемся извлечь информацию из имени модели
+                    clip_model_name = "xlm-roberta-base-ViT-B-32"
+                    pretrained = "laion5b_s13b_b90k"
+                model, _, preprocess = open_clip.create_model_and_transforms(
+                    clip_model_name,
+                    pretrained=pretrained
+                )
+                tokenizer = open_clip.get_tokenizer(clip_model_name)
+                model.eval()
+                cached = (model, preprocess, tokenizer)
+                model_cache.put(cache_key, cached)
+            model, preprocess, tokenizer = cached
+            # Обрабатываем изображение и тексты
+            image_tensor = preprocess(image).unsqueeze(0)
+            text_tokens = tokenizer(labels)
+            with torch.no_grad():
+                image_features = model.encode_image(image_tensor)
+                text_features = model.encode_text(text_tokens)
+                # Нормализуем признаки
+                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+                # Вычисляем косинусное сходство (логиты)
+                logits_per_image = (image_features @ text_features.T) * 100  # Масштабируем для лучшей точности
+                probs = logits_per_image.softmax(dim=1)
+            output = "Результаты классификации:\n"
+            for label, prob in zip(labels, probs[0]):
+                output += f"{label}: {prob.item():.4f}\n"
+            return output
+        else:
+            # Используем стандартный pipeline
+            classifier = get_pipeline("zero-shot-image-classification", model_name)
+            result = classifier(image, candidate_labels=labels)
+            output = "Результаты классификации:\n"
+            # Обрабатываем разные форматы результатов
+            if isinstance(result, dict) and 'labels' in result and 'scores' in result:
+                # Формат: {'labels': [...], 'scores': [...]}
+                for label, score in zip(result['labels'], result['scores']):
+                    output += f"{label}: {score:.4f}\n"
+            elif isinstance(result, list):
+                # Формат: [{'label': '...', 'score': ...}, ...]
+                for item in result:
+                    if isinstance(item, dict):
+                        label = item.get('label', '')
+                        score = item.get('score', 0.0)
+                        output += f"{label}: {score:.4f}\n"
+            else:
+                return f"Ошибка: Неожиданный формат результата от pipeline: {type(result)}. Ожидался словарь с ключами 'labels' и 'scores' или список словарей."
+            return output
     except Exception as e:
+        error_msg = str(e)
+        if "Could not load model" in error_msg or "Unrecognized" in error_msg:
+            if "laion" in model_name.lower():
+                return f"Ошибка: Модель '{model_name}' требует библиотеку open-clip-torch. Убедитесь, что она установлена: pip install open-clip-torch"
+            return f"Ошибка: Модель '{model_name}' не поддерживается для zero-shot классификации изображений. Попробуйте другую модель, например 'openai/clip-vit-base-patch32'."
+        if "open_clip" in error_msg or "open-clip" in error_msg:
+            return f"Ошибка: Для работы с LAION моделями требуется библиотека open-clip-torch. Установите её: pip install open-clip-torch"
+        return f"Ошибка: {error_msg}"
 # ==================== ФУНКЦИИ ДЛЯ ИСТОРИИ ====================