# Repository: DRDELATV2025
# Update: Configurar modelo Epicuro para Hugging Face con archivos de configuración correctos
# Commit: a22c618
#!/usr/bin/env python3
"""
🎙️ Modelo Epicuro - AI model for podcasts.

Complete AI system for transcription, voice synthesis and podcast
content generation.
"""
import json
import warnings
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    PretrainedConfig,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# Silence noisy third-party deprecation warnings for the demo.
warnings.filterwarnings("ignore")
class EpicuroConfig(PretrainedConfig):
    """Configuration for the Epicuro model.

    Mirrors a standard BERT/GPT-style transformer configuration, plus
    audio-specific fields (sample rate, supported languages, voice styles).
    """

    model_type = "epicuro"

    def __init__(
        self,
        vocab_size=50257,
        hidden_size=768,
        num_attention_heads=12,
        num_hidden_layers=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        sample_rate=22050,
        max_length=512,
        supported_languages=None,
        voice_styles=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.sample_rate = sample_rate
        self.max_length = max_length
        # Mutable defaults must not live in the signature: list defaults
        # would be shared across every instance. Use None sentinels and
        # build a fresh list per instance instead.
        self.supported_languages = (
            ["es", "en"] if supported_languages is None else supported_languages
        )
        self.voice_styles = (
            ["neutral", "happy", "sad", "angry", "fearful"]
            if voice_styles is None else voice_styles
        )
class EpicuroTokenizer(PreTrainedTokenizer):
    """Minimal whitespace tokenizer for the Epicuro model.

    Splits on whitespace and maps unknown tokens to ``<unk>``. The vocab
    holds only the four special tokens, so this is a placeholder rather
    than a real subword tokenizer.
    """

    def __init__(self, **kwargs):
        # Build the vocab BEFORE calling super().__init__(): the HF base
        # class may invoke the _convert_token_to_id hooks while
        # registering special tokens during initialization.
        self.vocab = {
            "<pad>": 0,
            "<unk>": 1,
            "<s>": 2,
            "</s>": 3
        }
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Naive whitespace tokenization.
        return text.split()

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab["<unk>"])

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, "<unk>")

    def get_vocab(self):
        return self.vocab

    @property
    def vocab_size(self):
        # PreTrainedTokenizer declares vocab_size as a property; defining
        # it as a plain method shadows the base property and breaks
        # len(tokenizer) and other HF internals.
        return len(self.vocab)
class EpicuroModel(PreTrainedModel):
    """
    Main Podcast Epicuro model.

    Bundles three pretrained pipelines behind one interface:
    speech-to-text (wav2vec2 CTC), text-to-speech (SpeechT5 + HiFi-GAN
    vocoder) and conversational text generation (DialoGPT).
    """

    config_class = EpicuroConfig

    # Common Spanish function words used by the naive language detector.
    _SPANISH_WORDS = frozenset([
        'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no',
        'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al',
        'del', 'los', 'las', 'una', 'como', 'más', 'pero', 'sus', 'ha',
        'me', 'si', 'sin', 'sobre', 'este', 'ya', 'entre', 'cuando',
        'todo', 'esta', 'ser', 'dos', 'también', 'fue', 'había', 'era',
        'muy', 'años', 'hasta', 'desde', 'está', 'mi', 'porque', 'qué',
        'sólo', 'han', 'yo', 'hay', 'vez', 'puede', 'todos', 'así',
        'nos', 'ni', 'parte', 'tiene', 'él', 'uno', 'donde', 'bien',
        'tiempo', 'mismo', 'ese', 'ahora', 'cada', 'e', 'vida', 'otro',
        'después', 'otros', 'aunque', 'esa', 'esos', 'estas', 'les',
        'nosotros', 'nuestro', 'nuestra', 'nuestros', 'nuestras',
        'vosotros', 'vuestro', 'vuestra', 'vuestros', 'vuestras',
        'ellos', 'ellas', 'suyo', 'suya', 'suyos', 'suyas', 'mío',
        'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas',
    ])

    def __init__(self, config):
        """Initialize the model shell; heavy weights load via load_models().

        Args:
            config: an EpicuroConfig, or a plain dict of settings (the
                __main__ demo passes a dict). Dicts are promoted to
                EpicuroConfig because PreTrainedModel.__init__ requires a
                PretrainedConfig instance; extra keys (model_name,
                version, ...) are absorbed by the config's **kwargs.
        """
        if isinstance(config, dict):
            config = EpicuroConfig(**config)
        super().__init__(config)
        self.config = config
        # Model components — all lazily populated by load_models().
        self.whisper_model = None
        self.whisper_processor = None
        self.tts_model = None
        self.tts_processor = None
        self.vocoder = None
        self.text_generator = None
        self.text_tokenizer = None
        # Audio configuration. PretrainedConfig has no .get(), so read
        # with getattr (the original config.get(...) crashed on configs).
        self.sample_rate = getattr(config, 'sample_rate', 22050)
        self.max_length = getattr(config, 'max_length', 512)
        print("🎙️ Inicializando Modelo Epicuro...")

    def load_models(self):
        """Load every sub-model from the Hugging Face hub.

        Returns:
            bool: True if all models loaded, False on any failure.
        """
        print("🔄 Cargando modelos de IA...")
        try:
            # Speech-to-text. NOTE(review): despite the attribute names
            # this is wav2vec2, not Whisper; xlsr-53 is a pretrained (not
            # CTC-fine-tuned) checkpoint, so transcription quality may be
            # poor — consider a Spanish fine-tuned checkpoint.
            self.whisper_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            self.whisper_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            # Text-to-speech (SpeechT5) plus HiFi-GAN vocoder.
            self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            # Text generation (DialoGPT).
            self.text_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
            self.text_generator = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
            print("✅ Modelos cargados exitosamente!")
            return True
        except Exception as e:
            print(f"❌ Error cargando modelos: {e}")
            return False

    def transcribe_audio(self, audio_path: str) -> Dict[str, Union[str, float, List]]:
        """Transcribe an audio file to text.

        Args:
            audio_path: path to any audio file readable by librosa.

        Returns:
            dict with 'text', 'language', 'confidence', 'duration' and
            'timestamp'; on failure 'text' is empty and 'error' is set.
        """
        try:
            # wav2vec2 expects 16 kHz mono input.
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
            with torch.no_grad():
                logits = self.whisper_model(inputs.input_values).logits
            # Greedy CTC decoding.
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = self.whisper_processor.batch_decode(predicted_ids)[0]
            language = self._detect_language(transcription)
            return {
                'text': transcription.strip(),
                'language': language,
                # Coarse confidence: the single highest softmax probability
                # over all frames, not a per-utterance likelihood.
                'confidence': float(torch.max(torch.softmax(logits, dim=-1)).item()),
                'duration': len(audio) / sr,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'text': '',
                'language': 'unknown',
                'confidence': 0.0,
                'duration': 0.0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def synthesize_speech(self, text: str, voice_style: str = "neutral") -> Dict[str, Union[str, float, np.ndarray]]:
        """Synthesize text to speech with SpeechT5.

        Args:
            text: text to speak; empty/whitespace text yields an error dict.
            voice_style: style label echoed back in the result (not yet
                applied to the synthesis itself).

        Returns:
            dict with 'audio' (numpy array or None), 'sample_rate',
            'duration' and metadata; 'error' is set on failure.
        """
        try:
            if not text.strip():
                return {
                    'audio': None,
                    'sample_rate': self.sample_rate,
                    'duration': 0.0,
                    'error': 'Texto vacío',
                    'timestamp': datetime.now().isoformat()
                }
            inputs = self.tts_processor(text=text, return_tensors="pt")
            with torch.no_grad():
                # NOTE(review): SpeechT5 TTS normally needs a (1, 512)
                # speaker embedding; passing None may fail at runtime —
                # any such failure is caught and reported below. Confirm
                # against the SpeechT5 docs.
                speech = self.tts_model.generate_speech(
                    inputs["input_ids"],
                    self.vocoder,
                    speaker_embeddings=None
                )
            audio_np = speech.numpy()
            duration = len(audio_np) / self.sample_rate
            return {
                'audio': audio_np,
                'sample_rate': self.sample_rate,
                'duration': duration,
                'voice_style': voice_style,
                'text_length': len(text),
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'audio': None,
                'sample_rate': self.sample_rate,
                'duration': 0.0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def generate_podcast_content(self, topic: str, duration_minutes: int = 5) -> Dict[str, Union[str, int, List]]:
        """Generate a podcast script for a topic with the language model.

        Args:
            topic: subject of the episode.
            duration_minutes: requested episode length, only used in the
                prompt text.

        Returns:
            dict with 'script', 'topic', 'word_count' and metadata;
            'error' is set and 'script' is empty on failure.
        """
        try:
            prompt = f"Crear un guión de podcast sobre {topic} de {duration_minutes} minutos. El guión debe ser dinámico, entretenido y profesional."
            inputs = self.text_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = self.text_generator.generate(
                    inputs.input_ids,
                    max_length=512,
                    num_return_sequences=1,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=self.text_tokenizer.eos_token_id
                )
            generated_text = self.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Strip the echoed prompt so only the generated script remains.
            script = generated_text.replace(prompt, "").strip()
            return {
                'script': script,
                'topic': topic,
                'duration_minutes': duration_minutes,
                'word_count': len(script.split()),
                # Rough estimate: ~0.5 seconds of speech per word.
                'estimated_duration': len(script.split()) * 0.5,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'script': '',
                'topic': topic,
                'duration_minutes': duration_minutes,
                'word_count': 0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def process_podcast_episode(self, audio_path: str, target_voice: str = "neutral") -> Dict:
        """Run the full pipeline on one episode.

        Transcribes the audio, then derives a summary and tags, then
        re-synthesizes the transcript in the target voice.

        Returns:
            dict with 'success' plus either the pipeline outputs or an
            'error' message.
        """
        try:
            transcription = self.transcribe_audio(audio_path)
            if transcription.get('error'):
                return {
                    'success': False,
                    'error': transcription['error'],
                    'timestamp': datetime.now().isoformat()
                }
            summary = self._generate_summary(transcription['text'])
            tags = self._generate_tags(transcription['text'])
            voice_conversion = self.synthesize_speech(transcription['text'], target_voice)
            return {
                'success': True,
                'transcription': transcription,
                'summary': summary,
                'tags': tags,
                'voice_conversion': voice_conversion,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def _detect_language(self, text: str) -> str:
        """Naive language detection: 'es' vs 'en'.

        Counts distinct common Spanish function words in the text; more
        than 5 distinct hits is classified as Spanish.
        """
        # Match whole words, not substrings: the original `word in text`
        # check made one-letter entries like 'e', 'a' or 'y' match almost
        # any string, skewing every text towards Spanish.
        words = set(text.lower().split())
        spanish_count = len(words & self._SPANISH_WORDS)
        return 'es' if spanish_count > 5 else 'en'

    def _generate_summary(self, text: str) -> Dict[str, str]:
        """Build a crude extractive summary (first three sentences)."""
        try:
            sentences = text.split('.')
            summary = '. '.join(sentences[:3]) + '.'
            return {
                'summary': summary,
                'word_count': len(summary.split()),
                'original_word_count': len(text.split())
            }
        except Exception:
            # Fallback: plain truncation. Report the real word count of
            # the fallback summary instead of a hard-coded 0.
            fallback = text[:200] + '...'
            return {
                'summary': fallback,
                'word_count': len(fallback.split()),
                'original_word_count': len(text.split())
            }

    def _generate_tags(self, text: str) -> List[str]:
        """Derive coarse topic tags from keyword substring matches."""
        keyword_map = [
            ('tecnología', ['tecnología', 'tecnico', 'digital', 'software', 'hardware']),
            ('salud', ['salud', 'médico', 'medicina', 'clínica', 'doctor']),
            ('negocios', ['negocio', 'empresa', 'marketing', 'ventas']),
            ('educación', ['educación', 'aprender', 'estudio', 'universidad']),
            ('entretenimiento', ['entretenimiento', 'música', 'cine', 'arte']),
        ]
        text_lower = text.lower()
        tags = [tag for tag, keywords in keyword_map
                if any(word in text_lower for word in keywords)]
        return tags if tags else ['general']

    def save_model(self, path: str):
        """Save the configuration and any loaded sub-models under `path`.

        Returns:
            bool: True on success, False on any failure.
        """
        try:
            model_path = Path(path)
            model_path.mkdir(parents=True, exist_ok=True)
            # A PretrainedConfig is not directly JSON-serializable; use
            # to_dict() when available (plain dicts pass through dict()).
            config_dict = (self.config.to_dict()
                           if hasattr(self.config, 'to_dict')
                           else dict(self.config))
            with open(model_path / "config.json", "w") as f:
                json.dump(config_dict, f, indent=2)
            # Save sub-models only if they were loaded.
            if self.whisper_model:
                self.whisper_model.save_pretrained(model_path / "whisper")
                self.whisper_processor.save_pretrained(model_path / "whisper")
            if self.tts_model:
                self.tts_model.save_pretrained(model_path / "tts")
                self.tts_processor.save_pretrained(model_path / "tts")
            if self.text_generator:
                self.text_generator.save_pretrained(model_path / "text_generator")
                self.text_tokenizer.save_pretrained(model_path / "text_generator")
            print(f"✅ Modelo guardado en: {model_path}")
            return True
        except Exception as e:
            print(f"❌ Error guardando modelo: {e}")
            return False

    def load_model(self, path: str):
        """Load the configuration and sub-models previously saved by
        save_model().

        Returns:
            bool: True on success, False on any failure.
        """
        try:
            model_path = Path(path)
            with open(model_path / "config.json", "r") as f:
                # Re-wrap as EpicuroConfig so the rest of the class can
                # keep relying on attribute access.
                self.config = EpicuroConfig(**json.load(f))
            if (model_path / "whisper").exists():
                self.whisper_model = Wav2Vec2ForCTC.from_pretrained(model_path / "whisper")
                self.whisper_processor = Wav2Vec2Processor.from_pretrained(model_path / "whisper")
            if (model_path / "tts").exists():
                self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(model_path / "tts")
                self.tts_processor = SpeechT5Processor.from_pretrained(model_path / "tts")
                # The vocoder is not saved locally; fetch it from the hub.
                self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            if (model_path / "text_generator").exists():
                self.text_generator = AutoModelForCausalLM.from_pretrained(model_path / "text_generator")
                self.text_tokenizer = AutoTokenizer.from_pretrained(model_path / "text_generator")
            print(f"✅ Modelo cargado desde: {model_path}")
            return True
        except Exception as e:
            print(f"❌ Error cargando modelo: {e}")
            return False
# Default model metadata / configuration used by the __main__ demo below.
CONFIG = dict(
    model_name='modelo_epicuro',
    version='1.0.0',
    description='Modelo de IA para Podcast Epicuro - Transcripción, Síntesis de Voz y Generación de Contenido',
    author='DRDELATV2025',
    sample_rate=22050,
    max_length=512,
    supported_languages=['es', 'en'],
    voice_styles=['neutral', 'happy', 'sad', 'angry', 'fearful'],
    created_at=datetime.now().isoformat(),
)
if __name__ == "__main__":
    # Build the model and load every sub-model before the demo run.
    epicuro = EpicuroModel(CONFIG)
    if not epicuro.load_models():
        print("❌ Error inicializando el modelo")
    else:
        print("🎉 Modelo Epicuro listo para usar!")
        # Quick content-generation example.
        print("\n📝 Ejemplo de generación de contenido:")
        content = epicuro.generate_podcast_content("Inteligencia Artificial en Medicina", 5)
        print(f"Tema: {content['topic']}")
        print(f"Guion: {content['script'][:200]}...")