# modelo_epicuro / modelo_epicuro.py
# Author: DRDELATV2025
# Last update: Configurar modelo Epicuro para Hugging Face con archivos de configuración correctos
# Commit: a22c618
#!/usr/bin/env python3
"""
🎙️ Modelo Epicuro - Modelo de IA para Podcast
Sistema completo de IA para transcripción, síntesis de voz y generación de contenido
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
AutoTokenizer,
AutoModel,
Wav2Vec2ForCTC,
Wav2Vec2Processor,
SpeechT5Processor,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan,
AutoModelForCausalLM,
AutoConfig,
PreTrainedModel,
PreTrainedTokenizer,
PretrainedConfig
)
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
import json
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union
import warnings
warnings.filterwarnings("ignore")
class EpicuroConfig(PretrainedConfig):
    """Configuration for the Epicuro model.

    Standard Hugging Face transformer hyper-parameters plus audio settings
    (``sample_rate``) and podcast-specific metadata (``supported_languages``,
    ``voice_styles``).  Extra keyword arguments are forwarded to
    :class:`PretrainedConfig`.
    """
    model_type = "epicuro"

    def __init__(
        self,
        vocab_size=50257,
        hidden_size=768,
        num_attention_heads=12,
        num_hidden_layers=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        sample_rate=22050,
        max_length=512,
        supported_languages=None,
        voice_styles=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.sample_rate = sample_rate
        self.max_length = max_length
        # None sentinels instead of mutable list defaults: a list default
        # would be shared (and mutable) across every instance of the config.
        self.supported_languages = (
            ["es", "en"] if supported_languages is None else list(supported_languages)
        )
        self.voice_styles = (
            ["neutral", "happy", "sad", "angry", "fearful"]
            if voice_styles is None else list(voice_styles)
        )
class EpicuroTokenizer(PreTrainedTokenizer):
    """Minimal whitespace tokenizer for the Epicuro model.

    Only the four special tokens are in the vocabulary; every other token
    maps to ``<unk>``.  Intended as a placeholder/custom tokenizer shell.
    """

    def __init__(self, **kwargs):
        # Build the vocab BEFORE calling super().__init__():
        # PreTrainedTokenizer's initializer may invoke vocab-dependent hooks
        # (special-token registration, get_vocab) during construction.
        self.vocab = {
            "<pad>": 0,
            "<unk>": 1,
            "<s>": 2,
            "</s>": 3
        }
        self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}
        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Naive whitespace split.
        return text.split()

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to the <unk> id.
        return self.vocab.get(token, self.vocab["<unk>"])

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, "<unk>")

    def get_vocab(self):
        # Return a copy so callers cannot mutate the tokenizer's state.
        return dict(self.vocab)

    @property
    def vocab_size(self):
        # The HF base class exposes vocab_size as a *property*; overriding it
        # with a plain method breaks len(tokenizer) and library internals.
        return len(self.vocab)
class EpicuroModel(PreTrainedModel):
    """
    Main Epicuro podcast model.

    Bundles three pretrained pipelines behind a single interface:

    * transcription  -- wav2vec2 CTC (``whisper_model`` / ``whisper_processor``)
    * text-to-speech -- SpeechT5 + HiFi-GAN vocoder
    * text generation -- DialoGPT causal LM

    The heavy sub-models are loaded lazily by :meth:`load_models` (from the
    Hub) or :meth:`load_model` (from disk); until then the component
    attributes are ``None`` and the public methods return error dicts instead
    of raising.
    """
    config_class = EpicuroConfig

    # Common Spanish function words used by the naive language detector.
    _SPANISH_WORDS = frozenset([
        'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te',
        'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'del',
        'los', 'las', 'una', 'como', 'más', 'pero', 'sus', 'ha', 'me', 'si',
        'sin', 'sobre', 'este', 'ya', 'entre', 'cuando', 'todo', 'esta',
        'ser', 'dos', 'también', 'fue', 'había', 'era', 'muy', 'años',
        'hasta', 'desde', 'está', 'mi', 'porque', 'qué', 'sólo', 'han', 'yo',
        'hay', 'vez', 'puede', 'todos', 'así', 'nos', 'ni', 'parte', 'tiene',
        'él', 'uno', 'donde', 'bien', 'tiempo', 'mismo', 'ese', 'ahora',
        'cada', 'e', 'vida', 'otro', 'después', 'otros', 'aunque', 'esa',
        'esos', 'estas', 'les', 'nosotros', 'nuestro', 'nuestra', 'nuestros',
        'nuestras', 'vosotros', 'vuestro', 'vuestra', 'vuestros', 'vuestras',
        'ellos', 'ellas', 'suyo', 'suya', 'suyos', 'suyas', 'mío', 'mía',
        'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas',
    ])

    def __init__(self, config):
        # Accept either a plain dict (as the __main__ demo passes) or a
        # PretrainedConfig.  PreTrainedModel.__init__ requires the latter,
        # and the original `config.get(...)` calls only worked for dicts.
        if isinstance(config, dict):
            config = EpicuroConfig(**config)
        super().__init__(config)
        self.config = config
        # Model components; populated by load_models() / load_model().
        self.whisper_model = None
        self.whisper_processor = None
        self.tts_model = None
        self.tts_processor = None
        self.vocoder = None
        self.text_generator = None
        self.text_tokenizer = None
        # Audio settings (getattr keeps us safe for stripped-down configs).
        self.sample_rate = getattr(config, 'sample_rate', 22050)
        self.max_length = getattr(config, 'max_length', 512)
        print("🎙️ Inicializando Modelo Epicuro...")

    def load_models(self):
        """Download/load all sub-models from the Hugging Face Hub.

        Returns True on success, False (with a printed message) on failure.
        """
        print("🔄 Cargando modelos de IA...")
        try:
            # Transcription model (wav2vec2 CTC, multilingual).
            self.whisper_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            self.whisper_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            # Text-to-speech model (SpeechT5) plus its HiFi-GAN vocoder.
            self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            # Text-generation model.
            self.text_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
            self.text_generator = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
            print("✅ Modelos cargados exitosamente!")
            return True
        except Exception as e:
            print(f"❌ Error cargando modelos: {e}")
            return False

    def transcribe_audio(self, audio_path: str) -> Dict[str, Union[str, float, List]]:
        """
        Transcribe an audio file to text with the wav2vec2 CTC model.

        Parameters:
            audio_path: path to any audio file librosa can read.

        Returns:
            dict with 'text', 'language', 'confidence', 'duration' and
            'timestamp'; on failure 'text' is empty and an 'error' key
            carries the exception message.
        """
        try:
            # wav2vec2 expects 16 kHz mono input.
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
            with torch.no_grad():
                logits = self.whisper_model(inputs.input_values).logits
            # Greedy CTC decoding.
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = self.whisper_processor.batch_decode(predicted_ids)[0]
            language = self._detect_language(transcription)
            return {
                'text': transcription.strip(),
                # NOTE(review): 'confidence' is the single largest softmax
                # value over the whole logits tensor — a rough heuristic,
                # not a per-utterance probability.
                'language': language,
                'confidence': float(torch.max(torch.softmax(logits, dim=-1)).item()),
                'duration': len(audio) / sr,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            # Failures (missing file, models not loaded, ...) are reported
            # as data instead of raised, matching the sibling methods.
            return {
                'text': '',
                'language': 'unknown',
                'confidence': 0.0,
                'duration': 0.0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def synthesize_speech(self, text: str, voice_style: str = "neutral") -> Dict[str, Union[str, float, np.ndarray]]:
        """
        Synthesize speech for *text* with SpeechT5 + HiFi-GAN.

        NOTE(review): ``voice_style`` is currently only echoed back in the
        result; generation runs with ``speaker_embeddings=None`` so the
        style has no acoustic effect yet.
        """
        try:
            if not text.strip():
                # Empty/whitespace-only input: report instead of raising.
                return {
                    'audio': None,
                    'sample_rate': self.sample_rate,
                    'duration': 0.0,
                    'error': 'Texto vacío',
                    'timestamp': datetime.now().isoformat()
                }
            inputs = self.tts_processor(text=text, return_tensors="pt")
            with torch.no_grad():
                speech = self.tts_model.generate_speech(
                    inputs["input_ids"],
                    self.vocoder,
                    speaker_embeddings=None
                )
            audio_np = speech.numpy()
            duration = len(audio_np) / self.sample_rate
            return {
                'audio': audio_np,
                'sample_rate': self.sample_rate,
                'duration': duration,
                'voice_style': voice_style,
                'text_length': len(text),
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'audio': None,
                'sample_rate': self.sample_rate,
                'duration': 0.0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def generate_podcast_content(self, topic: str, duration_minutes: int = 5) -> Dict[str, Union[str, int, List]]:
        """
        Generate a podcast script about *topic* with the causal LM.

        Returns a dict with the cleaned 'script', word counts and an
        'estimated_duration' heuristic (0.5 s per word).
        """
        try:
            prompt = f"Crear un guión de podcast sobre {topic} de {duration_minutes} minutos. El guión debe ser dinámico, entretenido y profesional."
            inputs = self.text_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = self.text_generator.generate(
                    inputs.input_ids,
                    max_length=512,
                    num_return_sequences=1,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=self.text_tokenizer.eos_token_id
                )
            generated_text = self.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Strip the prompt so only the generated continuation remains.
            script = generated_text.replace(prompt, "").strip()
            return {
                'script': script,
                'topic': topic,
                'duration_minutes': duration_minutes,
                'word_count': len(script.split()),
                'estimated_duration': len(script.split()) * 0.5,  # seconds, rough heuristic
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'script': '',
                'topic': topic,
                'duration_minutes': duration_minutes,
                'word_count': 0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def process_podcast_episode(self, audio_path: str, target_voice: str = "neutral") -> Dict:
        """
        Process a full podcast episode: transcribe, summarize, tag and
        re-synthesize in the target voice.
        """
        try:
            transcription = self.transcribe_audio(audio_path)
            if transcription.get('error'):
                # Transcription failed; nothing downstream can run.
                return {
                    'success': False,
                    'error': transcription['error'],
                    'timestamp': datetime.now().isoformat()
                }
            summary = self._generate_summary(transcription['text'])
            tags = self._generate_tags(transcription['text'])
            voice_conversion = self.synthesize_speech(transcription['text'], target_voice)
            return {
                'success': True,
                'transcription': transcription,
                'summary': summary,
                'tags': tags,
                'voice_conversion': voice_conversion,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def _detect_language(self, text: str) -> str:
        """Very rough es/en detection by counting Spanish stop-words.

        Matching is done on whole whitespace-separated words; the original
        substring test ('e' in text, 'el' in "hello") matched almost any
        text and classified nearly everything as Spanish.
        """
        words = set(text.lower().split())
        spanish_count = len(words & self._SPANISH_WORDS)
        if spanish_count > 5:
            return 'es'
        return 'en'

    def _generate_summary(self, text: str) -> Dict[str, str]:
        """Return a naive summary (first three sentences) of *text*."""
        try:
            sentences = text.split('.')
            summary = '. '.join(sentences[:3]) + '.'
            return {
                'summary': summary,
                'word_count': len(summary.split()),
                'original_word_count': len(text.split())
            }
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); fall back to a plain truncation.
            return {
                'summary': text[:200] + '...',
                'word_count': 0,
                'original_word_count': len(text.split())
            }

    def _generate_tags(self, text: str) -> List[str]:
        """Derive coarse topic tags from keyword hits (default: 'general')."""
        keyword_map = [
            ('tecnología', ['tecnología', 'tecnico', 'digital', 'software', 'hardware']),
            ('salud', ['salud', 'médico', 'medicina', 'clínica', 'doctor']),
            ('negocios', ['negocio', 'empresa', 'marketing', 'ventas']),
            ('educación', ['educación', 'aprender', 'estudio', 'universidad']),
            ('entretenimiento', ['entretenimiento', 'música', 'cine', 'arte']),
        ]
        text_lower = text.lower()
        tags = [tag for tag, words in keyword_map
                if any(word in text_lower for word in words)]
        return tags if tags else ['general']

    def save_model(self, path: str):
        """Save the configuration and every loaded sub-model under *path*."""
        try:
            model_path = Path(path)
            model_path.mkdir(parents=True, exist_ok=True)
            # PretrainedConfig objects are not JSON-serializable directly;
            # use to_dict() when available (plain dicts pass through).
            cfg = self.config.to_dict() if hasattr(self.config, "to_dict") else self.config
            with open(model_path / "config.json", "w") as f:
                json.dump(cfg, f, indent=2)
            # Only persist the components that are actually loaded.
            if self.whisper_model:
                self.whisper_model.save_pretrained(model_path / "whisper")
                self.whisper_processor.save_pretrained(model_path / "whisper")
            if self.tts_model:
                self.tts_model.save_pretrained(model_path / "tts")
                self.tts_processor.save_pretrained(model_path / "tts")
            if self.text_generator:
                self.text_generator.save_pretrained(model_path / "text_generator")
                self.text_tokenizer.save_pretrained(model_path / "text_generator")
            print(f"✅ Modelo guardado en: {model_path}")
            return True
        except Exception as e:
            print(f"❌ Error guardando modelo: {e}")
            return False

    def load_model(self, path: str):
        """Load configuration and sub-models previously written by save_model."""
        try:
            model_path = Path(path)
            # Rehydrate the config as an EpicuroConfig so it keeps the same
            # attribute interface as a freshly constructed model.
            with open(model_path / "config.json", "r") as f:
                self.config = EpicuroConfig(**json.load(f))
            if (model_path / "whisper").exists():
                self.whisper_model = Wav2Vec2ForCTC.from_pretrained(model_path / "whisper")
                self.whisper_processor = Wav2Vec2Processor.from_pretrained(model_path / "whisper")
            if (model_path / "tts").exists():
                self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(model_path / "tts")
                self.tts_processor = SpeechT5Processor.from_pretrained(model_path / "tts")
                # The vocoder is not saved locally; fetch it from the Hub.
                self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            if (model_path / "text_generator").exists():
                self.text_generator = AutoModelForCausalLM.from_pretrained(model_path / "text_generator")
                self.text_tokenizer = AutoTokenizer.from_pretrained(model_path / "text_generator")
            print(f"✅ Modelo cargado desde: {model_path}")
            return True
        except Exception as e:
            print(f"❌ Error cargando modelo: {e}")
            return False
# Default configuration / package metadata.  This plain-dict form is what
# the __main__ demo hands to EpicuroModel.
CONFIG = dict(
    model_name='modelo_epicuro',
    version='1.0.0',
    description='Modelo de IA para Podcast Epicuro - Transcripción, Síntesis de Voz y Generación de Contenido',
    author='DRDELATV2025',
    sample_rate=22050,
    max_length=512,
    supported_languages=['es', 'en'],
    voice_styles=['neutral', 'happy', 'sad', 'angry', 'fearful'],
    created_at=datetime.now().isoformat(),  # stamped at import time
)
if __name__ == "__main__":
# Crear instancia del modelo
model = EpicuroModel(CONFIG)
# Cargar modelos
if model.load_models():
print("🎉 Modelo Epicuro listo para usar!")
# Ejemplo de uso
print("\n📝 Ejemplo de generación de contenido:")
content = model.generate_podcast_content("Inteligencia Artificial en Medicina", 5)
print(f"Tema: {content['topic']}")
print(f"Guion: {content['script'][:200]}...")
else:
print("❌ Error inicializando el modelo")