# Repository: DRDELATV2025
# Update: Configurar modelo Epicuro para Hugging Face con archivos de configuración correctos
# Commit: a22c618
#!/usr/bin/env python3
"""
🎙️ Modelo Epicuro - AI model for podcasts.

Complete AI system for transcription, voice synthesis and podcast
content generation.
"""
import json
import warnings
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    PretrainedConfig,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# Silence noisy third-party deprecation warnings for the demo.
warnings.filterwarnings("ignore")
class EpicuroConfig(PretrainedConfig):
    """Configuration for the Epicuro model.

    Mirrors a standard BERT/GPT-style transformer configuration, plus
    audio-specific fields (sample rate, supported languages, voice styles).
    """

    model_type = "epicuro"

    def __init__(
        self,
        vocab_size=50257,
        hidden_size=768,
        num_attention_heads=12,
        num_hidden_layers=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        sample_rate=22050,
        max_length=512,
        supported_languages=None,
        voice_styles=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.sample_rate = sample_rate
        self.max_length = max_length
        # Mutable defaults must not live in the signature: list defaults
        # would be shared across every instance. Use None sentinels and
        # build a fresh list per instance instead.
        self.supported_languages = (
            ["es", "en"] if supported_languages is None else supported_languages
        )
        self.voice_styles = (
            ["neutral", "happy", "sad", "angry", "fearful"]
            if voice_styles is None else voice_styles
        )
class EpicuroTokenizer(PreTrainedTokenizer):
    """Minimal whitespace tokenizer for the Epicuro model.

    Splits on whitespace and maps unknown tokens to ``<unk>``. The vocab
    holds only the four special tokens, so this is a placeholder rather
    than a real subword tokenizer.
    """

    def __init__(self, **kwargs):
        # Build the vocab BEFORE calling super().__init__(): the HF base
        # class may invoke the _convert_token_to_id hooks while
        # registering special tokens during initialization.
        self.vocab = {
            "<pad>": 0,
            "<unk>": 1,
            "<s>": 2,
            "</s>": 3
        }
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Naive whitespace tokenization.
        return text.split()

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab["<unk>"])

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, "<unk>")

    def get_vocab(self):
        return self.vocab

    @property
    def vocab_size(self):
        # PreTrainedTokenizer declares vocab_size as a property; defining
        # it as a plain method shadows the base property and breaks
        # len(tokenizer) and other HF internals.
        return len(self.vocab)
class EpicuroModel(PreTrainedModel):
    """
    Main Podcast Epicuro model.

    Bundles three pretrained pipelines behind one interface:
    speech-to-text (wav2vec2 CTC), text-to-speech (SpeechT5 + HiFi-GAN
    vocoder) and conversational text generation (DialoGPT).
    """

    config_class = EpicuroConfig

    # Common Spanish function words used by the naive language detector.
    _SPANISH_WORDS = frozenset([
        'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no',
        'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al',
        'del', 'los', 'las', 'una', 'como', 'más', 'pero', 'sus', 'ha',
        'me', 'si', 'sin', 'sobre', 'este', 'ya', 'entre', 'cuando',
        'todo', 'esta', 'ser', 'dos', 'también', 'fue', 'había', 'era',
        'muy', 'años', 'hasta', 'desde', 'está', 'mi', 'porque', 'qué',
        'sólo', 'han', 'yo', 'hay', 'vez', 'puede', 'todos', 'así',
        'nos', 'ni', 'parte', 'tiene', 'él', 'uno', 'donde', 'bien',
        'tiempo', 'mismo', 'ese', 'ahora', 'cada', 'e', 'vida', 'otro',
        'después', 'otros', 'aunque', 'esa', 'esos', 'estas', 'les',
        'nosotros', 'nuestro', 'nuestra', 'nuestros', 'nuestras',
        'vosotros', 'vuestro', 'vuestra', 'vuestros', 'vuestras',
        'ellos', 'ellas', 'suyo', 'suya', 'suyos', 'suyas', 'mío',
        'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas',
    ])

    def __init__(self, config):
        """Initialize the model shell; heavy weights load via load_models().

        Args:
            config: an EpicuroConfig, or a plain dict of settings (the
                __main__ demo passes a dict). Dicts are promoted to
                EpicuroConfig because PreTrainedModel.__init__ requires a
                PretrainedConfig instance; extra keys (model_name,
                version, ...) are absorbed by the config's **kwargs.
        """
        if isinstance(config, dict):
            config = EpicuroConfig(**config)
        super().__init__(config)
        self.config = config
        # Model components — all lazily populated by load_models().
        self.whisper_model = None
        self.whisper_processor = None
        self.tts_model = None
        self.tts_processor = None
        self.vocoder = None
        self.text_generator = None
        self.text_tokenizer = None
        # Audio configuration. PretrainedConfig has no .get(), so read
        # with getattr (the original config.get(...) crashed on configs).
        self.sample_rate = getattr(config, 'sample_rate', 22050)
        self.max_length = getattr(config, 'max_length', 512)
        print("🎙️ Inicializando Modelo Epicuro...")

    def load_models(self):
        """Load every sub-model from the Hugging Face hub.

        Returns:
            bool: True if all models loaded, False on any failure.
        """
        print("🔄 Cargando modelos de IA...")
        try:
            # Speech-to-text. NOTE(review): despite the attribute names
            # this is wav2vec2, not Whisper; xlsr-53 is a pretrained (not
            # CTC-fine-tuned) checkpoint, so transcription quality may be
            # poor — consider a Spanish fine-tuned checkpoint.
            self.whisper_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            self.whisper_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            # Text-to-speech (SpeechT5) plus HiFi-GAN vocoder.
            self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            # Text generation (DialoGPT).
            self.text_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
            self.text_generator = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
            print("✅ Modelos cargados exitosamente!")
            return True
        except Exception as e:
            print(f"❌ Error cargando modelos: {e}")
            return False

    def transcribe_audio(self, audio_path: str) -> Dict[str, Union[str, float, List]]:
        """Transcribe an audio file to text.

        Args:
            audio_path: path to any audio file readable by librosa.

        Returns:
            dict with 'text', 'language', 'confidence', 'duration' and
            'timestamp'; on failure 'text' is empty and 'error' is set.
        """
        try:
            # wav2vec2 expects 16 kHz mono input.
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
            with torch.no_grad():
                logits = self.whisper_model(inputs.input_values).logits
            # Greedy CTC decoding.
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = self.whisper_processor.batch_decode(predicted_ids)[0]
            language = self._detect_language(transcription)
            return {
                'text': transcription.strip(),
                'language': language,
                # Coarse confidence: the single highest softmax probability
                # over all frames, not a per-utterance likelihood.
                'confidence': float(torch.max(torch.softmax(logits, dim=-1)).item()),
                'duration': len(audio) / sr,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'text': '',
                'language': 'unknown',
                'confidence': 0.0,
                'duration': 0.0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def synthesize_speech(self, text: str, voice_style: str = "neutral") -> Dict[str, Union[str, float, np.ndarray]]:
        """Synthesize text to speech with SpeechT5.

        Args:
            text: text to speak; empty/whitespace text yields an error dict.
            voice_style: style label echoed back in the result (not yet
                applied to the synthesis itself).

        Returns:
            dict with 'audio' (numpy array or None), 'sample_rate',
            'duration' and metadata; 'error' is set on failure.
        """
        try:
            if not text.strip():
                return {
                    'audio': None,
                    'sample_rate': self.sample_rate,
                    'duration': 0.0,
                    'error': 'Texto vacío',
                    'timestamp': datetime.now().isoformat()
                }
            inputs = self.tts_processor(text=text, return_tensors="pt")
            with torch.no_grad():
                # NOTE(review): SpeechT5 TTS normally needs a (1, 512)
                # speaker embedding; passing None may fail at runtime —
                # any such failure is caught and reported below. Confirm
                # against the SpeechT5 docs.
                speech = self.tts_model.generate_speech(
                    inputs["input_ids"],
                    self.vocoder,
                    speaker_embeddings=None
                )
            audio_np = speech.numpy()
            duration = len(audio_np) / self.sample_rate
            return {
                'audio': audio_np,
                'sample_rate': self.sample_rate,
                'duration': duration,
                'voice_style': voice_style,
                'text_length': len(text),
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'audio': None,
                'sample_rate': self.sample_rate,
                'duration': 0.0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def generate_podcast_content(self, topic: str, duration_minutes: int = 5) -> Dict[str, Union[str, int, List]]:
        """Generate a podcast script for a topic with the language model.

        Args:
            topic: subject of the episode.
            duration_minutes: requested episode length, only used in the
                prompt text.

        Returns:
            dict with 'script', 'topic', 'word_count' and metadata;
            'error' is set and 'script' is empty on failure.
        """
        try:
            prompt = f"Crear un guión de podcast sobre {topic} de {duration_minutes} minutos. El guión debe ser dinámico, entretenido y profesional."
            inputs = self.text_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = self.text_generator.generate(
                    inputs.input_ids,
                    max_length=512,
                    num_return_sequences=1,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=self.text_tokenizer.eos_token_id
                )
            generated_text = self.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Strip the echoed prompt so only the generated script remains.
            script = generated_text.replace(prompt, "").strip()
            return {
                'script': script,
                'topic': topic,
                'duration_minutes': duration_minutes,
                'word_count': len(script.split()),
                # Rough estimate: ~0.5 seconds of speech per word.
                'estimated_duration': len(script.split()) * 0.5,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'script': '',
                'topic': topic,
                'duration_minutes': duration_minutes,
                'word_count': 0,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def process_podcast_episode(self, audio_path: str, target_voice: str = "neutral") -> Dict:
        """Run the full pipeline on one episode.

        Transcribes the audio, then derives a summary and tags, then
        re-synthesizes the transcript in the target voice.

        Returns:
            dict with 'success' plus either the pipeline outputs or an
            'error' message.
        """
        try:
            transcription = self.transcribe_audio(audio_path)
            if transcription.get('error'):
                return {
                    'success': False,
                    'error': transcription['error'],
                    'timestamp': datetime.now().isoformat()
                }
            summary = self._generate_summary(transcription['text'])
            tags = self._generate_tags(transcription['text'])
            voice_conversion = self.synthesize_speech(transcription['text'], target_voice)
            return {
                'success': True,
                'transcription': transcription,
                'summary': summary,
                'tags': tags,
                'voice_conversion': voice_conversion,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def _detect_language(self, text: str) -> str:
        """Naive language detection: 'es' vs 'en'.

        Counts distinct common Spanish function words in the text; more
        than 5 distinct hits is classified as Spanish.
        """
        # Match whole words, not substrings: the original `word in text`
        # check made one-letter entries like 'e', 'a' or 'y' match almost
        # any string, skewing every text towards Spanish.
        words = set(text.lower().split())
        spanish_count = len(words & self._SPANISH_WORDS)
        return 'es' if spanish_count > 5 else 'en'

    def _generate_summary(self, text: str) -> Dict[str, str]:
        """Build a crude extractive summary (first three sentences)."""
        try:
            sentences = text.split('.')
            summary = '. '.join(sentences[:3]) + '.'
            return {
                'summary': summary,
                'word_count': len(summary.split()),
                'original_word_count': len(text.split())
            }
        except Exception:
            # Fallback: plain truncation. Report the real word count of
            # the fallback summary instead of a hard-coded 0.
            fallback = text[:200] + '...'
            return {
                'summary': fallback,
                'word_count': len(fallback.split()),
                'original_word_count': len(text.split())
            }

    def _generate_tags(self, text: str) -> List[str]:
        """Derive coarse topic tags from keyword substring matches."""
        keyword_map = [
            ('tecnología', ['tecnología', 'tecnico', 'digital', 'software', 'hardware']),
            ('salud', ['salud', 'médico', 'medicina', 'clínica', 'doctor']),
            ('negocios', ['negocio', 'empresa', 'marketing', 'ventas']),
            ('educación', ['educación', 'aprender', 'estudio', 'universidad']),
            ('entretenimiento', ['entretenimiento', 'música', 'cine', 'arte']),
        ]
        text_lower = text.lower()
        tags = [tag for tag, keywords in keyword_map
                if any(word in text_lower for word in keywords)]
        return tags if tags else ['general']

    def save_model(self, path: str):
        """Save the configuration and any loaded sub-models under `path`.

        Returns:
            bool: True on success, False on any failure.
        """
        try:
            model_path = Path(path)
            model_path.mkdir(parents=True, exist_ok=True)
            # A PretrainedConfig is not directly JSON-serializable; use
            # to_dict() when available (plain dicts pass through dict()).
            config_dict = (self.config.to_dict()
                           if hasattr(self.config, 'to_dict')
                           else dict(self.config))
            with open(model_path / "config.json", "w") as f:
                json.dump(config_dict, f, indent=2)
            # Save sub-models only if they were loaded.
            if self.whisper_model:
                self.whisper_model.save_pretrained(model_path / "whisper")
                self.whisper_processor.save_pretrained(model_path / "whisper")
            if self.tts_model:
                self.tts_model.save_pretrained(model_path / "tts")
                self.tts_processor.save_pretrained(model_path / "tts")
            if self.text_generator:
                self.text_generator.save_pretrained(model_path / "text_generator")
                self.text_tokenizer.save_pretrained(model_path / "text_generator")
            print(f"✅ Modelo guardado en: {model_path}")
            return True
        except Exception as e:
            print(f"❌ Error guardando modelo: {e}")
            return False

    def load_model(self, path: str):
        """Load the configuration and sub-models previously saved by
        save_model().

        Returns:
            bool: True on success, False on any failure.
        """
        try:
            model_path = Path(path)
            with open(model_path / "config.json", "r") as f:
                # Re-wrap as EpicuroConfig so the rest of the class can
                # keep relying on attribute access.
                self.config = EpicuroConfig(**json.load(f))
            if (model_path / "whisper").exists():
                self.whisper_model = Wav2Vec2ForCTC.from_pretrained(model_path / "whisper")
                self.whisper_processor = Wav2Vec2Processor.from_pretrained(model_path / "whisper")
            if (model_path / "tts").exists():
                self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(model_path / "tts")
                self.tts_processor = SpeechT5Processor.from_pretrained(model_path / "tts")
                # The vocoder is not saved locally; fetch it from the hub.
                self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            if (model_path / "text_generator").exists():
                self.text_generator = AutoModelForCausalLM.from_pretrained(model_path / "text_generator")
                self.text_tokenizer = AutoTokenizer.from_pretrained(model_path / "text_generator")
            print(f"✅ Modelo cargado desde: {model_path}")
            return True
        except Exception as e:
            print(f"❌ Error cargando modelo: {e}")
            return False
# Default model metadata / configuration used by the __main__ demo below.
CONFIG = dict(
    model_name='modelo_epicuro',
    version='1.0.0',
    description='Modelo de IA para Podcast Epicuro - Transcripción, Síntesis de Voz y Generación de Contenido',
    author='DRDELATV2025',
    sample_rate=22050,
    max_length=512,
    supported_languages=['es', 'en'],
    voice_styles=['neutral', 'happy', 'sad', 'angry', 'fearful'],
    created_at=datetime.now().isoformat(),
)
if __name__ == "__main__":
    # Build the model and load every sub-model before the demo run.
    epicuro = EpicuroModel(CONFIG)
    if not epicuro.load_models():
        print("❌ Error inicializando el modelo")
    else:
        print("🎉 Modelo Epicuro listo para usar!")
        # Quick content-generation example.
        print("\n📝 Ejemplo de generación de contenido:")
        content = epicuro.generate_podcast_content("Inteligencia Artificial en Medicina", 5)
        print(f"Tema: {content['topic']}")
        print(f"Guion: {content['script'][:200]}...")