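# Scraped page header (not part of the program):
#   CLMARRARA's picture
#   Commit eb54254: "Adicionado POC do serviço de audio (Adicionado tradutor e emotion-english)"
#   (English: "Added audio-service POC (added translator and emotion-english)")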
import base64
import tempfile
import os
from app.config import WHISPER_MODEL, WHISPER_MODEL_PATH
os.environ["XDG_CACHE_HOME"] = str(WHISPER_MODEL_PATH) # "./models"
from app.logger import log
from app.sentiment_model import analyze_sentiment_portgues
from app.translator import translate_long_text
from app.emotion_model import analyze_emotion
import whisper
from moviepy.editor import VideoFileClip
# Load the Whisper model once at import time; process_audio reuses this
# module-level instance for every transcription.
whisper_model = whisper.load_model(WHISPER_MODEL)
# -------------------------
# 🔥 UTIL: save base64
# -------------------------
def save_base64_to_file(base64_data, suffix):
    """Decode a base64 payload and store it in a new temporary file.

    The file is created with ``delete=False``, so it outlives this call and
    the caller is responsible for removing it.

    Args:
        base64_data: Base64-encoded content (ASCII ``str`` or bytes-like).
        suffix: File-name suffix for the temporary file, e.g. ``".wav"``.

    Returns:
        Path of the temporary file containing the decoded bytes.

    Raises:
        ValueError: If ``base64_data`` cannot be decoded as base64.
    """
    try:
        file_bytes = base64.b64decode(base64_data)
    except (TypeError, ValueError) as exc:
        # binascii.Error is a ValueError subclass; TypeError covers inputs
        # that are neither str nor bytes-like. Chain the cause for debugging.
        raise ValueError("Base64 inválido") from exc

    # Write through the handle tempfile already opened and close it
    # deterministically, instead of reopening the path and leaving the
    # first handle to the garbage collector.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(file_bytes)
    return temp_file.name
# -------------------------
# 🔥 UTIL: extract audio
# -------------------------
def extract_audio_from_video(video_path):
    """Write the audio track of a video to a new temporary ``.wav`` file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the temporary ``.wav`` file. It is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Only the path is needed here: reserve the file name and close the
    # handle right away so write_audiofile is the sole writer.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        audio_path = temp_audio.name

    try:
        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                # Fail with a clear message instead of an AttributeError
                # on ``None.write_audiofile``.
                raise ValueError("Vídeo não possui faixa de áudio")
            video.audio.write_audiofile(audio_path)
        finally:
            # Always release the clip, even when the export fails.
            video.close()
    except BaseException:
        # On failure the caller never receives the path, so the temporary
        # file must be removed here or it would be orphaned.
        try:
            os.remove(audio_path)
        except OSError:
            # Best effort: do not mask the original error.
            pass
        raise

    return audio_path
# -------------------------
# 🔥 MAIN PROCESSING
# -------------------------
def process_audio(video_base64=None, audio_base64=None, logger=None):
    """Transcribe base64 audio/video, translate the text and analyze emotion.

    When both inputs are given, ``audio_base64`` takes precedence and the
    video is ignored.

    Args:
        video_base64: Base64-encoded video; its audio track is extracted
            before transcription.
        audio_base64: Base64-encoded audio.
        logger: Passed through to ``log`` for every progress message.

    Returns:
        A dict with the keys ``"transcription"`` (Whisper text),
        ``"translation"`` (output of ``translate_long_text``) and
        ``"analysis"`` (``emotion``, ``confidence``, ``emotion_raw`` and
        ``all_emotions`` taken from ``analyze_emotion``).

    Raises:
        ValueError: If neither input is provided, or a payload is not valid
            base64.
    """
    video_path = None
    audio_path = None

    try:
        # -------------------------
        # 🎯 1. AUDIO SOURCE
        # -------------------------
        if audio_base64:
            log("Recebido áudio base64", logger=logger)
            audio_path = save_base64_to_file(audio_base64, ".wav")
        elif video_base64:
            log("Recebido vídeo base64", logger=logger)
            video_path = save_base64_to_file(video_base64, ".mp4")
            log("Extraindo áudio do vídeo...", logger=logger)
            audio_path = extract_audio_from_video(video_path)
        else:
            raise ValueError("Nenhum áudio ou vídeo fornecido")

        # -------------------------
        # 🎯 2. WHISPER
        # -------------------------
        log("Transcrevendo com Whisper...", logger=logger)
        result = whisper_model.transcribe(
            audio_path,
            language="pt",
            verbose=False,
            fp16=False,  # important when running on CPU
        )
        # Segment texts may carry their own leading/trailing whitespace;
        # strip each one so the joined transcription has single spaces.
        text = " ".join(seg["text"].strip() for seg in result["segments"])

        # NOTE: an earlier variant ran Portuguese sentiment analysis
        # (analyze_sentiment_portgues) directly on the transcription; it was
        # replaced by the translation + emotion pipeline below.

        # -------------------------
        # 🎯 3. TRANSLATION + EMOTION
        # -------------------------
        # Use the same logging channel as the earlier steps, not print().
        log("Traduzindo texto para inglês...", logger=logger)
        text_en = translate_long_text(text)
        log("Analisando emoção...", logger=logger)
        emotion_result = analyze_emotion(text_en)

        return {
            "transcription": text,
            "translation": text_en,
            "analysis": {
                "emotion": emotion_result["emotion"],
                "confidence": emotion_result["confidence"],
                "emotion_raw": emotion_result["emotion_raw"],
                "all_emotions": emotion_result["all_emotions"],
            },
        }
    finally:
        # -------------------------
        # 🧹 CLEANUP (VERY IMPORTANT)
        # -------------------------
        # Temporary files are created with delete=False, so they must be
        # removed here. Cleanup is best effort: a failed removal is logged
        # rather than masking the real result or exception.
        for temp_path in (video_path, audio_path):
            if temp_path and os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError as exc:
                    log(
                        f"Falha ao remover arquivo temporário {temp_path}: {exc}",
                        logger=logger,
                    )