# my_bot/src/avatar_streamlit.py
# Author: gadhalevy
# Last commit: "remove edge" (c5d81bd)
# Console banner written to stdout whenever this script is executed; the full
# application code follows below.
print("Avatar Streamlit App using SadTalker only (no D-ID)")
import os
import time
import json
import tempfile
import asyncio
import requests
import numpy as np
import matplotlib.pyplot as plt
from pydub import AudioSegment
import streamlit as st
from dotenv import load_dotenv
from TTS.api import TTS
# ========== Environment & settings ==========
# Read a local .env file (when present) before any variable lookup below.
load_dotenv()
# DID_API_KEY = os.getenv("DID_API_KEY")
# Name of the Ollama model to query; "llama3" is used when OLLAMA_MODEL is unset.
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3")
# Generated videos go to "<current working directory>/results"; the folder is
# created up front so later writes cannot fail on a missing directory.
RESULTS_DIR = os.path.join(os.getcwd(), "results")
os.makedirs(RESULTS_DIR, exist_ok=True)
# ========== Helpers ==========
# --- Ollama chat ---
def ask_ollama(question: str, timeout: float = 120.0) -> str:
    """Send *question* to the local Ollama server and return its answer.

    The prompt is posted to ``http://localhost:11434/api/generate`` with the
    model named by ``OLLAMA_MODEL``. The response body is parsed line by line
    as JSON and the ``"response"`` field of every non-blank line is
    concatenated.

    Args:
        question: Prompt text, forwarded to the model unchanged.
        timeout: Seconds ``requests`` may wait for the server to connect or
            to send data. Without it a hung server would block the Streamlit
            script forever.

    Returns:
        The stripped answer text. On any failure the error is shown with
        ``st.error`` and a fixed apology message is returned instead, so this
        function never raises into the UI code.
    """
    try:
        res = requests.post(
            "http://localhost:11434/api/generate",
            json={"model": OLLAMA_MODEL, "prompt": question},
            timeout=timeout,
        )
        res.raise_for_status()
        # One JSON object per non-blank line; each may carry a piece of the answer.
        chunks = [json.loads(line) for line in res.text.splitlines() if line.strip()]
        return "".join(chunk.get("response", "") for chunk in chunks).strip()
    except Exception as e:
        # UI boundary: report the problem and fall back to a canned reply.
        st.error(f"Ollama error: {e}")
        return "ืžืฆื˜ืขืจ, ืœื ื”ืฆืœื—ืชื™ ืœื”ืชื—ื‘ืจ ืœืžื•ื“ืœ ื”ืžืงื•ืžื™."
# --- Text-to-Speech: legacy Edge-TTS version (commented out, kept for reference) ---
# import edge_tts
# async def text_to_speech_edge(text, voice="en-US-GuyNeural"):
# """Convert text to speech and return mp3 path."""
# fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
# os.close(fd)
# communicate = edge_tts.Communicate(text, voice)
# await communicate.save(mp3_path)
# return mp3_path
@st.cache_resource(show_spinner=False)
def _load_tts_model(model_name):
    """Load the Coqui TTS model once and reuse it across Streamlit reruns.

    Constructing ``TTS`` is slow, so the instance is kept by
    ``st.cache_resource`` instead of being rebuilt on every synthesis call.
    The GPU is used only when CUDA is actually available, so the app also
    starts on CPU-only machines.
    """
    import torch  # local import: only needed here to probe for CUDA

    return TTS(model_name, gpu=torch.cuda.is_available())


def synthesize_speech(text, lang="he"):
    """Synthesize *text* to a WAV file with Coqui XTTS v2 and return its path.

    Args:
        text: The text to speak.
        lang: Language code passed to ``tts_to_file`` as ``language``.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file; it is not deleted automatically.

    Raises:
        Whatever ``tts_to_file`` raises; the temporary file is removed first
        so a failed synthesis leaves nothing behind.
    """
    print("๐Ÿ—ฃ๏ธ Generating speech with Coqui TTS...")
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    tts = _load_tts_model(model_name)
    # mkstemp returns an open descriptor; close it so the TTS engine can write
    # to the path itself.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # NOTE(review): XTTS v2 normally needs a `speaker_wav`/`speaker`
        # argument, and "he" may not be among its supported languages -
        # confirm this call succeeds as written.
        tts.tts_to_file(text=text, file_path=wav_path, language=lang)
    except Exception:
        os.remove(wav_path)
        raise
    print("โœ… Saved audio to:", wav_path)
    return wav_path
# --- SadTalker availability flag ---
# Hard-coded switch (nothing is detected at runtime): set it to False when
# SadTalker is not installed.
SADTALKER_AVAILABLE = True

# ========== Streamlit UI ==========
# NOTE(review): the non-ASCII UI strings in this file look mis-encoded -
# confirm the file is saved and read as UTF-8.
st.set_page_config(
    page_title="Avatar Lip Sync",
    page_icon="๐ŸŽฌ",
    layout="centered",
)
st.title("๐Ÿง  Avatar Chatbot & Lip Sync Studio")
# --- Image Upload ---
# Saves the uploaded avatar picture into the system temp directory and exposes
# its location as `avatar_image_path` (None while nothing is uploaded).
st.subheader("๐Ÿง‘ ื”ืขืœืืช ืชืžื•ื ืช ืื•ื•ื˜ืจ")
avatar_image = st.file_uploader("ื‘ื—ืจ ืชืžื•ื ื” (JPG/PNG):", type=["jpg", "jpeg", "png"])
avatar_image_path = None
if avatar_image is not None:
    # The file name comes from the client: keep only its last path component
    # so a name such as "../../x.png" cannot escape the temp directory.
    safe_name = os.path.basename(avatar_image.name) or "avatar_upload"
    avatar_image_path = os.path.join(tempfile.gettempdir(), safe_name)
    with open(avatar_image_path, "wb") as f:
        # getvalue() returns the whole upload regardless of the current stream
        # position, unlike read(), which yields nothing once consumed.
        f.write(avatar_image.getvalue())
    st.image(avatar_image_path, caption="Avatar Image", width=250)
# --- User input & mode ---
# Free-text box plus a two-way switch: speak the typed text as-is (first
# option) or send it to the Ollama model first (second option).
st.subheader("๐Ÿ’ฌ ื›ืชื•ื‘ ืฉืืœื” ืื• ื˜ืงืกื˜ ืœืฉื™ื—ื”")
user_input = st.text_area("ื”ืงืœื“ ื˜ืงืกื˜:", value="")
talk_mode = st.radio(
    "ื‘ื—ืจ ืžืฆื‘:",
    ["ืฉื™ื—ื” (ื˜ืงืกื˜ ื—ื•ืคืฉื™)", "ืฆ'ืื˜ ืขื ืžื•ื“ืœ Ollama"],
)

# --- Voice selection ---
# Each entry reads "<voice code> (<gender>)"; only the code before the first
# space is kept in VOICE_CODE.
# NOTE(review): VOICE_CODE is not used by any active code visible in this file.
selected_voice = st.selectbox(
    "ื‘ื—ืจ ืงื•ืœ ืœื“ื™ื‘ื•ืจ:",
    [
        "en-US-GuyNeural (Male)",
        "en-US-JennyNeural (Female)",
        "he-IL-AsafNeural (Male)",
        "he-IL-NoaNeural (Female)",
    ],
)
VOICE_CODE = selected_voice.split(" ", 1)[0]
# --- Audio Upload Section ---
# Optional MP3 upload: the file is copied to a temp path (`mp3_path`), played
# back, and previewed with its duration and a coarse waveform.
st.subheader("๐ŸŽต ื”ืขืœืืช ืงื•ื‘ืฅ MP3 ืœืกื™ื ื›ืจื•ืŸ ืฉืคืชื™ื™ื")
uploaded_audio = st.file_uploader("ื”ืขืœื” ืงื•ื‘ืฅ MP3:", type=["mp3"])
if uploaded_audio is not None:
    # delete=False: the path must outlive this `with` block because it is used
    # later for playback and video generation.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_audio:
        # getvalue() returns the full upload regardless of stream position.
        tmp_audio.write(uploaded_audio.getvalue())
        mp3_path = tmp_audio.name
    # Decode only after the temp file is closed, so all bytes are on disk.
    audio = AudioSegment.from_file(mp3_path)
    duration_seconds = len(audio) / 1000  # len() of the segment is divided by 1000 to get seconds
    st.audio(mp3_path)
    st.info(f"โฑ ืื•ืจืš ื”ืงื•ื‘ืฅ: {duration_seconds:.1f} ืฉื ื™ื•ืช")
    # Waveform preview (best effort: a failure here only produces a warning).
    try:
        samples = np.array(audio.get_array_of_samples())
        fig, ax = plt.subplots(figsize=(6, 1.5))
        try:
            ax.plot(samples[::200])  # every 200th sample keeps the plot light
            ax.set_title("Waveform Preview")
            ax.axis("off")
            # Pass the figure itself instead of the global pyplot module.
            st.pyplot(fig)
        finally:
            # Close the figure so repeated script runs do not accumulate
            # open matplotlib figures.
            plt.close(fig)
    except Exception as e:
        st.warning(f"Waveform preview unavailable: {e}")
# --- Generate Button ---
# On click: validate the inputs, obtain the audio track (uploaded MP3, or typed
# text -> optional Ollama answer -> speech synthesis), render the lip-synced
# video and offer it for download.
if st.button("ืฉืœื—"):
    if not avatar_image_path:
        st.warning("ืื ื ื”ืขืœื” ืชืžื•ื ืช ืื•ื•ื˜ืจ ืชื—ื™ืœื”.")
    elif not SADTALKER_AVAILABLE:
        st.error("SadTalker ืœื ืžื•ืชืงืŸ. ืื ื ื”ืชืงืŸ ืœืคื™ ื”ื”ื•ืจืื•ืช.")
    else:
        if uploaded_audio is not None:
            # mp3_path was already set when the upload was saved to disk.
            st.success("โœ… ืื•ื“ื™ื• ื ื˜ืขืŸ ื‘ื”ืฆืœื—ื”! ืžืฉืชืžืฉ ื‘ืงื•ื‘ืฅ ืฉื”ืขืœื™ืช.")
        else:
            # Whitespace-only text is as unusable as empty text: reject it
            # before it reaches the model or the TTS engine.
            if not user_input.strip():
                st.warning("ืื ื ื”ืงืœื“ ืฉืืœื” ืื• ื”ืขืœื” ืงื•ื‘ืฅ ืื•ื“ื™ื•.")
                st.stop()
            with st.spinner("๐Ÿค– ืžื—ื›ื” ืœืชื’ื•ื‘ื” ืžื”ืžื•ื— ื”ืžืงื•ืžื™..."):
                # First radio option: speak the typed text as-is; otherwise
                # ask the Ollama model and speak its answer.
                if talk_mode.startswith("ืฉื™ื—ื”"):
                    response = user_input
                else:
                    response = ask_ollama(user_input)
            st.success(response)
            with st.spinner("๐ŸŽ™๏ธ ื™ื•ืฆืจ ืงื•ืœ..."):
                # NOTE(review): the language is fixed to "he"; the voice chosen
                # in the selectbox (VOICE_CODE) is not used here.
                mp3_path = synthesize_speech(response, lang="he")
            st.audio(mp3_path)

        # --- Generate video ---
        # The output name carries a second-resolution timestamp.
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_video = os.path.join(RESULTS_DIR, f"avatar_{timestamp}.mp4")
        with st.spinner("๐ŸŽฌ ื™ื•ืฆืจ ื•ื™ื“ืื• ืขื ืกื™ื ื›ืจื•ืŸ ืฉืคืชื™ื™ื..."):
            # NOTE(review): create_sadtalker_video is neither defined nor
            # imported in the visible part of this file - confirm it exists,
            # otherwise this line raises NameError.
            video_path = create_sadtalker_video(avatar_image_path, mp3_path, output_video)

        if video_path and os.path.exists(video_path):
            st.success(f"โœ… ื”ืกืจื˜ื•ืŸ ื ืฉืžืจ ื‘ืชื™ืงื™ื™ืช results!\n๐Ÿ“ {video_path}")
            st.video(video_path)
            with open(video_path, "rb") as vid_file:
                st.download_button(
                    label="โฌ‡๏ธ ื”ื•ืจื“ ืืช ื”ืกืจื˜ื•ืŸ",
                    data=vid_file,
                    file_name=os.path.basename(video_path),
                    mime="video/mp4",
                )
        else:
            st.error("โŒ ืœื ื”ืฆืœื—ืชื™ ืœื™ืฆื•ืจ ืืช ื”ืกืจื˜ื•ืŸ. ื‘ื“ื•ืง ืืช SadTalker.")