File size: 14,839 Bytes
bed7409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
import os
import sys
import streamlit as st
from src.gradio_demo import SadTalker
import tempfile
from PIL import Image

# Page-level configuration: browser tab title, wide layout, and a sidebar
# that starts expanded.
_PAGE_SETTINGS = {
    "page_title": "SadTalker - Talking Face Animation",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS styling
# The stylesheet is kept in a named constant so the injection call below stays
# short; it is rendered as raw HTML, hence unsafe_allow_html=True.
_CUSTOM_CSS = """

<style>

    .header {

        text-align: center;

        padding: 1.5rem 0;

        margin-bottom: 2rem;

        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);

        color: white;

        border-radius: 10px;

        box-shadow: 0 4px 8px rgba(0,0,0,0.1);

    }

    .header h1 {

        margin-bottom: 0.5rem;

        font-size: 2.5rem;

    }

    .header p {

        margin-bottom: 0;

        font-size: 1.1rem;

    }

    .tab-content {

        padding: 1.5rem;

        background: #f8f9fa;

        border-radius: 10px;

        margin-bottom: 1.5rem;

    }

    .stVideo {

        border-radius: 10px;

        box-shadow: 0 4px 8px rgba(0,0,0,0.1);

    }

    .stImage {

        border-radius: 10px;

        box-shadow: 0 4px 8px rgba(0,0,0,0.1);

    }

    .settings-section {

        background: #ffffff;

        padding: 1.5rem;

        border-radius: 10px;

        margin-bottom: 1.5rem;

        box-shadow: 0 2px 4px rgba(0,0,0,0.05);

    }

    .warning-box {

        background-color: #fff3cd;

        color: #856404;

        padding: 0.75rem 1.25rem;

        border-radius: 0.25rem;

        margin-bottom: 1rem;

        border: 1px solid #ffeeba;

    }

    .download-btn {

        display: flex;

        justify-content: center;

        margin-top: 1rem;

    }

</style>

"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Initialize SadTalker with caching
@st.cache_resource
def load_sadtalker():
    """Construct the SadTalker object used for video generation.

    The function is wrapped in ``st.cache_resource`` so the constructor is
    not re-run on every Streamlit script rerun.

    Returns:
        The ``SadTalker`` instance built from the ``checkpoints`` and
        ``src/config`` locations with ``lazy_load=True``.
    """
    checkpoint_dir = 'checkpoints'
    config_dir = 'src/config'
    talker = SadTalker(checkpoint_dir, config_dir, lazy_load=True)
    return talker


sad_talker = load_sadtalker()

# Check if running in webui
# The presence of an importable ``webui`` module is used as the signal that the
# app is embedded in a web UI host. ``Exception`` is caught instead of using a
# bare ``except`` so that KeyboardInterrupt/SystemExit still propagate, while
# any ordinary import-time failure keeps the original "not in webui" fallback.
try:
    import webui  # noqa: F401 -- imported only to test availability
except Exception:
    in_webui = False
else:
    in_webui = True

# Header section
# Renders the gradient banner with the project title, paper subtitle and the
# three external links. The Arxiv and GitHub link icons had been stored as
# mis-decoded byte sequences (mojibake); they are restored to the intended
# emoji so the banner renders cleanly.
st.markdown("""

<div class="header">

    <h1>😭 SadTalker</h1>

    <p>Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023)</p>

    <div style="display: flex; justify-content: center; gap: 1.5rem; margin-top: 0.5rem;">

        <a href="https://arxiv.org/abs/2211.12194" style="color: white; text-decoration: none; font-weight: 500;">📄 Arxiv</a>

        <a href="https://sadtalker.github.io" style="color: white; text-decoration: none; font-weight: 500;">🌐 Homepage</a>

        <a href="https://github.com/Winfredy/SadTalker" style="color: white; text-decoration: none; font-weight: 500;">💻 GitHub</a>

    </div>

</div>

""", unsafe_allow_html=True)

# Initialize session state
# Seed every key the rest of the script reads with None on the first run of a
# session; keys that already hold a value are left untouched.
_SESSION_DEFAULT_KEYS = (
    'generated_video',
    'tts_audio',
    'source_image',
    'driven_audio',
)
for _state_key in _SESSION_DEFAULT_KEYS:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None

# Text-to-speech helper, cached like the SadTalker model
@st.cache_resource
def load_tts_talker():
    """Create the text-to-speech helper once and reuse it on later reruns.

    Streamlit re-executes the whole script on every widget interaction, so
    constructing ``TTSTalker`` inline would rebuild it each time the
    "Text-to-speech" option is selected. Wrapping the construction in
    ``st.cache_resource`` avoids that repeated work.

    The import is kept inside the function so the TTS dependency is only
    loaded when text-to-speech is actually used.

    Returns:
        The ``TTSTalker`` instance.
    """
    from src.utils.text2speech import TTSTalker

    return TTSTalker()


# Main columns layout
col1, col2 = st.columns([1, 1], gap="large")

with col1:
    st.markdown("### Input Settings")

    # Source Image Upload
    with st.expander("🎨 Source Image", expanded=True):
        uploaded_image = st.file_uploader(
            "Upload a clear frontal face image",
            type=["jpg", "jpeg", "png"],
            key="source_image_upload"
        )

        if uploaded_image:
            # Remember the upload so it is still available on later reruns.
            st.session_state.source_image = uploaded_image
            image = Image.open(uploaded_image)
            st.image(image, caption="Source Image", use_container_width=True)
        elif st.session_state.source_image:
            image = Image.open(st.session_state.source_image)
            st.image(image, caption="Source Image (from session)", use_container_width=True)
        else:
            st.warning("Please upload a source image")

    # Audio Input
    with st.expander("🎡 Audio Input", expanded=True):
        input_method = st.radio(
            "Select input method:",
            ["Upload audio file", "Text-to-speech"],
            index=0,
            key="audio_input_method",
            horizontal=True
        )

        if input_method == "Upload audio file":
            audio_file = st.file_uploader(
                "Upload an audio file (WAV, MP3)",
                type=["wav", "mp3"],
                key="audio_file_upload"
            )

            if audio_file:
                st.session_state.driven_audio = audio_file
                st.audio(audio_file)
            elif st.session_state.driven_audio and isinstance(st.session_state.driven_audio, str):
                # A string here is a file path left over from text-to-speech.
                st.audio(st.session_state.driven_audio)
        else:
            # Text-to-speech is only offered off Windows and outside webui.
            if sys.platform != 'win32' and not in_webui:
                tts_talker = load_tts_talker()

                input_text = st.text_area(
                    "Enter text for speech synthesis:",
                    height=150,
                    placeholder="Type what you want the face to say...",
                    key="tts_input_text"
                )

                if st.button("Generate Speech", key="tts_generate_button"):
                    if input_text.strip():
                        with st.spinner("Generating audio from text..."):
                            try:
                                audio_path = tts_talker.test(input_text)
                                st.session_state.driven_audio = audio_path
                                st.session_state.tts_audio = audio_path
                                st.audio(audio_path)
                                st.success("Audio generated successfully!")
                            except Exception as e:
                                # UI boundary: report the failure instead of crashing the app.
                                st.error(f"Error generating audio: {str(e)}")
                    else:
                        st.warning("Please enter some text first")
            else:
                st.markdown("""

                <div class="warning-box">

                    ⚠️ Text-to-speech is not available on Windows or in webui mode. 

                    Please use audio upload instead.

                </div>

                """, unsafe_allow_html=True)

with col2:
    st.markdown("### Generation Settings")

    with st.container():
        # NOTE(review): the opening <div> here and the closing </div> further
        # down are emitted by separate st.markdown calls; the wrapper may not
        # actually enclose the widgets in between -- confirm visually.
        st.markdown("""

        <div class="settings-section">

            <h4>⚙️ Animation Parameters</h4>

        """, unsafe_allow_html=True)

        # First row of settings
        col_a, col_b = st.columns(2)

        with col_a:
            preprocess_type = st.radio(
                "Preprocessing Method",
                ['crop', 'resize', 'full', 'extcrop', 'extfull'],
                index=0,
                key="preprocess_type",
                help="How to handle the input image before processing"
            )

            size_of_image = st.radio(
                "Face Model Resolution",
                [256, 512],
                index=0,
                key="size_of_image",
                horizontal=True,
                help="Higher resolution (512) may produce better quality but requires more resources"
            )

        with col_b:
            is_still_mode = st.checkbox(
                "Still Mode",
                value=False,
                key="is_still_mode",
                help="Produces fewer head movements (works best with 'full' preprocessing)"
            )

            enhancer = st.checkbox(
                "Use GFPGAN Enhancer",
                value=False,
                key="enhancer",
                help="Improves face quality using GFPGAN (may slow down processing)"
            )

        # Second row of settings
        pose_style = st.slider(
            "Pose Style",
            min_value=0,
            max_value=46,
            value=0,
            step=1,
            key="pose_style",
            help="Different head poses and expressions"
        )

        batch_size = st.slider(
            "Batch Size",
            min_value=1,
            max_value=10,
            value=2,
            step=1,
            key="batch_size",
            help="Number of frames processed at once (higher may be faster but uses more memory)"
        )

        st.markdown("</div>", unsafe_allow_html=True)

    # Generate button
    if st.button(
        "✨ Generate Talking Face Animation",
        type="primary",
        use_container_width=True,
        key="generate_button"
    ):
        source_upload = st.session_state.source_image
        driven_audio = st.session_state.driven_audio

        if not source_upload:
            st.error("Please upload a source image first")
        elif input_method == "Upload audio file" and not driven_audio:
            st.error("Please upload an audio file first")
        elif input_method == "Text-to-speech" and not driven_audio:
            st.error("Please generate audio from text first")
        else:
            with st.spinner("Generating talking face animation. This may take a few minutes..."):
                # Temp files created by this run; always removed in `finally`.
                temp_paths = []
                try:
                    # Save the source image to a temp PNG. Writing through the
                    # open handle (instead of reopening by name) also works on
                    # Windows, where a NamedTemporaryFile cannot be opened a
                    # second time while it is still open.
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_image:
                        temp_paths.append(tmp_image.name)
                        Image.open(source_upload).save(tmp_image, format="PNG")
                    tmp_image_path = os.path.abspath(tmp_image.name)

                    # Decide by what is actually stored rather than by the
                    # radio selection: session state may hold a TTS file path
                    # (str) while "Upload audio file" is selected, or an
                    # uploaded file while "Text-to-speech" is selected.
                    if isinstance(driven_audio, str):
                        audio_path = os.path.abspath(driven_audio)
                    else:
                        # Keep the upload's own extension so an MP3 is not
                        # written to a file named *.wav.
                        upload_name = getattr(driven_audio, "name", "") or ""
                        audio_suffix = os.path.splitext(upload_name)[1].lower() or ".wav"
                        with tempfile.NamedTemporaryFile(delete=False, suffix=audio_suffix) as tmp_audio:
                            temp_paths.append(tmp_audio.name)
                            # getvalue() returns the full content regardless of
                            # the current read position of the upload buffer.
                            tmp_audio.write(driven_audio.getvalue())
                        audio_path = os.path.abspath(tmp_audio.name)

                    # Generate video
                    try:
                        # Widget values are coerced to plain built-in types
                        # before being handed to the model wrapper.
                        video_path = sad_talker.test(
                            source_image=tmp_image_path,
                            driven_audio=audio_path,
                            preprocess_type=str(preprocess_type),
                            is_still_mode=bool(is_still_mode),
                            enhancer=bool(enhancer),
                            batch_size=int(batch_size),
                            size_of_image=int(size_of_image),
                            pose_style=int(pose_style),
                        )

                        # Verify the output
                        if not video_path or not os.path.exists(video_path):
                            raise FileNotFoundError(f"Output video not created at {video_path}")
                    except Exception as e:
                        st.error(f"Generation failed: {str(e)}")
                        # Debug information
                        st.text("Parameters used:")
                        st.json({
                            "source_image": tmp_image_path,
                            "driven_audio": audio_path,
                            "preprocess_type": preprocess_type,
                            "is_still_mode": is_still_mode,
                            "enhancer": enhancer,
                            "batch_size": batch_size,
                            "size_of_image": size_of_image,
                            "pose_style": pose_style
                        })
                    else:
                        # Only publish the result (and report success) when
                        # generation actually produced a file.
                        st.session_state.generated_video = video_path
                        st.success("Generation complete! View your result below.")
                except Exception as e:
                    # Failures while preparing the inputs end up here.
                    st.error(f"An error occurred during generation: {str(e)}")
                    st.error("Please check your inputs and try again")
                finally:
                    # Clean up temp files. Best effort: a file may already be
                    # gone (e.g. moved or removed downstream) or still locked.
                    for temp_path in temp_paths:
                        try:
                            os.unlink(temp_path)
                        except OSError:
                            pass

# Results section
# Shown only once a video path has been stored in session state.
if st.session_state.generated_video:
    st.markdown("---")
    st.markdown("### Generated Animation")

    # Display video and download options
    col_video, col_download = st.columns([3, 1])

    with col_video:
        st.video(st.session_state.generated_video)

    with col_download:
        # Download button: the file is read fully into memory so Streamlit
        # can serve it as the download payload.
        with open(st.session_state.generated_video, "rb") as f:
            video_bytes = f.read()

        st.download_button(
            label="Download Video",
            data=video_bytes,
            file_name="sadtalker_animation.mp4",
            mime="video/mp4",
            use_container_width=True,
            key="download_button"
        )

        # Regenerate button
        # st.rerun() replaces the deprecated st.experimental_rerun(), which is
        # no longer available in current Streamlit releases.
        if st.button(
            "🔄 Regenerate with Same Settings",
            use_container_width=True,
            key="regenerate_button"
        ):
            st.rerun()

        # New generation button: clear everything kept in session state and
        # rerun the script from the top.
        if st.button(
            "🆕 Start New Generation",
            use_container_width=True,
            key="new_generation_button"
        ):
            st.session_state.generated_video = None
            st.session_state.tts_audio = None
            st.session_state.source_image = None
            st.session_state.driven_audio = None
            st.rerun()

# Footer
# Static credits block with links to the repository and the project page,
# rendered as raw HTML beneath a horizontal rule.
_FOOTER_HTML = """

<div style="text-align: center; color: #666; padding: 1.5rem 0; font-size: 0.9rem;">

    <p>SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation</p>

    <p>CVPR 2023 | <a href="https://github.com/Winfredy/SadTalker" target="_blank">GitHub Repository</a> | <a href="https://sadtalker.github.io" target="_blank">Project Page</a></p>

</div>

"""
st.markdown("---")
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)