banao-tech committed on
Commit
4fabefa
·
verified ·
1 Parent(s): 04f6d97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -26
app.py CHANGED
@@ -3,54 +3,131 @@ import torch
3
  import subprocess
4
  import os
5
  from pathlib import Path
 
6
 
7
def setup():
    """Clone DiffSynth-Studio and install it editable (idempotent one-time setup).

    Raises:
        subprocess.CalledProcessError: if the clone or the install fails.
        Previously failures were silently ignored, leaving later inference
        runs to die with a confusing error far from the real cause.
    """
    if not Path("DiffSynth-Studio").exists():
        # List-form argv (shell=False) avoids shell parsing; check=True
        # surfaces failures instead of swallowing them — generate_video()
        # wraps setup() in try/except, so callers still get a readable message.
        subprocess.run(
            ["git", "clone", "https://github.com/modelscope/DiffSynth-Studio.git"],
            check=True,
        )
        subprocess.run(["pip", "install", "-e", "./DiffSynth-Studio"], check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
def generate_video(image, audio):
    """Generate a lip-synced video from a face image and an audio clip.

    Args:
        image: filesystem path to the face image (gr.Image(type="filepath")).
        audio: filesystem path to the audio file (gr.Audio(type="filepath")).

    Returns:
        (video_path_or_None, status_message) tuple for the Gradio outputs.
    """
    try:
        if not image or not audio:
            return None, "Please upload both image and audio!"

        setup()

        output_path = Path("output.mp4").resolve()
        # Remove any output left by a previous run: otherwise a failed
        # generation would find the stale file and report success.
        output_path.unlink(missing_ok=True)

        # Absolute paths instead of the old f"../{path}" juggling, which
        # broke for the absolute paths Gradio actually hands us; list-form
        # argv (shell=False) also prevents shell injection via file names.
        cmd = [
            "python", "examples/video_generation/musetalk.py",
            "--image_path", str(Path(image).resolve()),
            "--audio_path", str(Path(audio).resolve()),
            "--output_path", str(output_path),
        ]
        result = subprocess.run(
            cmd, cwd="DiffSynth-Studio", capture_output=True, text=True
        )

        if output_path.exists():
            return str(output_path), "✅ Video generated successfully!"
        return None, f"❌ Failed: {result.stderr}"

    except Exception as e:
        # Top-level boundary for the Gradio callback: report, don't crash.
        return None, f"❌ Error: {str(e)}"
39
 
40
  # Gradio Interface
41
- with gr.Blocks(title="AI Lip Sync") as app:
42
- gr.Markdown("# 🎀 AI Lip Sync Generator")
43
- gr.Markdown("Upload a face image and audio to generate lip-synced video")
 
 
 
 
 
 
44
 
45
  with gr.Row():
46
- with gr.Column():
47
- image_input = gr.Image(type="filepath", label="πŸ“· Face Image")
48
- audio_input = gr.Audio(type="filepath", label="🎡 Audio File")
49
- generate_btn = gr.Button("πŸš€ Generate Video", variant="primary")
50
-
51
- with gr.Column():
52
- video_output = gr.Video(label="πŸ“Ή Generated Video")
53
- status_output = gr.Textbox(label="Status", lines=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  generate_btn.click(
56
  fn=generate_video,
 
3
  import subprocess
4
  import os
5
  from pathlib import Path
6
+ import shutil
7
 
8
def setup_hallo():
    """Install Hallo2 (repo clone, Python deps, model weights) on first run.

    A marker file records a completed install so later calls return
    immediately. Returns True on success, False on failure — errors are
    printed rather than raised so the Gradio callback can report them.
    """
    if Path("hallo2_installed.txt").exists():
        return True

    try:
        print("Installing Hallo2...")

        # Skip the clone if a previous, partially-failed setup already
        # created the checkout — `git clone` errors out when the target
        # directory exists, which used to make every retry fail.
        if not Path("/tmp/hallo2").exists():
            subprocess.run(
                ["git", "clone",
                 "https://github.com/fudan-generative-vision/hallo2.git",
                 "/tmp/hallo2"],
                check=True,
            )

        # Python dependencies (list-form argv: no shell parsing surprises).
        subprocess.run(
            ["pip", "install", "-q", "diffusers[torch]", "transformers",
             "av", "insightface", "onnxruntime-gpu"],
            check=True,
        )

        # Pretrained weights (~2 GB download on the first run).
        subprocess.run(
            ["huggingface-cli", "download", "fudan-generative-ai/hallo2",
             "--local-dir", "/tmp/hallo2/pretrained_models"],
            check=True,
        )

        # Only mark as installed once every step above has succeeded.
        Path("hallo2_installed.txt").touch()
        print("✅ Hallo2 setup complete!")
        return True

    except Exception as e:
        # Best-effort boundary: the caller turns False into a UI message.
        print(f"Setup error: {e}")
        return False
32
 
33
def generate_video(image, audio):
    """Generate a lip-synced video with Hallo2.

    Args:
        image: path to the portrait image (gr.Image(type="filepath")).
        audio: path to the driving audio (gr.Audio(type="filepath")).

    Returns:
        (video_path_or_None, status_message) tuple for the Gradio outputs.
    """
    try:
        if not image or not audio:
            return None, "❌ Please upload both image and audio!"

        # Install on first use; setup_hallo() prints its own error details.
        if not setup_hallo():
            return None, "❌ Setup failed"

        # Prepare output location.
        output_dir = Path("/tmp/outputs")
        output_dir.mkdir(exist_ok=True)
        output_file = output_dir / "result.mp4"
        # Drop any previous run's result: a stale file here would make a
        # failed generation look successful in the check below.
        output_file.unlink(missing_ok=True)

        # Copy inputs to fixed locations, keeping their real extensions —
        # force-renaming e.g. a PNG to .jpg (as before) can mislead
        # extension-based format detection downstream.
        img_path = Path("/tmp/input_img").with_suffix(Path(image).suffix or ".jpg")
        aud_path = Path("/tmp/input_audio").with_suffix(Path(audio).suffix or ".wav")
        shutil.copy(image, img_path)
        shutil.copy(audio, aud_path)

        print("🎬 Generating video...")

        # List-form argv plus cwd= instead of a shell f-string: no quoting
        # pitfalls and no `cd ... &&` chaining.
        cmd = [
            "python", "scripts/inference.py",
            "--source_image", str(img_path),
            "--driving_audio", str(aud_path),
            "--output", str(output_file),
            "--pose_weight", "1.0",
            "--face_weight", "1.0",
            "--lip_weight", "1.0",
        ]
        result = subprocess.run(
            cmd, cwd="/tmp/hallo2", capture_output=True, text=True
        )

        if output_file.exists():
            return str(output_file), "✅ Video generated successfully!"
        # Truncate stderr so the status textbox stays readable.
        return None, f"❌ Generation failed. Error: {result.stderr[:200]}"

    except Exception as e:
        # Boundary for the Gradio callback: surface the error, don't crash.
        return None, f"❌ Error: {str(e)}"
76
 
77
  # Gradio Interface
78
+ with gr.Blocks(theme=gr.themes.Soft(), title="AI Lip Sync") as app:
79
+
80
+ gr.Markdown("""
81
+ # 🎀 AI Lip Sync Generator
82
+
83
+ Upload a portrait image and audio to create a realistic lip-synced video!
84
+
85
+ **⚑ Fast generation on T4 GPU (~30-60 seconds)**
86
+ """)
87
 
88
  with gr.Row():
89
+ with gr.Column(scale=1):
90
+ gr.Markdown("### πŸ“€ Upload Files")
91
+ image_input = gr.Image(
92
+ type="filepath",
93
+ label="Portrait Image (JPG/PNG)",
94
+ height=300
95
+ )
96
+ audio_input = gr.Audio(
97
+ type="filepath",
98
+ label="Audio File (WAV/MP3)"
99
+ )
100
+
101
+ generate_btn = gr.Button(
102
+ "πŸš€ Generate Lip-Synced Video",
103
+ variant="primary",
104
+ size="lg"
105
+ )
106
+
107
+ with gr.Column(scale=1):
108
+ gr.Markdown("### πŸ“Ή Output")
109
+ video_output = gr.Video(
110
+ label="Generated Video",
111
+ height=400
112
+ )
113
+ status_output = gr.Textbox(
114
+ label="Status",
115
+ lines=3,
116
+ interactive=False
117
+ )
118
+
119
+ gr.Markdown("""
120
+ ---
121
+ ### πŸ’‘ Tips:
122
+ - Use clear, front-facing portrait images
123
+ - Best resolution: 512x512 or higher
124
+ - Audio length: Up to 60 seconds recommended
125
+ - First generation will download models (~2GB)
126
+
127
+ ### ⏱️ Performance:
128
+ - First run: ~3-5 minutes (model download)
129
+ - Subsequent runs: ~30-60 seconds per video
130
+ """)
131
 
132
  generate_btn.click(
133
  fn=generate_video,