Spaces:

Cun-Duck
/

test

Runtime error

App Files Files Community

Cun-Duck committed on Jan 6, 2025

Commit

b2077e8

verified ·

1 Parent(s): a5758ff

Create app.py

Browse files

Files changed (1) hide show

app.py +323 -0

app.py ADDED Viewed

	@@ -0,0 +1,323 @@

+import os
+import subprocess
+import sys
+# 1. Instalasi Dependencies (Pastikan ini dijalankan hanya jika diperlukan)
+# Cek apakah dependencies sudah terinstall
+def check_dependencies():
+    try:
+        import torch
+        import transformers
+        import datasets
+        import librosa
+        import numpy
+        import scipy
+        import ffmpeg
+        import gradio
+        import huggingface_hub
+        return True
+    except ImportError:
+        return False
+if not check_dependencies():
+    # Install pytorch (CPU version)
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==1.12.1+cpu", "torchvision==0.13.1+cpu", "torchaudio==0.12.1", "--extra-index-url", "https://download.pytorch.org/whl/cpu"])
+    # Install other dependencies
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.24.0", "datasets==2.7.1", "librosa==0.9.2", "numpy==1.23.4", "scipy==1.9.3", "ffmpeg-python==0.2.0", "gradio==3.10.1", "huggingface_hub==0.11.0"])
+    # Install non-pip dependencies
+    os.system("apt-get update && apt-get install -y ffmpeg")
+# 2. Impor Libraries
+import torch
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from datasets import load_dataset, Audio
+import librosa
+import numpy as np
+from scipy.io import wavfile
+import ffmpeg
+import gradio as gr
+from huggingface_hub import HfApi, HfFolder
+# 3. Konfigurasi Hugging Face Hub
+# Dapatkan token dari environment variable (lebih aman)
+HF_TOKEN = os.environ.get("HF_TOKEN") # Gunakan secrets HF_TOKEN pada pengaturan HF Spaces
+# Atau, jika Anda ingin hardcode token (tidak disarankan untuk production)
+# HF_TOKEN = "YOUR_HUGGINGFACE_TOKEN"
+# Konfigurasi repository
+repo_id = "your_username/your_model_repo"  # Ganti dengan username dan nama repo Anda
+model_filename = "lipsync_model.pth"
+# Inisialisasi HfApi
+api = HfApi()
+# Login ke Hugging Face Hub (jika belum)
+if HF_TOKEN:
+    api.set_access_token(HF_TOKEN)
+    # Atau bisa juga menggunakan:
+    # HfFolder.save_token(HF_TOKEN);
+    # api.set_access_token(HfFolder.get_token())
+else:
+    print("HF_TOKEN not found. Model will not be uploaded.")
+# 4. Model and function definitions
+# ASR model: a pretrained Wav2Vec2 checkpoint and its matching processor.
+# Both from_pretrained calls execute at import time. Elsewhere in this file only
+# asr_processor is read (for its feature-extractor sampling rate).
+asr_model_name = "facebook/wav2vec2-base-960h"
+asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
+asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
+# Placeholder untuk model lipsync (Model yang lebih ringan dan efisien)
+class LipSyncModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Arsitektur yang lebih sederhana:
+        self.fc1 = torch.nn.Linear(512, 256) # Reduced input features
+        self.relu = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(256, 128 * 3 * 32 * 32) # Reduced output size
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        x = x.view(-1, 3, 32, 32) # Reduced frame size: 32x32
+        return x
+lipsync_model = LipSyncModel()
+optimizer = torch.optim.Adam(lipsync_model.parameters(), lr=5e-5)
+criterion = torch.nn.MSELoss()
+# Fungsi untuk mengekstrak fitur audio (sama seperti sebelumnya)
+def extract_audio_features(audio_file):
+    audio, sr = librosa.load(audio_file, sr=asr_processor.feature_extractor.sampling_rate, mono=True) # Ensure mono audio
+    inputs = asr_processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+    with torch.no_grad():
+      # Get hidden states from a specific layer (before the output layer)
+      # Note: Wav2Vec2 might not provide hidden features directly.
+      # You may need to modify the model to obtain the desired features.
+      # Alternatively, use MFCCs:
+      mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=16, hop_length=512)
+      mfccs = torch.tensor(mfccs.T).float()[:512, :]  # Limit feature size, adjust as needed
+      return mfccs
+# Fungsi untuk memproses video dan audio (sama seperti sebelumnya)
+def process_video(video_file, audio_file):
+    # 1. Ekstrak audio dari video (jika video memiliki audio)
+    if audio_file is None:
+      try:
+          audio_file = "temp_audio.wav"
+          (
+              ffmpeg.input(video_file)
+              .output(audio_file, acodec="pcm_s16le", ar="16000", ac=1)  # Convert to mono
+              .run(overwrite_output=True, quiet=True)
+          )
+      except ffmpeg.Error as e:
+          print(f"Error extracting audio from {video_file}: {e.stderr.decode()}")
+          return None, None
+    # 2. Ekstrak frame dari video
+    probe = ffmpeg.probe(video_file)
+    video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
+    width = int(video_info['width'])
+    height = int(video_info['height'])
+    num_frames = int(video_info['nb_frames'])
+    fps = eval(video_info['r_frame_rate'])
+    frames, _, _ = (
+        ffmpeg.input(video_file)
+        .output("pipe:", format="rawvideo", pix_fmt="rgb24", s="32x32") # Downsample to 32x32
+        .run(capture_stdout=True, quiet=True)
+    )
+    frames = np.frombuffer(frames, np.uint8).reshape([-1, 32, 32, 3])
+    frames = torch.tensor(frames).permute(0, 3, 1, 2).float() / 255.0
+    # 3. Ekstrak fitur audio
+    audio_features = extract_audio_features(audio_file)
+    return frames, audio_features, fps
+# Fungsi untuk melatih model lipsync
+def train_lipsync_model(video_file, audio_file, epochs=5):
+    frames, audio_features, fps = process_video(video_file, audio_file)
+    if frames is None or audio_features is None:
+      print("Skipping training due to error in video or audio processing.")
+      return
+    for epoch in range(epochs):
+        optimizer.zero_grad()
+        # Sesuaikan ukuran audio features
+        num_frames = frames.shape[0]
+        # Reduce the number of frames to match the audio features, if necessary
+        if num_frames > audio_features.shape[0]:
+          frames = frames[:audio_features.shape[0]]
+          num_frames = audio_features.shape[0]
+        # Pad audio features if they are shorter than the number of frames
+        if audio_features.shape[0] < num_frames:
+          padding_size = num_frames - audio_features.shape[0]
+          padding = audio_features[-1,:].repeat(padding_size, 1)
+          audio_features_padded = torch.cat((audio_features, padding), dim=0)
+        else:
+          audio_features_padded = audio_features
+        # Generate video frame
+        generated_frames = lipsync_model(audio_features_padded)
+        # Hitung loss
+        loss = criterion(generated_frames, frames)
+        # Backpropagation dan optimasi
+        loss.backward()
+        optimizer.step()
+        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
+    # Simpan dan upload model setelah pelatihan
+    if HF_TOKEN:
+        save_and_upload_model()
+# Fungsi untuk inference (sama seperti sebelumnya)
+def lipsync_inference(video_file, audio_file, output_file="output.mp4"):
+    frames, audio_features, fps = process_video(video_file, audio_file)
+    if frames is None or audio_features is None:
+      print("Error during video or audio processing.")
+      return None
+    with torch.no_grad():
+      num_frames = frames.shape[0]
+      # Reduce the number of frames to match the audio features, if necessary
+      if num_frames > audio_features.shape[0]:
+        frames = frames[:audio_features.shape[0]]
+        num_frames = audio_features.shape[0]
+      # Pad audio features if they are shorter than the number of frames
+      if audio_features.shape[0] < num_frames:
+        padding_size = num_frames - audio_features.shape[0]
+        padding = audio_features[-1,:].repeat(padding_size, 1)
+        audio_features_padded = torch.cat((audio_features, padding), dim=0)
+      else:
+        audio_features_padded = audio_features
+      generated_frames = lipsync_model(audio_features_padded)
+    # Convert tensor to numpy array
+    generated_frames = (generated_frames * 255).byte().permute(0, 2, 3, 1).cpu().numpy()
+    # Simpan video hasil inference
+    temp_video = "temp_output.mp4"
+    (
+        ffmpeg.input(
+            "pipe:",
+            format="rawvideo",
+            pix_fmt="rgb24",
+            s=f"{generated_frames.shape[2]}x{generated_frames.shape[1]}",
+            r=fps,
+        )
+        .output(temp_video, pix_fmt="yuv420p", vcodec="libx264", crf=28)
+        .overwrite_output()
+        .run(input=generated_frames.tobytes(), quiet=True)
+    )
+    # Gabungkan audio baru dengan video
+    (
+        ffmpeg.input(temp_video)
+        .input(audio_file)
+        .output(output_file, c="copy", map="0:v:0", map="1:a:0")
+        .overwrite_output()
+        .run(quiet=True)
+    )
+    os.remove(temp_video)
+    print(f"Video hasil lipsync disimpan di: {output_file}")
+    return output_file
+# 5. Fungsi untuk menyimpan dan mengupload model
+def save_and_upload_model():
+    # Create repo if it doesn't exist
+    try:
+        api.create_repo(repo_id=repo_id, token=HF_TOKEN, private=True, exist_ok=True) # repo dibuat private agar lebih aman
+    except Exception as e:
+        print(f"Error creating repo: {e}")
+    # Simpan model secara lokal
+    torch.save(lipsync_model.state_dict(), model_filename)
+    print(f"Model saved locally to {model_filename}")
+    # Upload model ke Hugging Face Hub
+    try:
+        api.upload_file(
+            path_or_fileobj=model_filename,
+            path_in_repo=model_filename,
+            repo_id=repo_id,
+            token=HF_TOKEN,
+        )
+        print(f"Model uploaded to {repo_id}/{model_filename}")
+    except Exception as e:
+        print(f"Error uploading model: {e}")
+# 6. Fungsi untuk mengunduh dan memuat model
+def download_and_load_model():
+  try:
+      model_path = api.model_info(repo_id=repo_id, token=HF_TOKEN).siblings[0].rfilename
+      api.download_file(
+          path_or_fileobj=model_filename,
+          path_in_repo=model_path,
+          repo_id=repo_id,
+          token=HF_TOKEN,
+          local_dir="."
+      )
+      lipsync_model.load_state_dict(torch.load(model_filename))
+      print("Model loaded from Hugging Face Hub")
+  except Exception as e:
+      print(f"Error loading model: {e}")
+      print("Starting with a fresh model.")
+# 7. Antarmuka Gradio
+def run_app(input_video, input_audio, output_video):
+    # Coba untuk load model dari HF Hub
+    if HF_TOKEN:
+        download_and_load_model()
+    # save files to path
+    input_video_path = "input_video.mp4"
+    input_audio_path = "input_audio.wav"
+    with open(input_video_path, "wb") as f:
+      f.write(input_video.getbuffer())
+    with open(input_audio_path, "wb") as f:
+      f.write(input_audio.getbuffer())
+    # Lakukan pelatihan selama 5 epoch
+    train_lipsync_model(input_video_path, input_audio_path, epochs=5)
+    output_video = lipsync_inference(input_video_path, input_audio_path, output_video)
+    # remove files from path
+    os.remove(input_video_path)
+    os.remove(input_audio_path)
+    return output_video
+input_video = gr.inputs.Video(type="file", label="Input Video")
+input_audio = gr.inputs.Audio(type="file", label="Input Audio")
+output_video = "output_video.mp4"
+iface = gr.Interface(
+    fn=run_app,
+    inputs=[input_video, input_audio],
+    outputs="video",
+    title="LipSync AI on CPU",
+    description="Ubah audio dari video menggunakan AI Lipsync (CPU Version).",
+)
+iface.launch(debug=True)