Jeonghwanny committed on
Commit
0a3dc93
·
verified ·
1 Parent(s): def8bd6

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +190 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ warnings.filterwarnings("ignore") # 경고 무시
3
+ #!pip install pyannote.audio
4
+ #!pip install moviepy
5
+ import librosa
6
+ import numpy as np
7
+ import os
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ from torch.utils.data import DataLoader, TensorDataset
13
+ import torch.functional as F
14
+
15
+ from pyannote.audio import Pipeline
16
+
17
+ from pyannote.audio import Audio
18
+ import torchaudio
19
+
20
+
21
+ import torch.nn.functional as F
22
+ import os
23
+
24
+ from moviepy.editor import VideoFileClip
25
+
26
+ from transformers import pipeline
27
+
28
+ from huggingface_hub import hf_hub_download
29
+
30
+ #!pip install gradio
31
+ import gradio as gr
32
+
33
+ from moviepy.editor import VideoFileClip
34
+
35
+
36
+
37
# Audio conversion: mp4 --> wav
def extract_audio_from_video(video_file_path, audio_file_path):
    """Extract the audio track from a video file and save it as a WAV file.

    Args:
        video_file_path: Path to the input video (e.g. an .mp4 file).
        audio_file_path: Destination path for the extracted .wav file.

    Raises:
        AttributeError: If the video has no audio track (``video.audio`` is None).
    """
    # Load the mp4 file
    video = VideoFileClip(video_file_path)
    try:
        # pcm_s16le = uncompressed 16-bit PCM, the canonical WAV codec
        video.audio.write_audiofile(audio_file_path, codec='pcm_s16le')
    finally:
        # Bug fix: the original never closed the clip, leaking the ffmpeg
        # reader process/file handles on every call.
        video.close()
44
+
45
# Load the full audio file and split it per speaker
def seprate_speaker(audio_file, pipeline, output_path="/tmp/wav"):
    """Diarize *audio_file* and write one concatenated WAV file per speaker.

    Args:
        audio_file: Path to the input WAV file.
        pipeline: A pyannote.audio speaker-diarization Pipeline.
        output_path: Directory receiving one "<speaker>.wav" per speaker.
            Defaults to "/tmp/wav", the directory main() reads back from.

    Side effects:
        Creates *output_path* if missing and writes <speaker>.wav files there.
    """
    # NOTE(review): the original also built an unused `Audio()` helper here;
    # it was never referenced, so it has been removed.
    waveform, sample_rate = torchaudio.load(audio_file)
    diarization = pipeline(audio_file)

    # Collect every utterance waveform, grouped by speaker label.
    speaker_segments = {}
    for segment, _, speaker in diarization.itertracks(yield_label=True):
        # Slice the waveform by sample index for this utterance
        start = int(segment.start * sample_rate)
        end = int(segment.end * sample_rate)
        speaker_segments.setdefault(speaker, []).append(waveform[:, start:end])

    # Concatenate each speaker's utterances into a single file.
    os.makedirs(output_path, exist_ok=True)  # hoisted: was re-run per speaker
    for speaker, segments in speaker_segments.items():
        combined_waveform = torch.cat(segments, dim=1)
        output_filename = os.path.join(output_path, f"{speaker}.wav")
        torchaudio.save(output_filename, combined_waveform, sample_rate)
77
+
78
+
79
+
80
# A small DeepVoice-style classifier: two Conv1d blocks + temporal pooling.
class DeepVoiceModel(nn.Module):
    """1-D CNN over MFCC frames, mean-pooled over time, with a linear head.

    Args:
        input_dim: Number of input channels (MFCC coefficients per frame).
        hidden_dim: Channel count of both convolutional layers.
        num_classes: Size of the output logit vector.
        dropout_rate: Dropout probability applied after each conv block.
        l2_reg: Accepted for interface compatibility; not used inside the
            module (weight decay would belong on the optimizer).
    """

    def __init__(self, input_dim, hidden_dim, num_classes, dropout_rate=0.3, l2_reg=0.01):
        super().__init__()
        # Attribute names/order are fixed: they are the checkpoint's state_dict keys.
        self.conv1 = nn.Conv1d(input_dim, hidden_dim, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map a (batch, input_dim, time) tensor to (batch, num_classes) logits."""
        h = self.dropout(self.bn1(torch.relu(self.conv1(x))))
        h = self.dropout(self.bn2(torch.relu(self.conv2(h))))
        pooled = h.mean(dim=2)  # temporal pooling: average over the time axis
        return self.fc(pooled)
99
+
100
+
101
def extract_mfcc_path(file_path, n_mfcc=13, max_len=100):
    """Load an audio file and return a fixed-size (n_mfcc, max_len) MFCC tensor.

    The MFCC matrix is right-padded with zeros or truncated along the time
    axis so every input file yields a tensor of the same shape.

    Args:
        file_path: Path to the audio file.
        n_mfcc: Number of MFCC coefficients to extract per frame.
        max_len: Fixed number of time frames in the returned tensor.

    Returns:
        torch.Tensor of shape (n_mfcc, max_len), dtype float32.
    """
    # Load audio at its native sampling rate and compute MFCC features.
    audio, sample_rate = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

    # Normalize the time dimension: copy into a pre-sized zero buffer,
    # which pads short clips and truncates long ones in one step.
    fixed = np.zeros((mfcc.shape[0], max_len), dtype=mfcc.dtype)
    width = min(mfcc.shape[1], max_len)
    fixed[:, :width] = mfcc[:, :width]
    return torch.Tensor(fixed)
115
+
116
# Batch-check every audio file in a folder against the model at once.
def real_fake_check(list_dir, path, model):
    """Classify each file in *list_dir* (found under *path*) as real or fake.

    Args:
        list_dir: Iterable of audio file names to evaluate.
        path: Directory containing those files.
        model: A trained DeepVoiceModel (should already be in eval mode).

    Returns:
        dict with 'real: ' and 'fake: ' counts as "k/n" strings and 'prob: ',
        a per-file map of the fake-class probability formatted to 2 decimals.
    """
    THRESHOLD = 0.4  # a file counts as real only when P(real) >= 0.4
    # Bug fix: the original hard-coded .to('cuda'), crashing on CPU-only
    # hosts even though main() selects a device; follow the model's device.
    device = next(model.parameters()).device

    r_cnt = 0
    f_cnt = 0
    prob = {}
    for name in list_dir:
        features = extract_mfcc_path(os.path.join(path, name))
        # Add a batch dimension -> (1, n_mfcc, frames). extract_mfcc_path
        # already returns a Tensor, so no torch.tensor() re-wrap (which
        # warned about copying an existing tensor) is needed.
        batch = features.unsqueeze(0).to(device)
        with torch.no_grad():  # inference only; skip autograd bookkeeping
            logits = model(batch.float())
        probabilities = F.softmax(logits, dim=1)
        prob[name] = '%.2f' % probabilities[0][1].item()
        # P(real) >= THRESHOLD -> class 0 (real); otherwise class 1 (fake)
        predicted_class = 0 if probabilities[0][0] >= THRESHOLD else 1
        if predicted_class == 0:
            r_cnt += 1
        else:
            f_cnt += 1

    total = len(list_dir)
    return {'real: ': f'{r_cnt}/{total}', 'fake: ': f'{f_cnt}/{total}', 'prob: ': prob}
136
+
137
+
138
def main(file_name):
    """Run the full deep-voice detection pipeline on one video file.

    Steps: extract the audio track, diarize it into per-speaker WAV files,
    load the pretrained classifier from the Hub, and score every speaker file.

    Args:
        file_name: Path to the input video (mp4).

    Returns:
        The result dict produced by real_fake_check().
    """
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

    # Intermediate WAV for the whole video's audio track.
    audio_file = '/tmp/output_audio.wav'
    extract_audio_from_video(file_name, audio_file)

    # Split speakers into one WAV file each (written under /tmp/wav).
    seprate_speaker(audio_file, pipeline)

    # Hyperparameters must match the checkpoint's training configuration.
    mel_dim = 13        # MFCC feature dimension
    num_classes = 2     # real vs fake
    hidden_dim = 128
    dropout_rate = 0.2
    l2_reg = 0.01

    # Download and load the pretrained classifier.
    model_path = hf_hub_download(repo_id="sssssungk/deepfake_voice", filename="deepvoice_model_girl.pth")
    model = DeepVoiceModel(mel_dim, hidden_dim, num_classes, dropout_rate, l2_reg).to(device)
    # Bug fix: map_location lets a GPU-saved checkpoint load on CPU-only hosts.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()  # evaluation mode: freeze BatchNorm stats, disable dropout

    # Score every per-speaker file produced by seprate_speaker().
    fake_path = '/tmp/wav'
    return real_fake_check(os.listdir(fake_path), fake_path, model)
175
+
176
def deepvoice_check(video_file):
    """Gradio entry point: run the detection pipeline on the uploaded video."""
    return main(video_file)
179
+
180
# Build the Gradio interface: one video upload in, one text verdict out.
_interface_config = dict(
    fn=deepvoice_check,
    inputs=gr.Video(label="Upload mp4 File"),
    outputs=gr.Textbox(label="DeepFaKeVoice Detection Result"),
    title="DeepFaKeVoice Check",
    description="Upload an mp4 file to check.",
)
deepfake = gr.Interface(**_interface_config)

if __name__ == "__main__":
    # share=True publishes a temporary public URL; debug=True surfaces errors.
    deepfake.launch(share=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ torchaudio
4
+ transformers
5
+ huggingface_hub
6
+ gradio
7
+ pyannote.audio
8
+ moviepy
9
+ librosa
10
+ numpy
11
+ ffmpeg