sssssungk commited on
Commit
fcb97b9
·
verified ·
1 Parent(s): 27ba3aa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -0
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ warnings.filterwarnings("ignore") # ๊ฒฝ๊ณ  ๋ฌด์‹œ
3
+ #!pip install pyannote.audio
4
+ #!pip install moviepy
5
+ import librosa
6
+ import numpy as np
7
+ import os
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ from torch.utils.data import DataLoader, TensorDataset
13
+ import torch.functional as F
14
+
15
+ from pyannote.audio import Pipeline
16
+
17
+ from pyannote.audio import Audio
18
+ import torchaudio
19
+
20
+
21
+ import torch.nn.functional as F
22
+ import os
23
+
24
+ from moviepy.editor import VideoFileClip
25
+
26
+ from transformers import pipeline
27
+
28
+ from huggingface_hub import hf_hub_download
29
+
30
+ #!pip install gradio
31
+ import gradio as gr
32
+
33
+
34
+
35
+ # ์˜ค๋””์˜ค ๋ณ€ํ™˜ mp4 --> wav
36
+ def extract_audio_from_video(video_file_path, audio_file_path):
37
+ # mp4 ํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ
38
+ video = VideoFileClip(video_file_path)
39
+
40
+ # ์˜ค๋””์˜ค๋ฅผ ์ถ”์ถœํ•˜์—ฌ wav ํŒŒ์ผ๋กœ ์ €์žฅ
41
+ video.audio.write_audiofile(audio_file_path, codec='pcm_s16le')
42
+
43
+ # ์ „์ฒด ์˜ค๋””์˜ค ํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ
44
+ def seprate_speaker(audio_file, pipeline):
45
+ audio = Audio()
46
+ waveform, sample_rate = torchaudio.load(audio_file)
47
+ diarization = pipeline(audio_file)
48
+
49
+ # ํ™”์ž๋ณ„๋กœ ๋ฐœํ™” ๊ตฌ๊ฐ„์„ ์ €์žฅํ•  ๋”•์…”๋„ˆ๋ฆฌ ์ดˆ๊ธฐํ™”
50
+ speaker_segments = {}
51
+
52
+ # diarization ๊ฒฐ๊ณผ๋ฅผ ์ˆœํšŒํ•˜๋ฉฐ ๊ฐ ํ™”์ž์˜ ๋ฐœํ™”๋ฅผ ๋”•์…”๋„ˆ๋ฆฌ์— ์ถ”๊ฐ€
53
+ for segment, _, speaker in diarization.itertracks(yield_label=True):
54
+ start_time = segment.start
55
+ end_time = segment.end
56
+
57
+ # ํ•ด๋‹น ํ™”์ž๊ฐ€ ์ฒ˜์Œ ๋“ฑ์žฅํ•˜๋ฉด ๋ฆฌ์ŠคํŠธ๋ฅผ ์ดˆ๊ธฐํ™”
58
+ if speaker not in speaker_segments:
59
+ speaker_segments[speaker] = []
60
+
61
+ # ๋ฐœํ™” ๊ตฌ๊ฐ„์„ ํ•ด๋‹น ํ™”์ž์˜ ๋ฆฌ์ŠคํŠธ์— ์ถ”๊ฐ€
62
+ segment_waveform = waveform[:, int(start_time * sample_rate):int(end_time * sample_rate)]
63
+ speaker_segments[speaker].append(segment_waveform)
64
+
65
+ # ๊ฐ ํ™”์ž๋ณ„๋กœ ๋ชจ๋“  ๋ฐœํ™” ๊ตฌ๊ฐ„์„ ํ•˜๋‚˜์˜ ํŒŒ์ผ๋กœ ์ด์–ด๋ถ™์—ฌ ์ €์žฅ
66
+ for speaker, segments in speaker_segments.items():
67
+ # ํ™”์ž์˜ ๋ชจ๋“  ๋ฐœํ™” ๊ตฌ๊ฐ„์„ ์ด์–ด๋ถ™์ž„
68
+ combined_waveform = torch.cat(segments, dim=1)
69
+ output_path = "/content/wav" # ๊ฒฝ๋กœ
70
+ os.makedirs(output_path, exist_ok=True) # ๊ฒฝ๋กœ๊ฐ€ ์—†์œผ๋ฉด ์ƒ์„ฑ
71
+ output_filename = os.path.join(output_path,f"{speaker}.wav")
72
+
73
+ torchaudio.save(output_filename, combined_waveform, sample_rate) #์˜ค๋””์˜ค ํŒŒ์ผ ์ €์žฅ
74
+ #print(f"Saved {output_filename} for speaker {speaker}")
75
+
76
+
77
+ # ๊ฐ„๋‹จํ•œ DeepVoice ์Šคํƒ€์ผ ๋ชจ๋ธ ์ •์˜
78
+ class DeepVoiceModel(nn.Module):
79
+ def __init__(self, input_dim, hidden_dim, num_classes, dropout_rate=0.3, l2_reg=0.01):
80
+ super(DeepVoiceModel, self).__init__()
81
+ self.conv1 = nn.Conv1d(input_dim, hidden_dim, kernel_size=5, padding=2)
82
+ self.bn1 = nn.BatchNorm1d(hidden_dim)
83
+ self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=5, padding=2)
84
+ self.bn2 = nn.BatchNorm1d(hidden_dim)
85
+ self.dropout = nn.Dropout(dropout_rate)
86
+ self.fc = nn.Linear(hidden_dim, num_classes)
87
+
88
+ def forward(self, x):
89
+ x = self.bn1(torch.relu(self.conv1(x)))
90
+ x = self.dropout(x)
91
+ x = self.bn2(torch.relu(self.conv2(x)))
92
+ x = self.dropout(x)
93
+ x = torch.mean(x, dim=2) # Temporal pooling
94
+ x = self.fc(x)
95
+ return x
96
+
97
+
98
+ def extract_mfcc_path(file_path, n_mfcc=13, max_len=100):
99
+ # ์Œ์„ฑ ํŒŒ์ผ
100
+ audio, sample_rate = librosa.load(file_path, sr=None)
101
+ # mfcc ํŠน์„ฑ ์ถ”์ถœ
102
+ mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
103
+
104
+ # ์ผ์ •ํ•œ ๊ธธ์ด๋กœ ๋งž์ถค
105
+ if mfcc.shape[1] < max_len:
106
+ pad_width = max_len - mfcc.shape[1]
107
+ mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
108
+ else:
109
+ mfcc = mfcc[:, :max_len]
110
+
111
+ return torch.Tensor(mfcc)
112
+
113
+ # ํด๋”์— ์žˆ๋Š” ๋ฐ์ดํ„ฐ ํ•œ๋ฒˆ์— ์ ‘๊ทผํ•ด์„œ ํ•œ๋ฒˆ์— ์ฒดํฌ
114
+ def real_fake_check(list_dir, path, model):
115
+ THRESHOLD = 0.4 #๋”ฅํŽ˜์ดํฌ ๊ธฐ์ค€์„ 0.4๋กœ ์„ค์ •
116
+ r_cnt = 0
117
+ f_cnt = 0
118
+ prob = {}
119
+ for i in list_dir: # real / fake ์„ ํƒ
120
+ #print('------',i)
121
+ input_data = extract_mfcc_path(os.path.join(path, i))
122
+ input_data = torch.tensor(input_data).unsqueeze(0).to('cuda') # ๋ฐฐ์น˜ ์ฐจ์›์„ ์ถ”๊ฐ€ํ•˜์—ฌ (1, input_dim, sequence_length)๋กœ ๋งž์ถค
123
+ result = model(input_data.float())
124
+ # predicted_class = torch.argmax(result, dim=1).item()
125
+ probabilities = F.softmax(result, dim=1)
126
+ prob[i]='%.2f'%probabilities[0][1].item()
127
+
128
+ predicted_class = 0 if probabilities[0][0] >= THRESHOLD else 1 # ํ™•๋ฅ ๊ฐ’์ด ๊ธฐ์ค€์น˜๋ณด๋‹ค ํฌ๋‹ค๋ฉด real, ์•„๋‹ˆ๋ฉด fake
129
+ # print('-- %.2f'%probabilities[0][0].item()) #ํ™•๋ฅ  ๊ฐ’ ์ถœ๋ ฅ
130
+ if predicted_class == 0:
131
+ # print("REAL")
132
+ r_cnt += 1
133
+ else:
134
+ # print("FAKE")
135
+ f_cnt += 1
136
+ #print()
137
+ #print('real: ',r_cnt,'/',len(list_dir))
138
+ #print('fake: ',f_cnt,'/',len(list_dir))
139
+ return {'real: ':f'{r_cnt}/{len(list_dir)}', 'fake: ':f'{f_cnt}/{len(list_dir)}', 'prob: ': prob}
140
+
141
+
142
def main(file_name):
    """Gradio entry point: detect DeepVoice (deepfake audio) in an mp4.

    Pipeline: extract the audio track -> diarize it into per-speaker WAV
    files -> run the deepfake-voice classifier on each speaker file.

    Args:
        file_name: Path to the uploaded mp4 file.

    Returns:
        Dict summarizing real/fake counts and per-file fake probabilities
        (see real_fake_check).
    """
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    video_file = file_name
    audio_file = '/content/output_audio.wav'  # where the extracted audio is written

    extract_audio_from_video(video_file, audio_file)

    fake_path = '/content/wav'  # per-speaker WAVs are written here by seprate_speaker
    # Fix: clear leftovers from previous runs so stale speaker files are
    # not re-classified alongside the current upload.
    if os.path.isdir(fake_path):
        for leftover in os.listdir(fake_path):
            os.remove(os.path.join(fake_path, leftover))

    seprate_speaker(audio_file, pipeline)  # split speakers into separate files

    mel_dim = 13       # MFCC feature dimension
    num_classes = 2    # real vs. fake
    input_dim = mel_dim
    hidden_dim = 128
    dropout_rate = 0.2
    l2_reg = 0.01

    # Download the trained weights and load the model
    model_name = hf_hub_download(repo_id="sssssungk/deepfake_voice", filename="deepvoice_model_girl.pth")
    model = DeepVoiceModel(input_dim, hidden_dim, num_classes, dropout_rate, l2_reg).to(device)
    # Fix: map_location lets a GPU-saved checkpoint load on CPU-only hosts;
    # the original torch.load would fail there.
    model.load_state_dict(torch.load(model_name, map_location=device))
    model.eval()  # evaluation mode (disable dropout / freeze batchnorm stats)

    fake = os.listdir(fake_path)
    rf_check = real_fake_check(fake, fake_path, model)
    return rf_check
180
+
181
+
182
+ # Gradio ์ธํ„ฐํŽ˜์ด์Šค ์ƒ์„ฑ
183
+ iface = gr.Interface(
184
+ fn=main,
185
+ inputs=gr.Video(label="Upload mp4 File"),
186
+ outputs=gr.Textbox(label="Deepfake Detection Result"),
187
+ title="DeepVoice Check",
188
+ description="Upload an mp4 file to check for DeepVoice indicators."
189
+ )
190
+
191
+ # Gradio ์ธํ„ฐํŽ˜์ด์Šค ์‹คํ–‰
192
+ iface.launch()