Ricky01anjay committed on
Commit
85f08bf
·
verified ·
1 Parent(s): 355e25c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -75
app.py CHANGED
@@ -7,13 +7,20 @@ import json
7
  import time
8
  import subprocess
9
  import logging
 
10
  import numpy as np
 
 
11
  from flask import Flask, request, jsonify, render_template_string, send_from_directory
12
  import whisper
13
  import edge_tts
14
 
 
 
 
15
  # --- KONFIGURASI SILENT LOGS ---
16
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
17
  logging.getLogger('werkzeug').setLevel(logging.ERROR)
18
 
19
  app = Flask(__name__)
@@ -31,78 +38,100 @@ VOICE_MAP = {
31
  'ja-JP': {'Male': 'ja-JP-KeitaNeural', 'Female': 'ja-JP-NanamiNeural'}
32
  }
33
 
34
- # Mapping Bahasa untuk Prompt AI
35
  LANG_MAP = {
36
  'id-ID': 'Indonesia',
37
  'en-US': 'Inggris',
38
  'ja-JP': 'Jepang'
39
  }
40
 
41
- # Load Whisper (CPU Friendly, FP16 Fixed)
 
42
  whisper_model = whisper.load_model("base")
43
 
44
def get_audio_duration(file_path):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        file_path: Path of the audio/video file to probe.

    Returns:
        The duration as a float, or 0.0 when ffprobe cannot be started or
        its output cannot be parsed as a number (best-effort contract).
    """
    cmd = [
        'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1', file_path,
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
        # ffprobe prints nothing to stdout on failure; float() rejects that.
        return float(result.stdout)
    except (OSError, ValueError):
        # OSError: ffprobe missing / not executable. ValueError: unparsable output.
        return 0.0
54
 
55
def analyze_gender_and_pitch(audio_path):
    """Analyze an audio clip to pick a speaker gender and a pitch offset.

    Args:
        audio_path: Path of the audio clip to load with librosa.

    Returns:
        A ``(gender, pitch_str)`` tuple where ``gender`` is "Male" or
        "Female" and ``pitch_str`` is a signed Hz offset such as "+8Hz".
        ``("Male", "+0Hz")`` is returned when the clip is empty, when no F0
        values are usable, or when the analysis raises.
    """
    fallback = ("Male", "+0Hz")
    try:
        import librosa

        # Load the clip at a 22050 Hz sample rate.
        samples, _sample_rate = librosa.load(audio_path, sr=22050)
        if len(samples) == 0:
            return fallback

        # Fundamental-frequency (F0) track with any NaN frames removed.
        f0_track = librosa.yin(samples, fmin=65, fmax=300)
        usable_f0 = f0_track[~np.isnan(f0_track)]
        if len(usable_f0) == 0:
            return fallback

        average_f0 = np.mean(usable_f0)

        # Common threshold: >= 165 Hz is treated as female, below as male.
        # The reference F0 (male ~120 Hz, female ~210 Hz) anchors the offset.
        if average_f0 >= 165:
            gender, reference_f0 = "Female", 210.0
        else:
            gender, reference_f0 = "Male", 120.0

        # Halve the distance so the variation between speakers is not extreme,
        # then clamp to [-20, +20] Hz to keep the synthesized voice intact.
        offset_hz = int((average_f0 - reference_f0) / 2)
        offset_hz = min(20, max(-20, offset_hz))
        return gender, f"{offset_hz:+d}Hz"
    except Exception as e:
        print(f"Pitch analysis warning: {e}")

    return fallback  # default when analysis failed
 
88
 
89
  def translate_segments_llm(segments, custom_prompt, target_voice):
90
  target_lang = LANG_MAP.get(target_voice, 'Indonesia')
91
 
92
- # PERBAIKAN: Memasukkan bahasa target secara paksa ke dalam prompt
93
- if custom_prompt:
94
- instruction = f"{custom_prompt}\n\nPENTING: Terjemahkan SEMUA teks ke dalam bahasa {target_lang}."
95
- else:
96
- instruction = f"Terjemahkan teks dalam JSON ini ke bahasa {target_lang} dengan akurat. Balas HANYA dengan JSON array."
97
 
98
  input_data = [{"id": i, "text": s['text']} for i, s in enumerate(segments)]
99
  full_prompt = f"{instruction}\n\nFormat: [{{'id': 0, 'text': '...'}}]\n\nData:\n{json.dumps(input_data)}"
100
 
101
- url = "https://www.puruboy.kozow.com/api/ai/notegpt"
102
- payload = {"prompt": full_prompt, "model": "gemini-3-flash-preview", "chat_mode": "standard"}
103
-
104
  try:
 
 
105
  response = requests.post(url, json=payload, timeout=60)
 
106
  full_text = ""
107
  for line in response.iter_lines():
108
  if line:
@@ -117,47 +146,50 @@ def translate_segments_llm(segments, custom_prompt, target_voice):
117
  for item in translated_list:
118
  segments[item['id']]['translated_text'] = item['text']
119
  except Exception as e:
120
- print(f"Translation Error: {e}")
121
  for s in segments: s['translated_text'] = s['text']
122
  return segments
123
 
124
- # PERBAIKAN: Menambahkan parameter pitch
125
async def generate_tts(text, voice, path, pitch_str="+0Hz"):
    """Synthesize ``text`` with the given Edge TTS voice and save it to ``path``.

    ``pitch_str`` is forwarded unchanged as the ``pitch`` argument of
    ``edge_tts.Communicate``.
    """
    await edge_tts.Communicate(text, voice, pitch=pitch_str).save(path)
128
 
129
  def process_dubbing(task_id, video_path, target_voice, custom_prompt):
130
  try:
131
  tasks[task_id]['status'] = 'Mengekstrak Audio...'
132
  orig_audio = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}_orig.wav")
133
- subprocess.run(['ffmpeg', '-loglevel', 'quiet', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', orig_audio], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
 
 
 
134
 
135
  tasks[task_id]['status'] = 'Transkripsi...'
136
  result = whisper_model.transcribe(orig_audio, verbose=False, fp16=False)
137
  segments = result['segments']
138
 
139
  tasks[task_id]['status'] = f'Translasi AI ({LANG_MAP.get(target_voice, target_voice)})...'
140
- # Pass target_voice ke translator
141
  translated_segments = translate_segments_llm(segments, custom_prompt, target_voice)
142
 
143
- tasks[task_id]['status'] = 'Menganalisis Suara & Dubbing...'
144
  processed_audio_files = []
145
 
146
  for i, seg in enumerate(translated_segments):
147
  start_t = seg['start']
148
  end_t = seg['end']
149
  duration_orig = end_t - start_t
150
- text = seg.get('translated_text', seg['text'])
151
- if not text.strip(): continue
 
152
 
153
- # Potong audio asli khusus untuk segmen ini guna deteksi suara
154
- chunk_wav = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}_chunk_{i}.wav")
155
- subprocess.run(['ffmpeg', '-loglevel', 'quiet', '-y', '-i', orig_audio, '-ss', str(start_t), '-t', str(duration_orig), chunk_wav], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
156
 
157
- # Deteksi Cewek/Cowok dan variasi pitch
158
- gender, pitch_str = analyze_gender_and_pitch(chunk_wav)
159
-
160
- # Pilih Voice ID yang sesuai berdasarkan bahasa dan gender
161
  selected_voice = VOICE_MAP.get(target_voice, VOICE_MAP['id-ID'])[gender]
162
 
163
  raw_tts = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}_raw_{i}.mp3")
@@ -166,6 +198,7 @@ def process_dubbing(task_id, video_path, target_voice, custom_prompt):
166
  # Generate TTS dengan pitch modifier
167
  asyncio.run(generate_tts(text, selected_voice, raw_tts, pitch_str))
168
 
 
169
  tts_dur = get_audio_duration(raw_tts)
170
  speed = min(max(tts_dur / duration_orig, 0.7), 1.8) if duration_orig > 0 else 1.0
171
 
@@ -176,8 +209,8 @@ def process_dubbing(task_id, video_path, target_voice, custom_prompt):
176
  output_filename = f"{task_id}_output.mp4"
177
  output_path = os.path.join(app.config['UPLOAD_FOLDER'], output_filename)
178
 
179
- # LOGIKA AUDIO BARU:
180
- filter_complex = "[0:a]equalizer=f=1000:width_type=o:w=2:g=-15,volume=0.4[bg];"
181
  inputs_cmd = ['ffmpeg', '-loglevel', 'quiet', '-y', '-i', video_path]
182
  amix_inputs = "[bg]"
183
 
@@ -224,7 +257,7 @@ def generate():
224
  task_id = str(uuid.uuid4())
225
  path = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}.mp4")
226
  file.save(path)
227
- tasks[task_id] = {'status': 'Queued', 'result_video': None, 'error_message': None}
228
  threading.Thread(target=process_dubbing, args=(task_id, path, request.form.get('voice'), request.form.get('prompt'))).start()
229
  return jsonify({'task_id': task_id})
230
 
@@ -236,6 +269,7 @@ def status():
236
  def download(f):
237
  return send_from_directory(app.config['UPLOAD_FOLDER'], f)
238
 
 
239
  # --- HTML DENGAN TAILWIND CSS ---
240
  HTML_TEMPLATE = """
241
  <!DOCTYPE html>
@@ -248,8 +282,13 @@ HTML_TEMPLATE = """
248
  </head>
249
  <body class="bg-gray-900 text-gray-100 min-h-screen flex items-center justify-center p-4 font-sans">
250
 
251
- <div class="bg-gray-800 rounded-2xl shadow-2xl p-8 w-full max-w-md border border-gray-700">
252
- <h2 class="text-2xl font-bold text-center mb-2 text-white">🎙️ Dubbing Sync Pro</h2>
 
 
 
 
 
253
  <p class="text-sm text-center text-gray-400 mb-6">Deteksi Gender & Multi-Speaker Auto-Pitch</p>
254
 
255
  <form id="uploadForm" class="space-y-4">
@@ -270,12 +309,12 @@ HTML_TEMPLATE = """
270
 
271
  <div>
272
  <label class="block text-sm font-medium text-gray-300 mb-1">Custom Prompt AI (Opsional)</label>
273
- <textarea id="customPrompt" rows="2" placeholder="Gaya bahasa santai, dll..."
274
  class="w-full bg-gray-700 border border-gray-600 rounded-lg p-2.5 text-white focus:ring-2 focus:ring-blue-500 focus:outline-none resize-none"></textarea>
275
  </div>
276
 
277
  <button type="submit" id="btnSubmit"
278
- class="w-full bg-blue-600 hover:bg-blue-700 text-white font-bold py-3 px-4 rounded-lg transition duration-200 shadow-lg shadow-blue-500/30">
279
  Mulai Dubbing
280
  </button>
281
  </form>
@@ -330,7 +369,6 @@ HTML_TEMPLATE = """
330
  document.getElementById('resVideo').src = sData.result_video;
331
  document.getElementById('dlBtn').href = sData.result_video;
332
 
333
- // Reset button
334
  document.getElementById('btnSubmit').disabled = false;
335
  document.getElementById('btnSubmit').classList.remove('opacity-50', 'cursor-not-allowed');
336
  } else if (sData.status === 'Error') {
 
7
  import time
8
  import subprocess
9
  import logging
10
+ import warnings
11
  import numpy as np
12
+ import librosa
13
+ import soundfile as sf
14
  from flask import Flask, request, jsonify, render_template_string, send_from_directory
15
  import whisper
16
  import edge_tts
17
 
18
+ # --- TAMBAHAN AI UNTUK GENDER DETECTION ---
19
+ from transformers import pipeline
20
+
21
  # --- KONFIGURASI SILENT LOGS ---
22
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
23
+ warnings.filterwarnings('ignore')
24
  logging.getLogger('werkzeug').setLevel(logging.ERROR)
25
 
26
  app = Flask(__name__)
 
38
  'ja-JP': {'Male': 'ja-JP-KeitaNeural', 'Female': 'ja-JP-NanamiNeural'}
39
  }
40
 
 
41
  LANG_MAP = {
42
  'id-ID': 'Indonesia',
43
  'en-US': 'Inggris',
44
  'ja-JP': 'Jepang'
45
  }
46
 
47
# 1. Speech-to-text: load the Whisper "base" model once at import time.
print("Memuat Model Whisper...")
whisper_model = whisper.load_model("base")


# 2. Gender recognition: load the Hugging Face audio classifier on CPU.
def _load_gender_classifier():
    """Return the audio-classification pipeline, or None if loading fails."""
    try:
        classifier = pipeline(
            "audio-classification",
            model="alefiury/wav2vec2-large-xlsr-53-gender-recognition-osmr",
            device="cpu",
        )
        print("Berhasil memuat AI Gender Model!")
        return classifier
    except Exception as e:
        print(f"Peringatan: Gagal memuat AI Gender, akan menggunakan Fallback Librosa. Error: {e}")
        return None


print("Memuat Model AI Gender Recognition...")
gender_classifier = _load_gender_classifier()
 
 
59
 
 
 
 
 
 
 
 
 
60
 
61
def get_audio_duration(file_path):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        file_path: Path of the audio/video file to probe.

    Returns:
        The duration as a float, or 0.0 when ffprobe cannot be started or
        its output cannot be parsed as a number (best-effort contract).
    """
    cmd = [
        'ffprobe', '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        file_path,
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
        # ffprobe prints nothing to stdout on failure; float() rejects that.
        return float(result.stdout)
    except (OSError, ValueError):
        # OSError: ffprobe missing / not executable. ValueError: unparsable output.
        # Narrow on purpose: a bare ``except`` would also hide KeyboardInterrupt.
        return 0.0
66
+
67
def analyze_audio_chunk(y_chunk, sr):
    """Estimate the speaker's gender and a TTS pitch offset for one audio chunk.

    Gender comes from the Hugging Face classifier when it is loaded and the
    chunk is longer than 0.3 s; otherwise it falls back to a pitch-based
    rule, using the spectral centroid when the pitch is inconclusive.

    Args:
        y_chunk: 1-D array of audio samples for the segment.
        sr: Sample rate of ``y_chunk`` in Hz.

    Returns:
        A ``(gender, pitch_str)`` tuple. ``gender`` is "Male" or "Female".
        ``pitch_str`` is a signed Hz offset such as "+8Hz" or "-14Hz".
        edge_tts.Communicate only accepts this ``[+-]<int>Hz`` form for
        ``pitch``; a percentage string makes it raise ValueError.
    """
    if len(y_chunk) == 0:
        return "Male", "+0Hz"

    # 1. Pitch extraction with pYIN (probabilistic YIN); keep voiced frames only.
    f0, voiced_flag, _ = librosa.pyin(
        y_chunk,
        fmin=librosa.note_to_hz('C2'),  # ~65 Hz
        fmax=librosa.note_to_hz('C6'),  # ~1046 Hz
        sr=sr,
    )
    valid_f0 = f0[voiced_flag]
    # The median resists octave errors; 0.0 marks "no voiced frame found".
    median_f0 = float(np.median(valid_f0)) if len(valid_f0) > 0 else 0.0

    gender = None

    # 2. Gender classification with the Hugging Face model, when available.
    #    More than 0.3 s of audio is required.
    if gender_classifier is not None and len(y_chunk) > (sr * 0.3):
        try:
            # The classifier input is resampled to 16 kHz first.
            y_16k = librosa.resample(y_chunk, orig_sr=sr, target_sr=16000)
            ai_result = gender_classifier(y_16k)
            best_label = ai_result[0]['label'].lower()
            gender = "Female" if "female" in best_label else "Male"
        except Exception as e:
            print(f"AI Model Error, fallback... {e}")

    # 3. Fallback: pitch thresholds, then timbre for inconclusive pitch.
    if not gender:
        if median_f0 > 175:
            gender = "Female"
        elif 0 < median_f0 < 155:
            gender = "Male"
        else:
            # Grey zone (155-175 Hz) or no voiced frames at all: decide by
            # brightness of the sound (spectral centroid).
            cent = librosa.feature.spectral_centroid(y=y_chunk, sr=sr)
            gender = "Female" if np.median(cent) > 1600 else "Male"

    # 4. Pitch offset relative to a typical voice: male ~120 Hz, female ~210 Hz.
    base_f0 = 210.0 if gender == "Female" else 120.0
    if median_f0 > 0:
        # Cap the shift at 12% of the reference pitch so the synthesized
        # voice does not get distorted, but express it in Hz.
        max_shift_hz = base_f0 * 0.12
        shift_hz = max(-max_shift_hz, min(max_shift_hz, median_f0 - base_f0))
    else:
        shift_hz = 0.0

    return gender, f"{int(shift_hz):+d}Hz"
120
 
121
  def translate_segments_llm(segments, custom_prompt, target_voice):
122
  target_lang = LANG_MAP.get(target_voice, 'Indonesia')
123
 
124
+ if custom_prompt: instruction = f"{custom_prompt}\n\nPENTING: Terjemahkan SEMUA teks ke dalam bahasa {target_lang}."
125
+ else: instruction = f"Terjemahkan teks dalam JSON ini ke bahasa {target_lang} dengan akurat. Balas HANYA dengan JSON array."
 
 
 
126
 
127
  input_data = [{"id": i, "text": s['text']} for i, s in enumerate(segments)]
128
  full_prompt = f"{instruction}\n\nFormat: [{{'id': 0, 'text': '...'}}]\n\nData:\n{json.dumps(input_data)}"
129
 
 
 
 
130
  try:
131
+ url = "https://www.puruboy.kozow.com/api/ai/notegpt"
132
+ payload = {"prompt": full_prompt, "model": "gemini-3-flash-preview", "chat_mode": "standard"}
133
  response = requests.post(url, json=payload, timeout=60)
134
+
135
  full_text = ""
136
  for line in response.iter_lines():
137
  if line:
 
146
  for item in translated_list:
147
  segments[item['id']]['translated_text'] = item['text']
148
  except Exception as e:
149
+ print(f"Translation API Error: {e}, using original text.")
150
  for s in segments: s['translated_text'] = s['text']
151
  return segments
152
 
153
async def generate_tts(text, voice, path, pitch_str="+0%"):
    """Synthesize ``text`` with an Edge TTS voice and save the audio to ``path``.

    Args:
        text: Text to speak.
        voice: Edge TTS voice name passed to ``edge_tts.Communicate``.
        path: Output file path passed to ``communicate.save``.
        pitch_str: Requested pitch offset. edge_tts only accepts a signed Hz
            offset such as "+5Hz" for ``pitch`` (percentages are valid for
            ``rate`` and ``volume`` only). Any other value, including the
            legacy default "+0%", is replaced by the neutral "+0Hz" so that
            one bad pitch value cannot abort the dubbing job.
    """
    import re  # local import: the file-level import block is left untouched

    pitch = pitch_str.strip() if isinstance(pitch_str, str) else ""
    if re.fullmatch(r"[+-]\d+Hz", pitch) is None:
        # A zero percentage is just the legacy spelling of "no change";
        # only warn when a real adjustment is being dropped.
        if pitch not in ("+0%", "-0%"):
            print(f"TTS pitch warning: unsupported pitch {pitch_str!r}, using +0Hz.")
        pitch = "+0Hz"

    communicate = edge_tts.Communicate(text, voice, rate="+0%", pitch=pitch)
    await communicate.save(path)
157
 
158
  def process_dubbing(task_id, video_path, target_voice, custom_prompt):
159
  try:
160
  tasks[task_id]['status'] = 'Mengekstrak Audio...'
161
  orig_audio = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}_orig.wav")
162
+ subprocess.run(['ffmpeg', '-loglevel', 'quiet', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '22050', orig_audio], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
163
+
164
+ # Muat audio secara utuh ke memori (Sangat mempercepat proses analisis!)
165
+ tasks[task_id]['status'] = 'Analisis Audio Utama...'
166
+ y_full, sr_full = librosa.load(orig_audio, sr=22050)
167
 
168
  tasks[task_id]['status'] = 'Transkripsi...'
169
  result = whisper_model.transcribe(orig_audio, verbose=False, fp16=False)
170
  segments = result['segments']
171
 
172
  tasks[task_id]['status'] = f'Translasi AI ({LANG_MAP.get(target_voice, target_voice)})...'
 
173
  translated_segments = translate_segments_llm(segments, custom_prompt, target_voice)
174
 
175
+ tasks[task_id]['status'] = 'Mendeteksi Gender & Dubbing...'
176
  processed_audio_files = []
177
 
178
  for i, seg in enumerate(translated_segments):
179
  start_t = seg['start']
180
  end_t = seg['end']
181
  duration_orig = end_t - start_t
182
+ text = seg.get('translated_text', seg['text']).strip()
183
+
184
+ if not text: continue
185
 
186
+ # Potong audio langsung dari RAM (Tidak perlu FFmpeg)
187
+ start_sample = int(start_t * sr_full)
188
+ end_sample = int(end_t * sr_full)
189
+ y_chunk = y_full[start_sample:end_sample]
190
 
191
+ # Gunakan AI & Matematika Akustik untuk tentukan Gender & Variasi Pitch
192
+ gender, pitch_str = analyze_audio_chunk(y_chunk, sr_full)
 
 
193
  selected_voice = VOICE_MAP.get(target_voice, VOICE_MAP['id-ID'])[gender]
194
 
195
  raw_tts = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}_raw_{i}.mp3")
 
198
  # Generate TTS dengan pitch modifier
199
  asyncio.run(generate_tts(text, selected_voice, raw_tts, pitch_str))
200
 
201
+ # Sinkronisasi durasi TTS dengan audio original
202
  tts_dur = get_audio_duration(raw_tts)
203
  speed = min(max(tts_dur / duration_orig, 0.7), 1.8) if duration_orig > 0 else 1.0
204
 
 
209
  output_filename = f"{task_id}_output.mp4"
210
  output_path = os.path.join(app.config['UPLOAD_FOLDER'], output_filename)
211
 
212
+ # Auto-Ducking: Audio asli diredupkan (volume=0.3), dubbing dibesarkan (volume=3.0)
213
+ filter_complex = "[0:a]equalizer=f=1000:width_type=o:w=2:g=-15,volume=0.3[bg];"
214
  inputs_cmd = ['ffmpeg', '-loglevel', 'quiet', '-y', '-i', video_path]
215
  amix_inputs = "[bg]"
216
 
 
257
  task_id = str(uuid.uuid4())
258
  path = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}.mp4")
259
  file.save(path)
260
+ tasks[task_id] = {'status': 'Antri...', 'result_video': None, 'error_message': None}
261
  threading.Thread(target=process_dubbing, args=(task_id, path, request.form.get('voice'), request.form.get('prompt'))).start()
262
  return jsonify({'task_id': task_id})
263
 
 
269
  def download(f):
270
  return send_from_directory(app.config['UPLOAD_FOLDER'], f)
271
 
272
+
273
  # --- HTML DENGAN TAILWIND CSS ---
274
  HTML_TEMPLATE = """
275
  <!DOCTYPE html>
 
282
  </head>
283
  <body class="bg-gray-900 text-gray-100 min-h-screen flex items-center justify-center p-4 font-sans">
284
 
285
+ <div class="bg-gray-800 rounded-2xl shadow-2xl p-8 w-full max-w-md border border-gray-700 relative overflow-hidden">
286
+ <!-- AI Badge -->
287
+ <div class="absolute top-0 right-0 bg-purple-600 text-xs font-bold px-3 py-1 rounded-bl-lg shadow-lg">
288
+ ✨ Advanced AI Engine
289
+ </div>
290
+
291
+ <h2 class="text-2xl font-bold text-center mb-2 text-white mt-2">🎙️ Dubbing Sync Pro</h2>
292
  <p class="text-sm text-center text-gray-400 mb-6">Deteksi Gender & Multi-Speaker Auto-Pitch</p>
293
 
294
  <form id="uploadForm" class="space-y-4">
 
309
 
310
  <div>
311
  <label class="block text-sm font-medium text-gray-300 mb-1">Custom Prompt AI (Opsional)</label>
312
+ <textarea id="customPrompt" rows="2" placeholder="Contoh: Terjemahkan dengan gaya bahasa santai..."
313
  class="w-full bg-gray-700 border border-gray-600 rounded-lg p-2.5 text-white focus:ring-2 focus:ring-blue-500 focus:outline-none resize-none"></textarea>
314
  </div>
315
 
316
  <button type="submit" id="btnSubmit"
317
+ class="w-full bg-blue-600 hover:bg-blue-700 text-white font-bold py-3 px-4 rounded-lg transition duration-200 shadow-lg shadow-blue-500/30 flex justify-center items-center gap-2">
318
  Mulai Dubbing
319
  </button>
320
  </form>
 
369
  document.getElementById('resVideo').src = sData.result_video;
370
  document.getElementById('dlBtn').href = sData.result_video;
371
 
 
372
  document.getElementById('btnSubmit').disabled = false;
373
  document.getElementById('btnSubmit').classList.remove('opacity-50', 'cursor-not-allowed');
374
  } else if (sData.status === 'Error') {