farid678 committed on
Commit
98e8f5d
·
verified ·
1 Parent(s): 37a1d33
Files changed (1) hide show
  1. app.py +56 -41
app.py CHANGED
@@ -3,18 +3,31 @@ import cv2
3
  import mediapipe as mp
4
  import whisper
5
  import numpy as np
6
- import ffmpeg
7
  import os
 
8
 
9
  # -----------------------------
10
- # تنظیمات MediaPipe و Whisper
11
  # -----------------------------
12
  mp_face_mesh = mp.solutions.face_mesh
13
  face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
14
- model = whisper.load_model("small")
15
 
16
  # -----------------------------
17
- # تابع محاسبه حرکت لب
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # -----------------------------
19
  def lip_aspect_ratio(landmarks, width, height):
20
  top_lip = landmarks[13]
@@ -28,39 +41,35 @@ def lip_aspect_ratio(landmarks, width, height):
28
  return (bottom - top) / (right - left)
29
 
30
  # -----------------------------
31
- # تابع اصلی پردازش ویدئو
32
  # -----------------------------
33
  def process_video(video_file):
34
- input_path = video_file
35
- output_audio = "audio.wav"
36
- output_video = "output.mp4"
37
-
38
- # استخراج صدا از ویدئو
39
- try:
40
- (
41
- ffmpeg
42
- .input(input_path)
43
- .output(output_audio, format='wav', acodec='pcm_s16le', ac=1, ar='16k')
44
- .overwrite_output()
45
- .run(quiet=True)
46
- )
47
- except Exception as e:
48
- return f"❌ Error extracting audio: {e}"
49
 
50
  # تبدیل صدا به متن
51
- result = model.transcribe(output_audio, word_timestamps=True)
52
- segments = result["segments"]
53
 
54
- # آماده‌سازی ویدئو
55
- cap = cv2.VideoCapture(input_path)
56
- fps = cap.get(cv2.CAP_PROP_FPS)
57
- width, height = int(cap.get(3)), int(cap.get(4))
 
58
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
59
- out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))
 
60
 
61
  frame_index = 0
62
- segment_index = 0
63
  current_sub = ""
 
64
 
65
  while cap.isOpened():
66
  ret, frame = cap.read()
@@ -69,7 +78,7 @@ def process_video(video_file):
69
  frame_index += 1
70
  time_sec = frame_index / fps
71
 
72
- # بررسی بازه زیرنویس
73
  if segment_index < len(segments):
74
  seg = segments[segment_index]
75
  if time_sec >= seg["start"] and time_sec <= seg["end"]:
@@ -78,33 +87,39 @@ def process_video(video_file):
78
  segment_index += 1
79
  current_sub = ""
80
 
81
- # تشخیص چهره و لب‌ها
82
- rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
83
- results = face_mesh.process(rgb)
 
84
  if results.multi_face_landmarks:
85
  for face_landmarks in results.multi_face_landmarks:
86
  landmarks = face_landmarks.landmark
87
  har = lip_aspect_ratio(landmarks, width, height)
 
88
  lips_indices = list(range(61, 88))
89
  for i in lips_indices:
90
  x = int(landmarks[i].x * width)
91
  y = int(landmarks[i].y * height)
92
  cv2.circle(frame, (x, y), 1, (0,255,0), -1)
 
93
  if har > 0.3:
94
- cv2.putText(frame, "Speaking...", (50,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
 
95
 
96
- # نمایش زیرنویس
97
  if current_sub:
98
- cv2.rectangle(frame, (0, height-80), (width, height), (0,0,0), -1)
99
- cv2.putText(frame, current_sub.strip(), (40, height-30),
100
- cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255,255,255), 2, cv2.LINE_AA)
 
 
101
 
102
  out.write(frame)
103
 
104
  cap.release()
105
  out.release()
106
 
107
- return output_video
108
 
109
  # -----------------------------
110
  # رابط Gradio
@@ -112,9 +127,9 @@ def process_video(video_file):
112
  demo = gr.Interface(
113
  fn=process_video,
114
  inputs=gr.Video(label="Upload your video"),
115
- outputs=gr.Video(label="Processed video with lips + subtitles"),
116
- title="👄 Lip Detection & Whisper Subtitle",
117
- description="Upload a short video. The app detects face & lips, then uses Whisper to generate speech subtitles.",
118
  )
119
 
120
  if __name__ == "__main__":
 
3
  import mediapipe as mp
4
  import whisper
5
  import numpy as np
6
+ import subprocess
7
  import os
8
+ from PIL import Image, ImageDraw, ImageFont
9
 
10
  # -----------------------------
11
+ # تنظیم MediaPipe
12
  # -----------------------------
13
  mp_face_mesh = mp.solutions.face_mesh
14
  face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
 
15
 
16
  # -----------------------------
17
+ # بارگذاری مدل Whisper
18
+ # -----------------------------
19
+ print("Loading Whisper model...")
20
+ model = whisper.load_model("small") # یا 'tiny' برای سرعت بیشتر
21
+
22
+ # -----------------------------
23
+ # فونت برای زیرنویس یونیکد
24
+ # -----------------------------
25
+ font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf" # مسیر فونت را طبق سیستم خودت تنظیم کن
26
+ font_size = 32
27
+ font = ImageFont.truetype(font_path, font_size)
28
+
29
+ # -----------------------------
30
+ # تابع حرکت لب
31
  # -----------------------------
32
  def lip_aspect_ratio(landmarks, width, height):
33
  top_lip = landmarks[13]
 
41
  return (bottom - top) / (right - left)
42
 
43
  # -----------------------------
44
+ # تابع پردازش ویدئو
45
  # -----------------------------
46
  def process_video(video_file):
47
+ video_path = video_file
48
+ cap = cv2.VideoCapture(video_path)
49
+ fps = cap.get(cv2.CAP_PROP_FPS)
50
+ width, height = int(cap.get(3)), int(cap.get(4))
51
+
52
+ # استخراج صوت
53
+ audio_path = "temp_audio.wav"
54
+ subprocess.run(['ffmpeg', '-y', '-i', video_path, '-q:a', '0', '-map', 'a', audio_path],
55
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
 
 
 
 
 
56
 
57
  # تبدیل صدا به متن
58
+ result = model.transcribe(audio_path, word_timestamps=True)
59
+ segments = result['segments']
60
 
61
+ # حذف فایل موقت صوتی
62
+ if os.path.exists(audio_path):
63
+ os.remove(audio_path)
64
+
65
+ # خروجی ویدئو MP4
66
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
67
+ output_file = "output_subtitle.mp4"
68
+ out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
69
 
70
  frame_index = 0
 
71
  current_sub = ""
72
+ segment_index = 0
73
 
74
  while cap.isOpened():
75
  ret, frame = cap.read()
 
78
  frame_index += 1
79
  time_sec = frame_index / fps
80
 
81
+ # زیرنویس
82
  if segment_index < len(segments):
83
  seg = segments[segment_index]
84
  if time_sec >= seg["start"] and time_sec <= seg["end"]:
 
87
  segment_index += 1
88
  current_sub = ""
89
 
90
+ rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
91
+ results = face_mesh.process(rgb_frame)
92
+
93
+ # رسم لب‌ها
94
  if results.multi_face_landmarks:
95
  for face_landmarks in results.multi_face_landmarks:
96
  landmarks = face_landmarks.landmark
97
  har = lip_aspect_ratio(landmarks, width, height)
98
+
99
  lips_indices = list(range(61, 88))
100
  for i in lips_indices:
101
  x = int(landmarks[i].x * width)
102
  y = int(landmarks[i].y * height)
103
  cv2.circle(frame, (x, y), 1, (0,255,0), -1)
104
+
105
  if har > 0.3:
106
+ cv2.putText(frame, "Speaking...", (50,50),
107
+ cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
108
 
109
+ # اضافه کردن زیرنویس با PIL
110
  if current_sub:
111
+ frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
112
+ draw = ImageDraw.Draw(frame_pil)
113
+ draw.rectangle([(0, height-80), (width, height)], fill=(0,0,0,127))
114
+ draw.text((40, height-70), current_sub.strip(), font=font, fill=(255,255,255))
115
+ frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)
116
 
117
  out.write(frame)
118
 
119
  cap.release()
120
  out.release()
121
 
122
+ return output_file
123
 
124
  # -----------------------------
125
  # رابط Gradio
 
127
  demo = gr.Interface(
128
  fn=process_video,
129
  inputs=gr.Video(label="Upload your video"),
130
+ outputs=gr.Video(label="Download processed video with subtitles"),
131
+ title="👄 Lip Detection + Whisper Subtitle",
132
+ description="Upload a video, detect face & lips, and add subtitles using Whisper (supports Unicode / Persian text)."
133
  )
134
 
135
  if __name__ == "__main__":