farid678 committed on
Commit
eb46570
·
verified ·
1 Parent(s): 09572ab
Files changed (1) hide show
  1. app.py +83 -93
app.py CHANGED
@@ -5,7 +5,7 @@ import whisper
5
  import numpy as np
6
  import subprocess
7
  import os
8
- from PIL import Image, ImageDraw, ImageFont
9
 
10
  # -----------------------------
11
  # تنظیم MediaPipe
@@ -14,123 +14,113 @@ mp_face_mesh = mp.solutions.face_mesh
14
  face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
15
 
16
# -----------------------------
# Load the Whisper speech-to-text model
# -----------------------------
print("Loading Whisper model...")
model = whisper.load_model("small")  # or 'tiny' for more speed

# -----------------------------
# Unicode-capable font for subtitle rendering
# -----------------------------
font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"  # adjust this path for your system
font_size = 32
font = ImageFont.truetype(font_path, font_size)
28
-
29
- # -----------------------------
30
- # تابع حرکت لب
31
- # -----------------------------
32
def lip_aspect_ratio(landmarks, width, height):
    """Return a mouth-openness ratio for one detected face.

    The ratio is the vertical gap between the inner upper/lower lip divided
    by the horizontal distance between the mouth corners, both scaled to
    pixel units.

    Args:
        landmarks: MediaPipe FaceMesh landmark sequence (normalized 0..1
            coordinates, indexable by landmark id).
        width: frame width in pixels (scales x coordinates).
        height: frame height in pixels (scales y coordinates).

    Returns:
        float: openness ratio; 0.0 when the mouth corners coincide, which
        avoids a ZeroDivisionError on degenerate detections.
    """
    # MediaPipe FaceMesh indices: 13/14 = inner upper/lower lip,
    # 61/291 = left/right mouth corners.
    top = landmarks[13].y * height
    bottom = landmarks[14].y * height
    left = landmarks[61].x * width
    right = landmarks[291].x * width
    mouth_width = right - left
    if mouth_width == 0:
        return 0.0  # degenerate detection: corners collapsed to a point
    return (bottom - top) / mouth_width
42
-
43
- # -----------------------------
44
- # تابع پردازش ویدئو
45
- # -----------------------------
46
def process_video(video_file):
    """Transcribe the video's audio with Whisper and burn the text into the
    frames as subtitles, with a lip-landmark overlay on detected faces.

    Returns the path of the subtitled output MP4.
    """
    video_path = video_file
    cap = cv2.VideoCapture(video_path)

    fps = cap.get(cv2.CAP_PROP_FPS)
    width, height = int(cap.get(3)), int(cap.get(4))

    # Pull the audio track out with ffmpeg (console output suppressed).
    audio_path = "temp_audio.wav"
    subprocess.run(['ffmpeg', '-y', '-i', video_path, '-q:a', '0', '-map', 'a', audio_path],
                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # Speech-to-text with per-word timestamps.
    result = model.transcribe(audio_path, word_timestamps=True)
    segments = result['segments']

    # The temporary audio file is no longer needed.
    if os.path.exists(audio_path):
        os.remove(audio_path)

    # MP4 writer for the annotated output.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = "output_subtitle.mp4"
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    frame_no = 0
    subtitle = ""
    seg_idx = 0

    while cap.isOpened():
        ok, frame = cap.read()
        if not ok:
            break
        frame_no += 1
        timestamp = frame_no / fps

        # Pick the subtitle segment covering the current timestamp.
        if seg_idx < len(segments):
            seg = segments[seg_idx]
            if seg["start"] <= timestamp <= seg["end"]:
                subtitle = seg["text"]
            elif timestamp > seg["end"]:
                seg_idx += 1
                subtitle = ""

        results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Lip-landmark overlay plus a naive "speaking" indicator.
        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                landmarks = face_landmarks.landmark
                har = lip_aspect_ratio(landmarks, width, height)

                # MediaPipe FaceMesh lip indices 61..87.
                for idx in range(61, 88):
                    px = int(landmarks[idx].x * width)
                    py = int(landmarks[idx].y * height)
                    cv2.circle(frame, (px, py), 1, (0,255,0), -1)

                if har > 0.3:
                    cv2.putText(frame, "Speaking...", (50,50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)

        # Render the subtitle via PIL (handles Unicode / Persian text).
        if subtitle:
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            draw = ImageDraw.Draw(frame_pil)
            # NOTE(review): the alpha in (0,0,0,127) is ignored on an RGB
            # image, so this bar is drawn fully opaque.
            draw.rectangle([(0, height-80), (width, height)], fill=(0,0,0,127))
            draw.text((40, height-70), subtitle.strip(), font=font, fill=(255,255,255))
            frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

        out.write(frame)

    cap.release()
    out.release()

    return output_file
123
 
124
# -----------------------------
# Gradio interface
# -----------------------------
demo = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload your video"),
    outputs=gr.Video(label="Download processed video with subtitles"),
    title="👄 Lip Detection + Whisper Subtitle",
    description="Upload a video, detect face & lips, and add subtitles using Whisper (supports Unicode / Persian text)."
)

if __name__ == "__main__":
    demo.launch()
 
5
  import numpy as np
6
  import subprocess
7
  import os
8
+ from collections import deque
9
 
10
  # -----------------------------
11
  # تنظیم MediaPipe
 
14
  face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
15
 
16
# -----------------------------
# Load the Haar Cascade face-detection model (ships with OpenCV)
# -----------------------------
cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
face_cascade = cv2.CascadeClassifier(cascade_path)
21
 
22
  # -----------------------------
23
+ # تابع اصلی پردازش ویدیو
24
  # -----------------------------
25
def _lip_aspect_ratio(landmarks, roi_w, roi_h):
    """Return mouth openness: inner-lip vertical gap over mouth width,
    scaled to the face-ROI pixel size. Returns 0.0 for degenerate
    detections (coincident mouth corners) instead of dividing by zero."""
    # MediaPipe FaceMesh indices: 13/14 = inner upper/lower lip,
    # 61/291 = left/right mouth corners.
    top = landmarks[13].y * roi_h
    bottom = landmarks[14].y * roi_h
    left = landmarks[61].x * roi_w
    right = landmarks[291].x * roi_w
    mouth_width = right - left
    if mouth_width == 0:
        return 0.0
    return (bottom - top) / mouth_width


# -----------------------------
# Main video-processing function
# -----------------------------
def process_video(video_path):
    """Detect faces/lips in the uploaded video (writing an annotated MP4 to
    output_faces.mp4 as a side effect) and return the Whisper transcript.

    Args:
        video_path: path to the uploaded video. Current Gradio `gr.Video`
            passes a filepath string; a file-like object with `.read()` is
            also accepted for backward compatibility.

    Returns:
        str: the transcribed speech text.
    """
    import shutil  # local import: only needed for the path-based branch

    # Stage the upload under a fixed local name.
    input_video = "input_video.mp4"
    if hasattr(video_path, "read"):
        with open(input_video, "wb") as f:
            f.write(video_path.read())
    else:
        # gr.Video hands us a filepath string, not a file object.
        shutil.copyfile(str(video_path), input_video)

    # Extract the audio track with ffmpeg (best effort, output suppressed).
    audio_path = "audio.wav"
    subprocess.call(
        ['ffmpeg', '-y', '-i', input_video, '-q:a', '0', '-map', 'a', audio_path],
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
    )

    # Load Whisper once and cache it on the function: reloading the medium
    # model on every request is very slow and memory-hungry.
    if not hasattr(process_video, "_model"):
        print("Loading Whisper model (medium)...")
        process_video._model = whisper.load_model("medium")
    model = process_video._model

    # Speech-to-text.
    print("Transcribing...")
    result = model.transcribe(audio_path)
    text_output = result["text"]

    # Remove the temporary audio file.
    if os.path.exists(audio_path):
        os.remove(audio_path)

    # -----------------------------
    # Face + lip detection over all frames
    # -----------------------------
    cap = cv2.VideoCapture(input_video)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # guard: 0 fps would break the writer
    width, height = int(cap.get(3)), int(cap.get(4))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = "output_faces.mp4"
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    har_history = deque(maxlen=5)  # smooth the lip ratio over recent frames

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)

        for (x, y, w, h) in faces:
            # Box around the detected face.
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)

            # Run MediaPipe only on the face ROI; its landmarks are then
            # normalized to the ROI, so scale by (w, h) and offset by (x, y).
            roi_rgb = cv2.cvtColor(frame[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
            results = face_mesh.process(roi_rgb)
            if not results.multi_face_landmarks:
                continue

            for face_landmarks in results.multi_face_landmarks:
                landmarks = face_landmarks.landmark

                har_history.append(_lip_aspect_ratio(landmarks, w, h))
                avg_har = np.mean(har_history)

                # Draw the lip landmarks (MediaPipe FaceMesh indices 61..87).
                for i in range(61, 88):
                    x_lip = int(x + landmarks[i].x * w)
                    y_lip = int(y + landmarks[i].y * h)
                    cv2.circle(frame, (x_lip, y_lip), 1, (0, 255, 0), -1)

                # Heuristic speaking indicator; 0.3 threshold — TODO tune.
                if avg_har > 0.3:
                    cv2.putText(frame, "Speaking...", (x, y - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        out.write(frame)

    cap.release()
    out.release()

    return text_output
113
 
114
# -----------------------------
# Gradio interface
# -----------------------------
interface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload a Video"),
    outputs=gr.Textbox(label="Detected Speech (Whisper Medium)"),
    title="🎥 Face & Lip Detection with HaarCascade + Whisper",
    description="Upload a video. The system detects faces and lips using HaarCascade + MediaPipe, and transcribes speech using Whisper Medium model."
)

if __name__ == "__main__":
    # Explicit host/port so the app is reachable inside a container.
    interface.launch(server_name="0.0.0.0", server_port=7860)