farid678 committed on
Commit
d0e9cf7
·
verified ·
1 Parent(s): e5da71d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -71
app.py CHANGED
@@ -5,8 +5,9 @@ import whisper
5
  import numpy as np
6
  import subprocess
7
  import os
8
- import shutil
9
  from collections import deque
 
 
10
 
11
  # -----------------------------
12
  # تنظیم MediaPipe
@@ -15,117 +16,120 @@ mp_face_mesh = mp.solutions.face_mesh
15
  face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
16
 
17
  # -----------------------------
18
- # بارگذاری مدل تشخیص چهره Haar Cascade
19
  # -----------------------------
20
- cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
21
- face_cascade = cv2.CascadeClassifier(cascade_path)
22
 
23
  # -----------------------------
24
- # تابع اصلی پردازش ویدیو
25
  # -----------------------------
26
- def process_video(video_path):
27
- # اطمینان از مسیر ورودی
28
- if not os.path.exists(video_path):
29
- return "❌ Video file not found."
30
-
31
- # ذخیره ویدیو آپلودشده
32
- input_video = "input_video.mp4"
33
- shutil.copy(video_path, input_video)
34
-
35
- # استخراج صدا با ffmpeg
36
- audio_path = "audio.wav"
37
- subprocess.call(
38
- ['ffmpeg', '-y', '-i', input_video, '-q:a', '0', '-map', 'a', audio_path],
39
- stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
40
- )
41
-
42
- # بارگذاری مدل Whisper
43
- print("Loading Whisper model (medium)...")
44
- model = whisper.load_model("medium")
45
-
46
- # تبدیل گفتار به متن
47
- print("Transcribing audio...")
48
- result = model.transcribe(audio_path)
49
- text_output = result["text"]
50
-
51
- # -----------------------------
52
- # تشخیص چهره + لب در فریم‌ها
53
- # -----------------------------
54
- cap = cv2.VideoCapture(input_video)
 
 
55
  fps = cap.get(cv2.CAP_PROP_FPS)
56
  width, height = int(cap.get(3)), int(cap.get(4))
57
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
58
- output_file = "output_faces.mp4"
59
  out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
60
 
61
  har_history = deque(maxlen=5)
 
 
 
62
 
63
  while cap.isOpened():
64
  ret, frame = cap.read()
65
- if not ret:
66
- break
67
 
68
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
69
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)
70
 
 
 
71
  for (x, y, w, h) in faces:
72
- # رسم کادر دور چهره
73
  cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
74
-
75
- # فقط روی ناحیه چهره MediaPipe اعمال شود
76
- roi_rgb = cv2.cvtColor(frame[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
77
  results = face_mesh.process(roi_rgb)
 
78
 
79
  if results.multi_face_landmarks:
80
  for face_landmarks in results.multi_face_landmarks:
81
  landmarks = face_landmarks.landmark
82
-
83
- # محاسبه نسبت حرکت لب
84
- def lip_aspect_ratio():
85
- top_lip = landmarks[13]
86
- bottom_lip = landmarks[14]
87
- left_corner = landmarks[61]
88
- right_corner = landmarks[291]
89
- top = top_lip.y * h
90
- bottom = bottom_lip.y * h
91
- left = left_corner.x * w
92
- right = right_corner.x * w
93
- return (bottom - top) / (right - left)
94
-
95
- har = lip_aspect_ratio()
96
  har_history.append(har)
97
  avg_har = np.mean(har_history)
 
98
 
99
  # رسم نقاط لب
100
- lips_indices = list(range(61, 88))
101
- for i in lips_indices:
102
  x_lip = int(x + landmarks[i].x * w)
103
  y_lip = int(y + landmarks[i].y * h)
104
- cv2.circle(frame, (x_lip, y_lip), 1, (0, 255, 0), -1)
105
-
106
- # تشخیص صحبت
107
- if avg_har > 0.3:
108
- cv2.putText(frame, "Speaking...", (x, y - 10),
109
- cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  out.write(frame)
112
 
113
  cap.release()
114
  out.release()
115
 
116
- print("✅ Processing done! Output video saved as:", output_file)
117
- return text_output
118
 
119
  # -----------------------------
120
  # رابط Gradio
121
  # -----------------------------
122
- interface = gr.Interface(
123
  fn=process_video,
124
- inputs=gr.Video(label="Upload a Video"),
125
- outputs=gr.Textbox(label="Detected Speech (Whisper Medium)"),
126
- title="🎥 Face & Lip Detection (HaarCascade + MediaPipe + Whisper)",
127
- description="Uploads a video, detects faces and lips using HaarCascade + MediaPipe, and transcribes speech using Whisper Medium model."
128
  )
129
 
130
  if __name__ == "__main__":
131
- interface.launch(server_name="0.0.0.0", server_port=7860)
 
5
  import numpy as np
6
  import subprocess
7
  import os
 
8
  from collections import deque
9
+ import tempfile
10
+ import soundfile as sf
11
 
12
  # -----------------------------
13
  # تنظیم MediaPipe
 
16
# -----------------------------
# Model initialisation: MediaPipe face mesh, Haar cascades, Whisper
# -----------------------------
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)

# Haar cascade detectors shipped with OpenCV (frontal face + smile).
_cascade_dir = cv2.data.haarcascades
face_cascade = cv2.CascadeClassifier(_cascade_dir + "haarcascade_frontalface_default.xml")
smile_cascade = cv2.CascadeClassifier(_cascade_dir + "haarcascade_smile.xml")

# -----------------------------
# Speech-to-text model, loaded once at import time (medium is ~1.5 GB;
# first call downloads the weights).
# -----------------------------
model = whisper.load_model("medium")
28
+
29
# -----------------------------
# Lip-movement / speaking detection helper
# -----------------------------
def lip_aspect_ratio(landmarks, w, h):
    """Return the mouth-opening ratio: lip gap height over mouth width, in pixels.

    Parameters
    ----------
    landmarks : sequence
        MediaPipe face-mesh landmarks (normalized ``.x``/``.y`` in [0, 1]);
        indices 13/14 are the inner top/bottom lip, 61/291 the mouth corners.
    w, h : int
        Width and height of the face ROI the landmarks were computed on.

    Returns
    -------
    float
        (bottom - top) pixel gap divided by (right - left) pixel width,
        or 0.0 if the mouth width is zero (avoids ZeroDivisionError).
    """
    top_lip = landmarks[13]
    bottom_lip = landmarks[14]
    left_corner = landmarks[61]
    right_corner = landmarks[291]
    # Scale normalized coordinates by the ROI size: without this, w/h are
    # ignored and the ratio is distorted by the ROI aspect ratio, skewing
    # the caller's 0.3 speaking threshold.
    vertical = (bottom_lip.y - top_lip.y) * h
    horizontal = (right_corner.x - left_corner.x) * w
    if horizontal == 0:
        return 0.0
    return vertical / horizontal
38
+
39
# -----------------------------
# Video processing pipeline
# -----------------------------
def process_video(uploaded_video):
    """Annotate faces/lips in a video and transcribe speech with Whisper.

    Parameters
    ----------
    uploaded_video : str | os.PathLike | file-like
        Path of the uploaded video (current ``gr.Video`` passes a filepath
        string) or an open binary file object (legacy Gradio behaviour).

    Returns
    -------
    tuple[str, str]
        Path of the annotated output video and the accumulated transcript.
    """
    # Persist the upload to a temp file. Modern gr.Video hands us a filepath
    # string; calling .read() on it would crash, so accept both forms.
    tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    if isinstance(uploaded_video, (str, os.PathLike)):
        with open(uploaded_video, "rb") as src:
            tmp_video.write(src.read())
    else:
        tmp_video.write(uploaded_video.read())
    tmp_video.close()

    # Extract mono 16 kHz audio — the sample rate/channel layout Whisper
    # expects when given a raw numpy array.
    audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    subprocess.call(
        ['ffmpeg', '-y', '-i', tmp_video.name, '-ac', '1', '-ar', '16000', audio_path],
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    audio, sr = sf.read(audio_path)
    if audio.ndim > 1:
        # Downmix by averaging channels; .T.flatten() would concatenate the
        # channels back-to-back and double the apparent duration.
        audio = audio.mean(axis=1)
    # Whisper requires float32 PCM; soundfile returns float64 by default.
    audio = audio.astype(np.float32)

    # Prepare the output writer.
    cap = cv2.VideoCapture(tmp_video.name)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # some containers report 0 fps
    width, height = int(cap.get(3)), int(cap.get(4))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    har_history = deque(maxlen=5)   # smooth the lip ratio over 5 frames
    audio_index = 0
    transcribed_text = ""
    chunk_size = int(sr * 2)        # transcribe audio two seconds at a time

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)

        speaking_flag = False

        for (x, y, w, h) in faces:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            roi_rgb = cv2.cvtColor(frame[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
            results = face_mesh.process(roi_rgb)
            smiles = smile_cascade.detectMultiScale(
                gray[y:y + h, x:x + w], scaleFactor=1.7, minNeighbors=20)

            if results.multi_face_landmarks:
                for face_landmarks in results.multi_face_landmarks:
                    landmarks = face_landmarks.landmark
                    har = lip_aspect_ratio(landmarks, w, h)
                    har_history.append(har)
                    avg_har = np.mean(har_history)
                    speaking_flag = avg_har > 0.3 or len(smiles) > 0

                    # Draw the lip landmarks (MediaPipe mouth indices 61..87).
                    for i in range(61, 88):
                        x_lip = int(x + landmarks[i].x * w)
                        y_lip = int(y + landmarks[i].y * h)
                        cv2.circle(frame, (x_lip, y_lip), 1, (0, 255, 0), -1)

            if speaking_flag:
                cv2.putText(frame, "Speaking...", (x, y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        # Transcribe the audio chunk-by-chunk while someone appears to speak.
        # NOTE(review): running Whisper inside the frame loop is very slow for
        # long videos — kept as-is since batching would change the output.
        if speaking_flag and audio_index < len(audio):
            chunk = audio[audio_index:audio_index + chunk_size]
            if len(chunk) > 0:
                result = model.transcribe(chunk, fp16=False)
                transcribed_text += result["text"] + " "
            audio_index += chunk_size

        # Overlay the accumulated transcript as a subtitle bar.
        if transcribed_text.strip():
            cv2.rectangle(frame, (0, height - 50), (width, height), (0, 0, 0), -1)
            cv2.putText(frame, transcribed_text.strip(), (10, height - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

        out.write(frame)

    cap.release()
    out.release()

    # Best-effort cleanup of intermediate temp files (output_file is returned
    # to Gradio and must survive).
    for stale in (tmp_video.name, audio_path):
        try:
            os.remove(stale)
        except OSError:
            pass

    return output_file, transcribed_text.strip()
 
122
 
123
# -----------------------------
# Gradio UI
# -----------------------------
_video_in = gr.Video(label="Upload Video")
_video_out = gr.Video(label="Processed Video")
_text_out = gr.Textbox(label="Transcribed Speech")

iface = gr.Interface(
    fn=process_video,
    inputs=_video_in,
    outputs=[_video_out, _text_out],
    title="🎥 Real-time Face & Lip Detection + Whisper",
    description=(
        "Upload a video. The system detects faces and lips, starts "
        "transcription when the person speaks, and overlays subtitles "
        "on the video."
    ),
)

if __name__ == "__main__":
    # Bind to all interfaces on the conventional HF Spaces port.
    iface.launch(server_name="0.0.0.0", server_port=7860)