Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,8 +5,9 @@ import whisper
|
|
| 5 |
import numpy as np
|
| 6 |
import subprocess
|
| 7 |
import os
|
| 8 |
-
import shutil
|
| 9 |
from collections import deque
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# -----------------------------
|
| 12 |
# تنظیم MediaPipe
|
|
@@ -15,117 +16,120 @@ mp_face_mesh = mp.solutions.face_mesh
|
|
| 15 |
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
|
| 16 |
|
| 17 |
# -----------------------------
|
| 18 |
-
# بارگذاری
|
| 19 |
# -----------------------------
|
| 20 |
-
|
| 21 |
-
|
| 22 |
|
| 23 |
# -----------------------------
|
| 24 |
-
#
|
| 25 |
# -----------------------------
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
#
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
| 55 |
fps = cap.get(cv2.CAP_PROP_FPS)
|
| 56 |
width, height = int(cap.get(3)), int(cap.get(4))
|
| 57 |
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
| 58 |
-
output_file = "
|
| 59 |
out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
|
| 60 |
|
| 61 |
har_history = deque(maxlen=5)
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
while cap.isOpened():
|
| 64 |
ret, frame = cap.read()
|
| 65 |
-
if not ret:
|
| 66 |
-
break
|
| 67 |
|
| 68 |
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
|
| 69 |
faces = face_cascade.detectMultiScale(gray, 1.3, 5)
|
| 70 |
|
|
|
|
|
|
|
| 71 |
for (x, y, w, h) in faces:
|
| 72 |
-
# رسم کادر دور چهره
|
| 73 |
cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
|
| 74 |
-
|
| 75 |
-
# فقط روی ناحیه چهره MediaPipe اعمال شود
|
| 76 |
-
roi_rgb = cv2.cvtColor(frame[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
|
| 77 |
results = face_mesh.process(roi_rgb)
|
|
|
|
| 78 |
|
| 79 |
if results.multi_face_landmarks:
|
| 80 |
for face_landmarks in results.multi_face_landmarks:
|
| 81 |
landmarks = face_landmarks.landmark
|
| 82 |
-
|
| 83 |
-
# محاسبه نسبت حرکت لب
|
| 84 |
-
def lip_aspect_ratio():
|
| 85 |
-
top_lip = landmarks[13]
|
| 86 |
-
bottom_lip = landmarks[14]
|
| 87 |
-
left_corner = landmarks[61]
|
| 88 |
-
right_corner = landmarks[291]
|
| 89 |
-
top = top_lip.y * h
|
| 90 |
-
bottom = bottom_lip.y * h
|
| 91 |
-
left = left_corner.x * w
|
| 92 |
-
right = right_corner.x * w
|
| 93 |
-
return (bottom - top) / (right - left)
|
| 94 |
-
|
| 95 |
-
har = lip_aspect_ratio()
|
| 96 |
har_history.append(har)
|
| 97 |
avg_har = np.mean(har_history)
|
|
|
|
| 98 |
|
| 99 |
# رسم نقاط لب
|
| 100 |
-
|
| 101 |
-
for i in lips_indices:
|
| 102 |
x_lip = int(x + landmarks[i].x * w)
|
| 103 |
y_lip = int(y + landmarks[i].y * h)
|
| 104 |
-
cv2.circle(frame, (x_lip, y_lip), 1, (0,
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
out.write(frame)
|
| 112 |
|
| 113 |
cap.release()
|
| 114 |
out.release()
|
| 115 |
|
| 116 |
-
|
| 117 |
-
return text_output
|
| 118 |
|
| 119 |
# -----------------------------
|
| 120 |
# رابط Gradio
|
| 121 |
# -----------------------------
|
| 122 |
-
|
| 123 |
fn=process_video,
|
| 124 |
-
inputs=gr.Video(label="Upload
|
| 125 |
-
outputs=gr.
|
| 126 |
-
title="🎥 Face & Lip Detection
|
| 127 |
-
description="
|
| 128 |
)
|
| 129 |
|
| 130 |
if __name__ == "__main__":
|
| 131 |
-
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import subprocess
|
| 7 |
import os
|
|
|
|
| 8 |
from collections import deque
|
| 9 |
+
import tempfile
|
| 10 |
+
import soundfile as sf
|
| 11 |
|
| 12 |
# -----------------------------
|
| 13 |
# تنظیم MediaPipe
|
|
|
|
| 16 |
face_mesh = mp_face_mesh.FaceMesh(max_num_faces=1, static_image_mode=False)

# -----------------------------
# Load the Haar cascade models (frontal face + smile) bundled with OpenCV
# -----------------------------
face_cascade = cv2.CascadeClassifier(
    f"{cv2.data.haarcascades}haarcascade_frontalface_default.xml"
)
smile_cascade = cv2.CascadeClassifier(
    f"{cv2.data.haarcascades}haarcascade_smile.xml"
)

# -----------------------------
# Load the Whisper "medium" speech-recognition model
# -----------------------------
model = whisper.load_model("medium")
|
| 28 |
+
|
| 29 |
+
# -----------------------------
|
| 30 |
+
# تابع تشخیص حرکت لب و صحبت کردن
|
| 31 |
+
# -----------------------------
|
| 32 |
+
def lip_aspect_ratio(landmarks, w, h):
    """Return the mouth-opening ratio (lip height / mouth width) in pixels.

    Args:
        landmarks: Indexable collection of points exposing ``.x`` and ``.y``
            attributes; indices 13, 14, 61 and 291 are read (inner top lip,
            inner bottom lip, left and right mouth corners as used by the
            caller's face-mesh output).
        w: Width in pixels of the region the landmarks are normalised to.
        h: Height in pixels of that region.

    Returns:
        float: ``(bottom - top) * h / ((right - left) * w)``, or ``0.0`` when
        the mouth width is zero so callers never hit a division by zero.
    """
    top_lip = landmarks[13]
    bottom_lip = landmarks[14]
    left_corner = landmarks[61]
    right_corner = landmarks[291]

    # Landmark coordinates are relative to the region, so scale each axis by
    # the region size; otherwise a non-square region distorts the ratio.
    mouth_height = (bottom_lip.y - top_lip.y) * h
    mouth_width = (right_corner.x - left_corner.x) * w

    if mouth_width == 0:
        # Degenerate detection (corners coincide or zero-width region).
        return 0.0
    return mouth_height / mouth_width
|
| 38 |
+
|
| 39 |
+
# -----------------------------
|
| 40 |
+
# تابع پردازش ویدئو
|
| 41 |
+
# -----------------------------
|
| 42 |
+
def _remove_quietly(path):
    """Delete *path*, ignoring the case where it is already gone or locked."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup of a temporary file; nothing useful to do here.
        pass


def process_video(uploaded_video):
    """Annotate faces and lips in a video and transcribe speech with Whisper.

    Args:
        uploaded_video: Path of the uploaded video (what ``gr.Video`` passes
            to the callback), or a binary file-like object exposing
            ``read()``.

    Returns:
        tuple: ``(output_file, transcribed_text)`` where ``output_file`` is
        the path of the annotated ``.mp4`` and ``transcribed_text`` is the
        accumulated transcription (empty when nothing was transcribed).

    Raises:
        ValueError: If ``uploaded_video`` is ``None``.
    """
    if uploaded_video is None:
        raise ValueError("No video was provided.")

    temp_paths = []  # intermediate files removed once processing finishes

    # gr.Video hands the callback a file path; file-like objects are still
    # accepted and spooled to disk so OpenCV and ffmpeg can open them.
    if hasattr(uploaded_video, "read"):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
            tmp_video.write(uploaded_video.read())
        video_path = tmp_video.name
        temp_paths.append(video_path)
    else:
        video_path = os.fspath(uploaded_video)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
        audio_path = tmp_audio.name
    temp_paths.append(audio_path)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_out:
        output_file = tmp_out.name

    cap = None
    out = None
    try:
        # Extract the audio track as 16 kHz mono, the format Whisper expects
        # when it is given a raw waveform instead of a file path.
        sr = 16000
        status = subprocess.call(
            ['ffmpeg', '-y', '-i', video_path, '-vn', '-ac', '1', '-ar', str(sr), audio_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        if status == 0 and os.path.getsize(audio_path) > 0:
            audio, sr = sf.read(audio_path, dtype="float32")
            if audio.ndim > 1:
                # Down-mix instead of concatenating the channels end to end.
                audio = audio.mean(axis=1)
        else:
            # No audio track (or ffmpeg failed): annotate the video only.
            audio = np.zeros(0, dtype=np.float32)

        # Prepare the output video
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Some containers report 0/NaN; fall back to a sane default.
            fps = 25.0
        width, height = int(cap.get(3)), int(cap.get(4))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

        har_history = deque(maxlen=5)
        audio_index = 0
        transcribed_text = ""
        chunk_size = int(sr * 2)  # audio is processed in 2-second chunks
        speech_in_chunk = False   # was speech seen during the current chunk?
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, 1.3, 5)

            speaking_flag = False

            for (x, y, w, h) in faces:
                face_speaking = False  # per-face, so one face cannot label another
                cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
                roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
                results = face_mesh.process(roi_rgb)
                smiles = smile_cascade.detectMultiScale(
                    gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)

                if results.multi_face_landmarks:
                    for face_landmarks in results.multi_face_landmarks:
                        landmarks = face_landmarks.landmark
                        har = lip_aspect_ratio(landmarks, w, h)
                        har_history.append(har)
                        avg_har = np.mean(har_history)
                        face_speaking = avg_har > 0.3 or len(smiles) > 0

                        # Draw the lip points
                        for i in range(61, 88):
                            x_lip = int(x + landmarks[i].x * w)
                            y_lip = int(y + landmarks[i].y * h)
                            cv2.circle(frame, (x_lip, y_lip), 1, (0, 255, 0), -1)

                if face_speaking:
                    cv2.putText(frame, "Speaking...", (x, y-10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
                speaking_flag = speaking_flag or face_speaking

            speech_in_chunk = speech_in_chunk or speaking_flag

            # Keep the audio cursor in step with the video clock: a chunk is
            # handled once the frames covering it have all been read, and is
            # transcribed only if speech was detected while it played.
            audio_position = int(frame_count / fps * sr)
            while audio_index < len(audio) and audio_position >= audio_index + chunk_size:
                if speech_in_chunk:
                    chunk = audio[audio_index:audio_index+chunk_size]
                    result = model.transcribe(chunk, fp16=False)
                    transcribed_text += result["text"] + " "
                audio_index += chunk_size
                speech_in_chunk = False

            # Overlay the subtitle
            if transcribed_text.strip():
                cv2.rectangle(frame, (0, height-50), (width, height), (0, 0, 0), -1)
                cv2.putText(frame, transcribed_text.strip(), (10, height-10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

            out.write(frame)

        # Transcribe a trailing partial chunk so its speech is not lost from
        # the returned text (it can no longer be drawn onto a frame).
        if speech_in_chunk and audio_index < len(audio):
            chunk = audio[audio_index:audio_index+chunk_size]
            result = model.transcribe(chunk, fp16=False)
            transcribed_text += result["text"] + " "
    finally:
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
        for path in temp_paths:
            _remove_quietly(path)

    return output_file, transcribed_text.strip()
|
|
|
|
| 122 |
|
| 123 |
# -----------------------------
|
| 124 |
# رابط Gradio
|
| 125 |
# -----------------------------
|
| 126 |
+
# Gradio UI: one video in, the annotated video plus the transcription out.
iface = gr.Interface(
    process_video,
    gr.Video(label="Upload Video"),
    [
        gr.Video(label="Processed Video"),
        gr.Textbox(label="Transcribed Speech"),
    ],
    title="🎥 Real-time Face & Lip Detection + Whisper",
    description=(
        "Upload a video. The system detects faces and lips, "
        "starts transcription when the person speaks, "
        "and overlays subtitles on the video."
    ),
)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    iface.launch(server_port=7860, server_name="0.0.0.0")
|