Spaces:

farid678
/

lipsDetection

Running

App Files Files Community

farid678 committed on Oct 7, 2025

Commit

d0e9cf7

verified ·

1 Parent(s): e5da71d

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -71

app.py CHANGED Viewed

@@ -5,8 +5,9 @@ import whisper
 import numpy as np
 import subprocess
 import os
-import shutil
 from collections import deque
 # -----------------------------
 # تنظیم MediaPipe
@@ -15,117 +16,120 @@ mp_face_mesh = mp.solutions.face_mesh
 face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
 # -----------------------------
-# بارگذاری مدل تشخیص چهره Haar Cascade
 # -----------------------------
-cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
-face_cascade = cv2.CascadeClassifier(cascade_path)
 # -----------------------------
-# تابع اصلی پردازش ویدیو
 # -----------------------------
-def process_video(video_path):
-    # اطمینان از مسیر ورودی
-    if not os.path.exists(video_path):
-        return "❌ Video file not found."
-    # ذخیره ویدیو آپلودشده
-    input_video = "input_video.mp4"
-    shutil.copy(video_path, input_video)
-    # استخراج صدا با ffmpeg
-    audio_path = "audio.wav"
-    subprocess.call(
-        ['ffmpeg', '-y', '-i', input_video, '-q:a', '0', '-map', 'a', audio_path],
-        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
-    )
-    # بارگذاری مدل Whisper
-    print("Loading Whisper model (medium)...")
-    model = whisper.load_model("medium")
-    # تبدیل گفتار به متن
-    print("Transcribing audio...")
-    result = model.transcribe(audio_path)
-    text_output = result["text"]
-    # -----------------------------
-    # تشخیص چهره + لب در فریم‌ها
-    # -----------------------------
-    cap = cv2.VideoCapture(input_video)
     fps = cap.get(cv2.CAP_PROP_FPS)
     width, height = int(cap.get(3)), int(cap.get(4))
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    output_file = "output_faces.mp4"
     out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
     har_history = deque(maxlen=5)
     while cap.isOpened():
         ret, frame = cap.read()
-        if not ret:
-            break
         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
         faces = face_cascade.detectMultiScale(gray, 1.3, 5)
         for (x, y, w, h) in faces:
-            # رسم کادر دور چهره
             cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
-            # فقط روی ناحیه چهره MediaPipe اعمال شود
-            roi_rgb = cv2.cvtColor(frame[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
             results = face_mesh.process(roi_rgb)
             if results.multi_face_landmarks:
                 for face_landmarks in results.multi_face_landmarks:
                     landmarks = face_landmarks.landmark
-                    # محاسبه نسبت حرکت لب
-                    def lip_aspect_ratio():
-                        top_lip = landmarks[13]
-                        bottom_lip = landmarks[14]
-                        left_corner = landmarks[61]
-                        right_corner = landmarks[291]
-                        top = top_lip.y * h
-                        bottom = bottom_lip.y * h
-                        left = left_corner.x * w
-                        right = right_corner.x * w
-                        return (bottom - top) / (right - left)
-                    har = lip_aspect_ratio()
                     har_history.append(har)
                     avg_har = np.mean(har_history)
                     # رسم نقاط لب
-                    lips_indices = list(range(61, 88))
-                    for i in lips_indices:
                         x_lip = int(x + landmarks[i].x * w)
                         y_lip = int(y + landmarks[i].y * h)
-                        cv2.circle(frame, (x_lip, y_lip), 1, (0, 255, 0), -1)
-                    # تشخیص صحبت
-                    if avg_har > 0.3:
-                        cv2.putText(frame, "Speaking...", (x, y - 10),
-                                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
         out.write(frame)
     cap.release()
     out.release()
-    print("✅ Processing done! Output video saved as:", output_file)
-    return text_output
 # -----------------------------
 # رابط Gradio
 # -----------------------------
-interface = gr.Interface(
     fn=process_video,
-    inputs=gr.Video(label="Upload a Video"),
-    outputs=gr.Textbox(label="Detected Speech (Whisper Medium)"),
-    title="🎥 Face & Lip Detection (HaarCascade + MediaPipe + Whisper)",
-    description="Uploads a video, detects faces and lips using HaarCascade + MediaPipe, and transcribes speech using Whisper Medium model."
 )
 if __name__ == "__main__":
-    interface.launch(server_name="0.0.0.0", server_port=7860)

 import numpy as np
 import subprocess
 import os
 from collections import deque
+import tempfile
+import soundfile as sf
 # -----------------------------
 # تنظیم MediaPipe
 face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
 # -----------------------------
+# بارگذاری مدل‌های Haar Cascade
 # -----------------------------
+face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
+smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
 # -----------------------------
+# بارگذاری مدل Whisper Medium
 # -----------------------------
+model = whisper.load_model("medium")
# -----------------------------
# Lip-opening ratio used to detect speaking
# -----------------------------
def lip_aspect_ratio(landmarks, w, h):
    """Return the mouth's height-to-width ratio measured in pixels.

    Args:
        landmarks: Indexable collection of landmarks with ``x`` and ``y``
            attributes expressed as fractions of the face crop.  Indices
            13/14 are used as the top/bottom lip and 61/291 as the left/right
            mouth corners.
        w: Width of the face crop in pixels.
        h: Height of the face crop in pixels.

    Returns:
        ``(bottom - top) * h / ((right - left) * w)``, or ``0.0`` when the
        mouth width is zero so a degenerate detection cannot raise
        ZeroDivisionError.
    """
    top_lip = landmarks[13]
    bottom_lip = landmarks[14]
    left_corner = landmarks[61]
    right_corner = landmarks[291]
    # Scale by the crop size: the landmark fractions alone distort the ratio
    # whenever the crop is not square.
    mouth_height = (bottom_lip.y - top_lip.y) * h
    mouth_width = (right_corner.x - left_corner.x) * w
    if mouth_width == 0:
        return 0.0
    return mouth_height / mouth_width
# -----------------------------
# Video processing
# -----------------------------
_WHISPER_SAMPLE_RATE = 16000  # sample rate requested from ffmpeg for raw-sample transcription
_CHUNK_SECONDS = 2  # length of each audio window sent to Whisper


def _extract_audio(video_path):
    """Decode the audio track of *video_path* into mono float32 samples.

    Returns ``(samples, sample_rate)``.  If ffmpeg exits with an error (for
    example the video has no audio track) an empty array is returned so the
    caller can still produce the annotated video.
    """
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        status = subprocess.call(
            ['ffmpeg', '-y', '-i', video_path, '-vn', '-ac', '1',
             '-ar', str(_WHISPER_SAMPLE_RATE), audio_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        usable = (status == 0 and os.path.exists(audio_path)
                  and os.path.getsize(audio_path) > 0)
        if not usable:
            return np.zeros(0, dtype=np.float32), _WHISPER_SAMPLE_RATE
        samples, sample_rate = sf.read(audio_path, dtype="float32")
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)
    if samples.ndim > 1:
        # Mix channels down; flattening would play them one after another.
        samples = samples.mean(axis=1)
    return samples, sample_rate


def _annotate_faces(frame, gray, har_history):
    """Draw face boxes and lip points on *frame*; return True if anyone speaks.

    A face counts as speaking when the smoothed lip ratio exceeds 0.3 or the
    smile cascade fires inside the face region.
    """
    speaking = False
    for (x, y, w, h) in face_cascade.detectMultiScale(gray, 1.3, 5):
        cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
        roi_rgb = cv2.cvtColor(frame[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
        results = face_mesh.process(roi_rgb)
        smiles = smile_cascade.detectMultiScale(
            gray[y:y + h, x:x + w], scaleFactor=1.7, minNeighbors=20)
        if not results.multi_face_landmarks:
            continue
        for face_landmarks in results.multi_face_landmarks:
            landmarks = face_landmarks.landmark
            har_history.append(lip_aspect_ratio(landmarks, w, h))
            avg_har = np.mean(har_history)
            face_speaking = avg_har > 0.3 or len(smiles) > 0
            # Accumulate with OR so a later silent face cannot clear the flag.
            speaking = speaking or face_speaking
            # Draw the lip landmarks (crop-relative -> frame coordinates).
            for i in range(61, 88):
                x_lip = int(x + landmarks[i].x * w)
                y_lip = int(y + landmarks[i].y * h)
                cv2.circle(frame, (x_lip, y_lip), 1, (0, 255, 0), -1)
            if face_speaking:
                cv2.putText(frame, "Speaking...", (x, y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
    return speaking


def process_video(uploaded_video):
    """Annotate faces/lips in a video and subtitle it with transcribed speech.

    Audio is split into 2-second windows aligned with the video timeline.  A
    window is transcribed once the video reaches its end, and only if speech
    was detected in at least one frame of that window; its text is then shown
    as the subtitle on the following frames.

    Args:
        uploaded_video: Path of the uploaded video (what Gradio's Video input
            normally passes), or a binary file-like object with ``read()``.

    Returns:
        ``(output_file, transcribed_text)``: path of the annotated MP4 and
        the concatenated transcript.

    Raises:
        ValueError: If no video was supplied.
    """
    if uploaded_video is None:
        raise ValueError("No video was uploaded.")

    # Only file-like uploads need to be spooled to disk; paths are used as-is.
    owns_input = hasattr(uploaded_video, "read")
    if owns_input:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
            tmp_video.write(uploaded_video.read())
        video_path = tmp_video.name
    else:
        video_path = os.fspath(uploaded_video)

    audio, sr = _extract_audio(video_path)
    chunk_size = int(sr * _CHUNK_SECONDS)

    # Prepare the output video.
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # some containers report 0 fps
    width, height = int(cap.get(3)), int(cap.get(4))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    fd, output_file = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    har_history = deque(maxlen=5)
    segments = []  # transcript pieces, joined once at the end
    subtitle = ""  # latest piece; the full transcript would overflow the frame
    audio_index = 0  # first sample of the current audio window
    frame_count = 0
    window_speaking = False
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            if _annotate_faces(frame, gray, har_history):
                window_speaking = True

            # Audio sample reached by the end of this frame.
            audio_pos = int(frame_count / fps * sr)
            if audio_index < len(audio) and audio_pos >= audio_index + chunk_size:
                if window_speaking:
                    chunk = audio[audio_index:audio_index + chunk_size]
                    text = model.transcribe(chunk, fp16=False)["text"]
                    if text.strip():
                        segments.append(text)
                        subtitle = text.strip()
                audio_index += chunk_size
                window_speaking = False

            # Overlay the subtitle bar.
            if subtitle:
                cv2.rectangle(frame, (0, height - 50), (width, height), (0, 0, 0), -1)
                cv2.putText(frame, subtitle, (10, height - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
            out.write(frame)

        # Transcribe a final partial window the video ended inside of.
        if window_speaking and audio_index < len(audio):
            chunk = audio[audio_index:audio_index + chunk_size]
            text = model.transcribe(chunk, fp16=False)["text"]
            if text.strip():
                segments.append(text)
    finally:
        cap.release()
        out.release()
        if owns_input and os.path.exists(video_path):
            os.remove(video_path)
    return output_file, " ".join(segments).strip()
 # -----------------------------
 # رابط Gradio
 # -----------------------------
+iface = gr.Interface(
     fn=process_video,
+    inputs=gr.Video(label="Upload Video"),
+    outputs=[gr.Video(label="Processed Video"), gr.Textbox(label="Transcribed Speech")],
+    title="🎥 Real-time Face & Lip Detection + Whisper",
+    description="Upload a video. The system detects faces and lips, starts transcription when the person speaks, and overlays subtitles on the video."
 )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)