farid678 committed on
Commit
eb46570
·
verified ·
1 Parent(s): 09572ab
Files changed (1) hide show
  1. app.py +83 -93
app.py CHANGED
@@ -5,7 +5,7 @@ import whisper
5
  import numpy as np
6
  import subprocess
7
  import os
8
- from PIL import Image, ImageDraw, ImageFont
9
 
10
  # -----------------------------
11
  # تنظیم MediaPipe
@@ -14,123 +14,113 @@ mp_face_mesh = mp.solutions.face_mesh
14
  face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
15
 
16
# -----------------------------
# Load the Whisper speech-to-text model
# -----------------------------
print("Loading Whisper model...")
model = whisper.load_model("small")  # or 'tiny' for more speed

# -----------------------------
# Unicode-capable font for subtitle rendering
# -----------------------------
font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"  # adjust this path for your system
font_size = 32
font = ImageFont.truetype(font_path, font_size)
28
-
29
- # -----------------------------
30
- # تابع حرکت لب
31
- # -----------------------------
32
def lip_aspect_ratio(landmarks, width, height):
    """Return a mouth-openness ratio for one detected face.

    The ratio is the vertical gap between the inner upper/lower lip divided
    by the horizontal distance between the mouth corners, both scaled to
    pixel units.

    Args:
        landmarks: MediaPipe FaceMesh landmark sequence (normalized 0..1
            coordinates, indexable by landmark id).
        width: frame width in pixels (scales x coordinates).
        height: frame height in pixels (scales y coordinates).

    Returns:
        float: openness ratio; 0.0 when the mouth corners coincide, which
        avoids a ZeroDivisionError on degenerate detections.
    """
    # MediaPipe FaceMesh indices: 13/14 = inner upper/lower lip,
    # 61/291 = left/right mouth corners.
    top = landmarks[13].y * height
    bottom = landmarks[14].y * height
    left = landmarks[61].x * width
    right = landmarks[291].x * width
    mouth_width = right - left
    if mouth_width == 0:
        return 0.0  # degenerate detection: corners collapsed to a point
    return (bottom - top) / mouth_width
42
-
43
- # -----------------------------
44
- # تابع پردازش ویدئو
45
- # -----------------------------
46
def process_video(video_file):
    """Transcribe the video's audio with Whisper and burn the text into the
    frames as subtitles, with a lip-landmark overlay on detected faces.

    Returns the path of the subtitled output MP4.
    """
    video_path = video_file
    cap = cv2.VideoCapture(video_path)

    fps = cap.get(cv2.CAP_PROP_FPS)
    width, height = int(cap.get(3)), int(cap.get(4))

    # Pull the audio track out with ffmpeg (console output suppressed).
    audio_path = "temp_audio.wav"
    subprocess.run(['ffmpeg', '-y', '-i', video_path, '-q:a', '0', '-map', 'a', audio_path],
                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # Speech-to-text with per-word timestamps.
    result = model.transcribe(audio_path, word_timestamps=True)
    segments = result['segments']

    # The temporary audio file is no longer needed.
    if os.path.exists(audio_path):
        os.remove(audio_path)

    # MP4 writer for the annotated output.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = "output_subtitle.mp4"
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    frame_no = 0
    subtitle = ""
    seg_idx = 0

    while cap.isOpened():
        ok, frame = cap.read()
        if not ok:
            break
        frame_no += 1
        timestamp = frame_no / fps

        # Pick the subtitle segment covering the current timestamp.
        if seg_idx < len(segments):
            seg = segments[seg_idx]
            if seg["start"] <= timestamp <= seg["end"]:
                subtitle = seg["text"]
            elif timestamp > seg["end"]:
                seg_idx += 1
                subtitle = ""

        results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Lip-landmark overlay plus a naive "speaking" indicator.
        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                landmarks = face_landmarks.landmark
                har = lip_aspect_ratio(landmarks, width, height)

                # MediaPipe FaceMesh lip indices 61..87.
                for idx in range(61, 88):
                    px = int(landmarks[idx].x * width)
                    py = int(landmarks[idx].y * height)
                    cv2.circle(frame, (px, py), 1, (0,255,0), -1)

                if har > 0.3:
                    cv2.putText(frame, "Speaking...", (50,50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)

        # Render the subtitle via PIL (handles Unicode / Persian text).
        if subtitle:
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            draw = ImageDraw.Draw(frame_pil)
            # NOTE(review): the alpha in (0,0,0,127) is ignored on an RGB
            # image, so this bar is drawn fully opaque.
            draw.rectangle([(0, height-80), (width, height)], fill=(0,0,0,127))
            draw.text((40, height-70), subtitle.strip(), font=font, fill=(255,255,255))
            frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

        out.write(frame)

    cap.release()
    out.release()

    return output_file
123
 
124
# -----------------------------
# Gradio interface
# -----------------------------
demo = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload your video"),
    outputs=gr.Video(label="Download processed video with subtitles"),
    title="👄 Lip Detection + Whisper Subtitle",
    description="Upload a video, detect face & lips, and add subtitles using Whisper (supports Unicode / Persian text)."
)

if __name__ == "__main__":
    demo.launch()
 
5
  import numpy as np
6
  import subprocess
7
  import os
8
+ from collections import deque
9
 
10
  # -----------------------------
11
  # تنظیم MediaPipe
 
14
  face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
15
 
16
# -----------------------------
# Load the Haar Cascade face-detection model (ships with OpenCV)
# -----------------------------
cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
face_cascade = cv2.CascadeClassifier(cascade_path)
21
 
22
  # -----------------------------
23
+ # تابع اصلی پردازش ویدیو
24
  # -----------------------------
25
def _lip_aspect_ratio(landmarks, roi_w, roi_h):
    """Return mouth openness: inner-lip vertical gap over mouth width,
    scaled to the face-ROI pixel size. Returns 0.0 for degenerate
    detections (coincident mouth corners) instead of dividing by zero."""
    # MediaPipe FaceMesh indices: 13/14 = inner upper/lower lip,
    # 61/291 = left/right mouth corners.
    top = landmarks[13].y * roi_h
    bottom = landmarks[14].y * roi_h
    left = landmarks[61].x * roi_w
    right = landmarks[291].x * roi_w
    mouth_width = right - left
    if mouth_width == 0:
        return 0.0
    return (bottom - top) / mouth_width


# -----------------------------
# Main video-processing function
# -----------------------------
def process_video(video_path):
    """Detect faces/lips in the uploaded video (writing an annotated MP4 to
    output_faces.mp4 as a side effect) and return the Whisper transcript.

    Args:
        video_path: path to the uploaded video. Current Gradio `gr.Video`
            passes a filepath string; a file-like object with `.read()` is
            also accepted for backward compatibility.

    Returns:
        str: the transcribed speech text.
    """
    import shutil  # local import: only needed for the path-based branch

    # Stage the upload under a fixed local name.
    input_video = "input_video.mp4"
    if hasattr(video_path, "read"):
        with open(input_video, "wb") as f:
            f.write(video_path.read())
    else:
        # gr.Video hands us a filepath string, not a file object.
        shutil.copyfile(str(video_path), input_video)

    # Extract the audio track with ffmpeg (best effort, output suppressed).
    audio_path = "audio.wav"
    subprocess.call(
        ['ffmpeg', '-y', '-i', input_video, '-q:a', '0', '-map', 'a', audio_path],
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
    )

    # Load Whisper once and cache it on the function: reloading the medium
    # model on every request is very slow and memory-hungry.
    if not hasattr(process_video, "_model"):
        print("Loading Whisper model (medium)...")
        process_video._model = whisper.load_model("medium")
    model = process_video._model

    # Speech-to-text.
    print("Transcribing...")
    result = model.transcribe(audio_path)
    text_output = result["text"]

    # Remove the temporary audio file.
    if os.path.exists(audio_path):
        os.remove(audio_path)

    # -----------------------------
    # Face + lip detection over all frames
    # -----------------------------
    cap = cv2.VideoCapture(input_video)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # guard: 0 fps would break the writer
    width, height = int(cap.get(3)), int(cap.get(4))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = "output_faces.mp4"
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    har_history = deque(maxlen=5)  # smooth the lip ratio over recent frames

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)

        for (x, y, w, h) in faces:
            # Box around the detected face.
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)

            # Run MediaPipe only on the face ROI; its landmarks are then
            # normalized to the ROI, so scale by (w, h) and offset by (x, y).
            roi_rgb = cv2.cvtColor(frame[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
            results = face_mesh.process(roi_rgb)
            if not results.multi_face_landmarks:
                continue

            for face_landmarks in results.multi_face_landmarks:
                landmarks = face_landmarks.landmark

                har_history.append(_lip_aspect_ratio(landmarks, w, h))
                avg_har = np.mean(har_history)

                # Draw the lip landmarks (MediaPipe FaceMesh indices 61..87).
                for i in range(61, 88):
                    x_lip = int(x + landmarks[i].x * w)
                    y_lip = int(y + landmarks[i].y * h)
                    cv2.circle(frame, (x_lip, y_lip), 1, (0, 255, 0), -1)

                # Heuristic speaking indicator; 0.3 threshold — TODO tune.
                if avg_har > 0.3:
                    cv2.putText(frame, "Speaking...", (x, y - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        out.write(frame)

    cap.release()
    out.release()

    return text_output
113
 
114
# -----------------------------
# Gradio interface
# -----------------------------
interface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload a Video"),
    outputs=gr.Textbox(label="Detected Speech (Whisper Medium)"),
    title="🎥 Face & Lip Detection with HaarCascade + Whisper",
    description="Upload a video. The system detects faces and lips using HaarCascade + MediaPipe, and transcribes speech using Whisper Medium model."
)

if __name__ == "__main__":
    # Explicit host/port so the app is reachable inside a container.
    interface.launch(server_name="0.0.0.0", server_port=7860)