Spaces:

farid678
/

lipsDetection

Sleeping

App Files Files Community

farid678 committed on Oct 7, 2025

Commit

98e8f5d

verified ·

1 Parent(s): 37a1d33

update

Browse files

Files changed (1) hide show

app.py +56 -41

app.py CHANGED Viewed

@@ -3,18 +3,31 @@ import cv2
 import mediapipe as mp
 import whisper
 import numpy as np
-import ffmpeg
 import os
 # -----------------------------
-# تنظیمات MediaPipe و Whisper
 # -----------------------------
 mp_face_mesh = mp.solutions.face_mesh
 face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
-model = whisper.load_model("small")
 # -----------------------------
-# تابع محاسبه حرکت لب
 # -----------------------------
 def lip_aspect_ratio(landmarks, width, height):
     top_lip = landmarks[13]
@@ -28,39 +41,35 @@ def lip_aspect_ratio(landmarks, width, height):
     return (bottom - top) / (right - left)
 # -----------------------------
-# تابع اصلی پردازش ویدئو
 # -----------------------------
 def process_video(video_file):
-    input_path = video_file
-    output_audio = "audio.wav"
-    output_video = "output.mp4"
-    # استخراج صدا از ویدئو
-    try:
-        (
-            ffmpeg
-            .input(input_path)
-            .output(output_audio, format='wav', acodec='pcm_s16le', ac=1, ar='16k')
-            .overwrite_output()
-            .run(quiet=True)
-        )
-    except Exception as e:
-        return f"❌ Error extracting audio: {e}"
     # تبدیل صدا به متن
-    result = model.transcribe(output_audio, word_timestamps=True)
-    segments = result["segments"]
-    # آماده‌سازی ویدئو
-    cap = cv2.VideoCapture(input_path)
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    width, height = int(cap.get(3)), int(cap.get(4))
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))
     frame_index = 0
-    segment_index = 0
     current_sub = ""
     while cap.isOpened():
         ret, frame = cap.read()
@@ -69,7 +78,7 @@ def process_video(video_file):
         frame_index += 1
         time_sec = frame_index / fps
-        # بررسی بازه زیرنویس
         if segment_index < len(segments):
             seg = segments[segment_index]
             if time_sec >= seg["start"] and time_sec <= seg["end"]:
@@ -78,33 +87,39 @@ def process_video(video_file):
                 segment_index += 1
                 current_sub = ""
-        # تشخیص چهره و لب‌ها
-        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        results = face_mesh.process(rgb)
         if results.multi_face_landmarks:
             for face_landmarks in results.multi_face_landmarks:
                 landmarks = face_landmarks.landmark
                 har = lip_aspect_ratio(landmarks, width, height)
                 lips_indices = list(range(61, 88))
                 for i in lips_indices:
                     x = int(landmarks[i].x * width)
                     y = int(landmarks[i].y * height)
                     cv2.circle(frame, (x, y), 1, (0,255,0), -1)
                 if har > 0.3:
-                    cv2.putText(frame, "Speaking...", (50,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
-        # نمایش زیرنویس
         if current_sub:
-            cv2.rectangle(frame, (0, height-80), (width, height), (0,0,0), -1)
-            cv2.putText(frame, current_sub.strip(), (40, height-30),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255,255,255), 2, cv2.LINE_AA)
         out.write(frame)
     cap.release()
     out.release()
-    return output_video
 # -----------------------------
 # رابط Gradio
@@ -112,9 +127,9 @@ def process_video(video_file):
 demo = gr.Interface(
     fn=process_video,
     inputs=gr.Video(label="Upload your video"),
-    outputs=gr.Video(label="Processed video with lips + subtitles"),
-    title="👄 Lip Detection & Whisper Subtitle",
-    description="Upload a short video. The app detects face & lips, then uses Whisper to generate speech subtitles.",
 )
 if __name__ == "__main__":

 import mediapipe as mp
 import whisper
 import numpy as np
+import subprocess
 import os
+from PIL import Image, ImageDraw, ImageFont
 # -----------------------------
+# تنظیم MediaPipe
 # -----------------------------
 mp_face_mesh = mp.solutions.face_mesh
 face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
 # -----------------------------
+# بارگذاری مدل Whisper
+# -----------------------------
+print("Loading Whisper model...")
+model = whisper.load_model("small")  # یا 'tiny' برای سرعت بیشتر
+# -----------------------------
+# فونت برای زیرنویس یونیکد
+# -----------------------------
+font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"  # مسیر فونت را طبق سیستم خودت تنظیم کن
+font_size = 32
+font = ImageFont.truetype(font_path, font_size)
+# -----------------------------
+# تابع حرکت لب
 # -----------------------------
 def lip_aspect_ratio(landmarks, width, height):
     top_lip = landmarks[13]
     return (bottom - top) / (right - left)
 # -----------------------------
+# تابع پردازش ویدئو
 # -----------------------------
 def process_video(video_file):
+    video_path = video_file
+    cap = cv2.VideoCapture(video_path)
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    width, height = int(cap.get(3)), int(cap.get(4))
+    # استخراج صوت
+    audio_path = "temp_audio.wav"
+    subprocess.run(['ffmpeg', '-y', '-i', video_path, '-q:a', '0', '-map', 'a', audio_path],
+                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     # تبدیل صدا به متن
+    result = model.transcribe(audio_path, word_timestamps=True)
+    segments = result['segments']
+    # حذف فایل موقت صوتی
+    if os.path.exists(audio_path):
+        os.remove(audio_path)
+    # خروجی ویدئو MP4
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    output_file = "output_subtitle.mp4"
+    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
     frame_index = 0
     current_sub = ""
+    segment_index = 0
     while cap.isOpened():
         ret, frame = cap.read()
         frame_index += 1
         time_sec = frame_index / fps
+        # زیرنویس
         if segment_index < len(segments):
             seg = segments[segment_index]
             if time_sec >= seg["start"] and time_sec <= seg["end"]:
                 segment_index += 1
                 current_sub = ""
+        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        results = face_mesh.process(rgb_frame)
+        # رسم لب‌ها
         if results.multi_face_landmarks:
             for face_landmarks in results.multi_face_landmarks:
                 landmarks = face_landmarks.landmark
                 har = lip_aspect_ratio(landmarks, width, height)
                 lips_indices = list(range(61, 88))
                 for i in lips_indices:
                     x = int(landmarks[i].x * width)
                     y = int(landmarks[i].y * height)
                     cv2.circle(frame, (x, y), 1, (0,255,0), -1)
                 if har > 0.3:
+                    cv2.putText(frame, "Speaking...", (50,50),
+                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
+        # اضافه کردن زیرنویس با PIL
         if current_sub:
+            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            draw = ImageDraw.Draw(frame_pil)
+            draw.rectangle([(0, height-80), (width, height)], fill=(0,0,0,127))
+            draw.text((40, height-70), current_sub.strip(), font=font, fill=(255,255,255))
+            frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)
         out.write(frame)
     cap.release()
     out.release()
+    return output_file
 # -----------------------------
 # رابط Gradio
 demo = gr.Interface(
     fn=process_video,
     inputs=gr.Video(label="Upload your video"),
+    outputs=gr.Video(label="Download processed video with subtitles"),
+    title="👄 Lip Detection + Whisper Subtitle",
+    description="Upload a video, detect face & lips, and add subtitles using Whisper (supports Unicode / Persian text)."
 )
 if __name__ == "__main__":