Spaces:

farid678
/

lipsDetection

Sleeping

App Files Files Community

farid678 committed on Oct 8, 2025

Commit

e59ca4a

verified ·

1 Parent(s): c79f994

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -26

app.py CHANGED Viewed

@@ -28,7 +28,7 @@ print("🎧 Loading Whisper model (large)...")
 model = whisper.load_model("large")
 # -----------------------------
-# تابع تشخیص حرکت لب
 # -----------------------------
 def lip_aspect_ratio(landmarks, w, h):
     top_lip = landmarks[13]
@@ -38,29 +38,34 @@ def lip_aspect_ratio(landmarks, w, h):
     return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
 # -----------------------------
-# تابع پردازش ویدئو و استخراج گفتار
 # -----------------------------
-def process_video(uploaded_video):
-    tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    tmp_video.write(uploaded_video.read())
-    tmp_video.close()
-    # استخراج صدا از ویدئو
     audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-    subprocess.call(['ffmpeg', '-y', '-i', tmp_video.name, '-q:a', '0', '-map', 'a', audio_path],
-                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-    # خواندن صدا
     audio, sr = sf.read(audio_path)
     audio = audio.T.flatten() if audio.ndim > 1 else audio
-    # تنظیم ویدئو برای پردازش چهره و لب
-    cap = cv2.VideoCapture(tmp_video.name)
     har_history = deque(maxlen=5)
     transcribed_text = ""
-    speaking_flag = False
-    print("🎥 Analyzing lip movement and detecting speech...")
     while cap.isOpened():
         ret, frame = cap.read()
@@ -69,6 +74,8 @@ def process_video(uploaded_video):
         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
         faces = face_cascade.detectMultiScale(gray, 1.3, 5)
         for (x, y, w, h) in faces:
             roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
             results = face_mesh.process(roi_rgb)
@@ -80,27 +87,31 @@ def process_video(uploaded_video):
                     har = lip_aspect_ratio(landmarks, w, h)
                     har_history.append(har)
                     avg_har = np.mean(har_history)
                     speaking_flag = avg_har > 0.3 or len(smiles) > 0
-        if speaking_flag:
-            print("🗣️ Detected speech — transcribing...")
-            result = model.transcribe(audio_path, fp16=False)
-            transcribed_text = result["text"]
-            break  # فقط یکبار پردازش برای سرعت بیشتر
     cap.release()
     return transcribed_text.strip()
 # -----------------------------
-# رابط Gradio (فقط خروجی متن)
 # -----------------------------
 iface = gr.Interface(
     fn=process_video,
-    inputs=gr.Video(label="Upload a video"),
     outputs=gr.Textbox(label="Transcribed Speech"),
-    title="🎙️ Whisper Large - Face & Lip Detection",
-    description="Upload a video. The system detects lips and transcribes speech using Whisper Large."
 )
 if __name__ == "__main__":

 model = whisper.load_model("large")
 # -----------------------------
+# تابع تشخیص حرکت لب و صحبت کردن
 # -----------------------------
 def lip_aspect_ratio(landmarks, w, h):
     top_lip = landmarks[13]
     return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
 # -----------------------------
+# تابع پردازش ویدئو
 # -----------------------------
+def process_video(video_path):
+    # Gradio ممکن است مسیر فایل بدهد (str) یا فایل باینری
+    if isinstance(video_path, str):
+        tmp_video_path = video_path
+    else:
+        tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+        tmp_video.write(video_path.read())
+        tmp_video.close()
+        tmp_video_path = tmp_video.name
+    # استخراج صدا
     audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
+    subprocess.call(
+        ['ffmpeg', '-y', '-i', tmp_video_path, '-q:a', '0', '-map', 'a', audio_path],
+        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+    )
+    # بارگذاری صوت
     audio, sr = sf.read(audio_path)
     audio = audio.T.flatten() if audio.ndim > 1 else audio
+    cap = cv2.VideoCapture(tmp_video_path)
     har_history = deque(maxlen=5)
+    audio_index = 0
     transcribed_text = ""
+    chunk_size = int(sr * 2)  # هر 2 ثانیه صدا
     while cap.isOpened():
         ret, frame = cap.read()
         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
         faces = face_cascade.detectMultiScale(gray, 1.3, 5)
+        speaking_flag = False
         for (x, y, w, h) in faces:
             roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
             results = face_mesh.process(roi_rgb)
                     har = lip_aspect_ratio(landmarks, w, h)
                     har_history.append(har)
                     avg_har = np.mean(har_history)
                     speaking_flag = avg_har > 0.3 or len(smiles) > 0
+        # فقط وقتی صحبت شروع شد، متن را استخراج کن
+        if speaking_flag and audio_index < len(audio):
+            chunk = audio[audio_index:audio_index+chunk_size]
+            if len(chunk) > 0:
+                # Whisper expects a file, so save chunk temporarily
+                tmp_chunk_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
+                sf.write(tmp_chunk_path, chunk, sr)
+                result = model.transcribe(tmp_chunk_path, fp16=False)
+                transcribed_text += result["text"] + " "
+            audio_index += chunk_size
     cap.release()
     return transcribed_text.strip()
 # -----------------------------
+# رابط Gradio
 # -----------------------------
 iface = gr.Interface(
     fn=process_video,
+    inputs=gr.Video(label="Upload Video"),
     outputs=gr.Textbox(label="Transcribed Speech"),
+    title="🎥 Whisper Large + Face & Lip Detection",
+    description="Upload a video. The system detects when a person is speaking and transcribes the speech to text."
 )
 if __name__ == "__main__":