farid678 committed on
Commit
c79f994
·
verified ·
1 Parent(s): d0e9cf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -53
app.py CHANGED
@@ -22,12 +22,13 @@ face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_fronta
22
  smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
23
 
24
  # -----------------------------
25
- # بارگذاری مدل Whisper Medium
26
  # -----------------------------
27
- model = whisper.load_model("medium")
 
28
 
29
  # -----------------------------
30
- # تابع تشخیص حرکت لب و صحبت کردن
31
  # -----------------------------
32
  def lip_aspect_ratio(landmarks, w, h):
33
  top_lip = landmarks[13]
@@ -37,46 +38,38 @@ def lip_aspect_ratio(landmarks, w, h):
37
  return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
38
 
39
  # -----------------------------
40
- # تابع پردازش ویدئو
41
  # -----------------------------
42
  def process_video(uploaded_video):
43
- # ذخیره ویدئو آپلود شده
44
  tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
45
  tmp_video.write(uploaded_video.read())
46
  tmp_video.close()
47
 
48
- # استخراج صدا
49
  audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
50
  subprocess.call(['ffmpeg', '-y', '-i', tmp_video.name, '-q:a', '0', '-map', 'a', audio_path],
51
  stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
52
 
 
53
  audio, sr = sf.read(audio_path)
54
  audio = audio.T.flatten() if audio.ndim > 1 else audio
55
 
56
- # آماده‌سازی ویدئو خروجی
57
  cap = cv2.VideoCapture(tmp_video.name)
58
- fps = cap.get(cv2.CAP_PROP_FPS)
59
- width, height = int(cap.get(3)), int(cap.get(4))
60
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
61
- output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
62
- out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
63
-
64
  har_history = deque(maxlen=5)
65
- audio_index = 0
66
  transcribed_text = ""
67
- chunk_size = int(sr * 2) # پردازش هر 2 ثانیه صدا
 
 
68
 
69
  while cap.isOpened():
70
  ret, frame = cap.read()
71
- if not ret: break
 
72
 
73
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
74
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)
75
-
76
- speaking_flag = False
77
-
78
  for (x, y, w, h) in faces:
79
- cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
80
  roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
81
  results = face_mesh.process(roi_rgb)
82
  smiles = smile_cascade.detectMultiScale(gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)
@@ -87,48 +80,27 @@ def process_video(uploaded_video):
87
  har = lip_aspect_ratio(landmarks, w, h)
88
  har_history.append(har)
89
  avg_har = np.mean(har_history)
90
- speaking_flag = avg_har > 0.3 or len(smiles) > 0
91
 
92
- # رسم نقاط لب
93
- for i in range(61,88):
94
- x_lip = int(x + landmarks[i].x * w)
95
- y_lip = int(y + landmarks[i].y * h)
96
- cv2.circle(frame, (x_lip, y_lip), 1, (0,255,0), -1)
97
-
98
- if speaking_flag:
99
- cv2.putText(frame, "Speaking...", (x, y-10),
100
- cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,255), 2)
101
-
102
- # پردازش صدا chunk-by-chunk
103
- if speaking_flag and audio_index < len(audio):
104
- chunk = audio[audio_index:audio_index+chunk_size]
105
- if len(chunk) > 0:
106
- result = model.transcribe(chunk, fp16=False)
107
- transcribed_text += result["text"] + " "
108
- audio_index += chunk_size
109
-
110
- # اضافه کردن زیرنویس
111
- if transcribed_text.strip():
112
- cv2.rectangle(frame, (0, height-50), (width, height), (0,0,0), -1)
113
- cv2.putText(frame, transcribed_text.strip(), (10, height-10),
114
- cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255,255,255), 2)
115
 
116
- out.write(frame)
 
 
 
 
117
 
118
  cap.release()
119
- out.release()
120
-
121
- return output_file, transcribed_text.strip()
122
 
123
  # -----------------------------
124
- # رابط Gradio
125
  # -----------------------------
126
  iface = gr.Interface(
127
  fn=process_video,
128
- inputs=gr.Video(label="Upload Video"),
129
- outputs=[gr.Video(label="Processed Video"), gr.Textbox(label="Transcribed Speech")],
130
- title="🎥 Real-time Face & Lip Detection + Whisper",
131
- description="Upload a video. The system detects faces and lips, starts transcription when the person speaks, and overlays subtitles on the video."
132
  )
133
 
134
  if __name__ == "__main__":
 
22
  smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
23
 
24
  # -----------------------------
25
+ # بارگذاری مدل Whisper Large
26
  # -----------------------------
27
+ print("🎧 Loading Whisper model (large)...")
28
+ model = whisper.load_model("large")
29
 
30
  # -----------------------------
31
+ # تابع تشخیص حرکت لب
32
  # -----------------------------
33
  def lip_aspect_ratio(landmarks, w, h):
34
  top_lip = landmarks[13]
 
38
  return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
39
 
40
  # -----------------------------
41
+ # تابع پردازش ویدئو و استخراج گفتار
42
  # -----------------------------
43
  def process_video(uploaded_video):
 
44
  tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
45
  tmp_video.write(uploaded_video.read())
46
  tmp_video.close()
47
 
48
+ # استخراج صدا از ویدئو
49
  audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
50
  subprocess.call(['ffmpeg', '-y', '-i', tmp_video.name, '-q:a', '0', '-map', 'a', audio_path],
51
  stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
52
 
53
+ # خواندن صدا
54
  audio, sr = sf.read(audio_path)
55
  audio = audio.T.flatten() if audio.ndim > 1 else audio
56
 
57
+ # تنظیم ویدئو برای پردازش چهره و لب
58
  cap = cv2.VideoCapture(tmp_video.name)
 
 
 
 
 
 
59
  har_history = deque(maxlen=5)
 
60
  transcribed_text = ""
61
+ speaking_flag = False
62
+
63
+ print("🎥 Analyzing lip movement and detecting speech...")
64
 
65
  while cap.isOpened():
66
  ret, frame = cap.read()
67
+ if not ret:
68
+ break
69
 
70
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
71
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)
 
 
 
72
  for (x, y, w, h) in faces:
 
73
  roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
74
  results = face_mesh.process(roi_rgb)
75
  smiles = smile_cascade.detectMultiScale(gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)
 
80
  har = lip_aspect_ratio(landmarks, w, h)
81
  har_history.append(har)
82
  avg_har = np.mean(har_history)
 
83
 
84
+ speaking_flag = avg_har > 0.3 or len(smiles) > 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ if speaking_flag:
87
+ print("🗣️ Detected speech — transcribing...")
88
+ result = model.transcribe(audio_path, fp16=False)
89
+ transcribed_text = result["text"]
90
+ break # فقط یکبار پردازش برای سرعت بیشتر
91
 
92
  cap.release()
93
+ return transcribed_text.strip()
 
 
94
 
95
  # -----------------------------
96
+ # رابط Gradio (فقط خروجی متن)
97
  # -----------------------------
98
  iface = gr.Interface(
99
  fn=process_video,
100
+ inputs=gr.Video(label="Upload a video"),
101
+ outputs=gr.Textbox(label="Transcribed Speech"),
102
+ title="🎙️ Whisper Large - Face & Lip Detection",
103
+ description="Upload a video. The system detects lips and transcribes speech using Whisper Large."
104
  )
105
 
106
  if __name__ == "__main__":