Spaces:

farid678
/

lipsDetection

Running

App Files Community

farid678 committed on Oct 11, 2025

Commit

ce4aa6e

verified ·

1 Parent(s): e59ca4a

simple version

Browse files

Files changed (1) hide show

app.py +27 -94

app.py CHANGED Viewed

@@ -1,117 +1,50 @@
 import gradio as gr
-import cv2
-import mediapipe as mp
-import whisper
-import numpy as np
 import subprocess
-import os
-from collections import deque
 import tempfile
-import soundfile as sf
-# -----------------------------
-# تنظیم MediaPipe
-# -----------------------------
-mp_face_mesh = mp.solutions.face_mesh
-face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
-# -----------------------------
-# بارگذاری مدل‌های Haar Cascade
-# -----------------------------
-face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
-smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
-# -----------------------------
-# بارگذاری مدل Whisper Large
-# -----------------------------
-print("🎧 Loading Whisper model (large)...")
-model = whisper.load_model("large")
 # -----------------------------
-# تابع تشخیص حرکت لب و صحبت کردن
 # -----------------------------
-def lip_aspect_ratio(landmarks, w, h):
-    top_lip = landmarks[13]
-    bottom_lip = landmarks[14]
-    left_corner = landmarks[61]
-    right_corner = landmarks[291]
-    return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
 # -----------------------------
-# تابع پردازش ویدئو
 # -----------------------------
-def process_video(video_path):
-    # Gradio ممکن است مسیر فایل بدهد (str) یا فایل باینری
-    if isinstance(video_path, str):
-        tmp_video_path = video_path
-    else:
-        tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-        tmp_video.write(video_path.read())
-        tmp_video.close()
         tmp_video_path = tmp_video.name
-    # استخراج صدا
-    audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-    subprocess.call(
-        ['ffmpeg', '-y', '-i', tmp_video_path, '-q:a', '0', '-map', 'a', audio_path],
         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
     )
-    # بارگذاری صوت
-    audio, sr = sf.read(audio_path)
-    audio = audio.T.flatten() if audio.ndim > 1 else audio
-    cap = cv2.VideoCapture(tmp_video_path)
-    har_history = deque(maxlen=5)
-    audio_index = 0
-    transcribed_text = ""
-    chunk_size = int(sr * 2)  # هر 2 ثانیه صدا
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
-        speaking_flag = False
-        for (x, y, w, h) in faces:
-            roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
-            results = face_mesh.process(roi_rgb)
-            smiles = smile_cascade.detectMultiScale(gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)
-            if results.multi_face_landmarks:
-                for face_landmarks in results.multi_face_landmarks:
-                    landmarks = face_landmarks.landmark
-                    har = lip_aspect_ratio(landmarks, w, h)
-                    har_history.append(har)
-                    avg_har = np.mean(har_history)
-                    speaking_flag = avg_har > 0.3 or len(smiles) > 0
-        # فقط وقتی صحبت شروع شد، متن را استخراج کن
-        if speaking_flag and audio_index < len(audio):
-            chunk = audio[audio_index:audio_index+chunk_size]
-            if len(chunk) > 0:
-                # Whisper expects a file, so save chunk temporarily
-                tmp_chunk_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-                sf.write(tmp_chunk_path, chunk, sr)
-                result = model.transcribe(tmp_chunk_path, fp16=False)
-                transcribed_text += result["text"] + " "
-            audio_index += chunk_size
-    cap.release()
-    return transcribed_text.strip()
 # -----------------------------
-# رابط Gradio
 # -----------------------------
 iface = gr.Interface(
-    fn=process_video,
-    inputs=gr.Video(label="Upload Video"),
-    outputs=gr.Textbox(label="Transcribed Speech"),
-    title="🎥 Whisper Large + Face & Lip Detection",
-    description="Upload a video. The system detects when a person is speaking and transcribes the speech to text."
 )
 if __name__ == "__main__":

 import gradio as gr
 import subprocess
 import tempfile
+import whisper
+import os
 # -----------------------------
+# بارگذاری مدل Whisper
 # -----------------------------
+print("🎧 Loading Whisper model (base)...")
+model = whisper.load_model("base")  # می‌توانید "small", "medium", یا "large" هم بگذارید
 # -----------------------------
+# تابع پردازش ویدیو و تبدیل به متن
 # -----------------------------
+def transcribe_video(video):
+    # ذخیره موقت ویدیو
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
+        tmp_video.write(video.read())
         tmp_video_path = tmp_video.name
+    # استخراج صدا با ffmpeg
+    tmp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", tmp_video_path, "-ar", "16000", "-ac", "1", tmp_audio_path],
         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
     )
+    # تبدیل صدا به متن
+    result = model.transcribe(tmp_audio_path, fp16=False)
+    text = result["text"].strip()
+    # پاک کردن فایل‌های موقت
+    os.remove(tmp_video_path)
+    os.remove(tmp_audio_path)
+    return text or "متنی شناسایی نشد."
 # -----------------------------
+# رابط کاربری Gradio
 # -----------------------------
 iface = gr.Interface(
+    fn=transcribe_video,
+    inputs=gr.Video(label="🎥 ویدیو را آپلود کنید"),
+    outputs=gr.Textbox(label="📝 متن استخراج‌شده"),
+    title="🎧 ویدیو به متن با Whisper",
+    description="ویدیوی خود را آپلود کنید تا گفتار آن به متن تبدیل شود."
 )
 if __name__ == "__main__":