Update app.py

app.py CHANGED
@@ -22,12 +22,13 @@ face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_fronta
 smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
 
 # -----------------------------
-# Load the Whisper model
+# Load the Whisper Large model
 # -----------------------------
-model
+print("🎧 Loading Whisper model (large)...")
+model = whisper.load_model("large")
 
 # -----------------------------
 # Lip-movement detection function
 # -----------------------------
 def lip_aspect_ratio(landmarks, w, h):
     top_lip = landmarks[13]
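A note on the new loader lines: `whisper.load_model` downloads the checkpoint on first use and returns a model whose `transcribe` method takes an audio file path and returns a dict containing the transcript. A minimal standalone sketch of that call (the file name `sample.wav` is a placeholder, and "base" stands in for "large" only to keep the download small):

import whisper

# Same loader as app.py; "base" is a lightweight stand-in for "large".
model = whisper.load_model("base")

# fp16=False forces float32 inference, avoiding warnings on CPU-only machines.
result = model.transcribe("sample.wav", fp16=False)
print(result["text"])      # full transcript
print(result["language"])  # auto-detected language code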
@@ -37,46 +38,38 @@ def lip_aspect_ratio(landmarks, w, h):
     return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
 
 # -----------------------------
-# Video-processing function
+# Video-processing and speech-extraction function
 # -----------------------------
 def process_video(uploaded_video):
-    # Save the uploaded video
     tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
     tmp_video.write(uploaded_video.read())
     tmp_video.close()
 
-    # Extract the audio
+    # Extract the audio from the video
     audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
     subprocess.call(['ffmpeg', '-y', '-i', tmp_video.name, '-q:a', '0', '-map', 'a', audio_path],
                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    # Read the audio
     audio, sr = sf.read(audio_path)
    audio = audio.T.flatten() if audio.ndim > 1 else audio
 
-    #
+    # Set up the video for face and lip processing
     cap = cv2.VideoCapture(tmp_video.name)
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    width, height = int(cap.get(3)), int(cap.get(4))
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
-
     har_history = deque(maxlen=5)
-    audio_index = 0
     transcribed_text = ""
-
+    speaking_flag = False
+
+    print("🎥 Analyzing lip movement and detecting speech...")
 
     while cap.isOpened():
         ret, frame = cap.read()
-        if not ret: break
+        if not ret:
+            break
 
         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
         faces = face_cascade.detectMultiScale(gray, 1.3, 5)
-
-        speaking_flag = False
-
         for (x, y, w, h) in faces:
-            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
             roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
             results = face_mesh.process(roi_rgb)
             smiles = smile_cascade.detectMultiScale(gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)
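The diff elides lines 34–37 of `lip_aspect_ratio`, so the bottom-lip and mouth-corner landmarks are not shown. A plausible completion, assuming the standard MediaPipe FaceMesh topology (13 = upper inner lip, 14 = lower inner lip, 61/291 = mouth corners); note that `w` and `h` go unused by the visible return statement:

def lip_aspect_ratio(landmarks, w, h):
    # Landmark indices below 13 aside are assumptions from MediaPipe FaceMesh.
    top_lip = landmarks[13]        # upper inner lip (shown in the diff)
    bottom_lip = landmarks[14]     # lower inner lip (assumed)
    left_corner = landmarks[61]    # left mouth corner (assumed)
    right_corner = landmarks[291]  # right mouth corner (assumed)
    # Vertical mouth opening normalized by mouth width; rises while speaking.
    return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)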
@@ -87,48 +80,27 @@ def process_video(uploaded_video):
                     har = lip_aspect_ratio(landmarks, w, h)
                     har_history.append(har)
                     avg_har = np.mean(har_history)
-                    speaking_flag = avg_har > 0.3 or len(smiles) > 0
 
-
-                    for i in range(61,88):
-                        x_lip = int(x + landmarks[i].x * w)
-                        y_lip = int(y + landmarks[i].y * h)
-                        cv2.circle(frame, (x_lip, y_lip), 1, (0,255,0), -1)
-
-        if speaking_flag:
-            cv2.putText(frame, "Speaking...", (x, y-10),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,255), 2)
-
-        # Process the audio chunk by chunk
-        if speaking_flag and audio_index < len(audio):
-            chunk = audio[audio_index:audio_index+chunk_size]
-            if len(chunk) > 0:
-                result = model.transcribe(chunk, fp16=False)
-                transcribed_text += result["text"] + " "
-            audio_index += chunk_size
-
-        # Add the subtitle
-        if transcribed_text.strip():
-            cv2.rectangle(frame, (0, height-50), (width, height), (0,0,0), -1)
-            cv2.putText(frame, transcribed_text.strip(), (10, height-10),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255,255,255), 2)
+                    speaking_flag = avg_har > 0.3 or len(smiles) > 0
 
-        out.write(frame)
+        if speaking_flag:
+            print("🗣️ Detected speech — transcribing...")
+            result = model.transcribe(audio_path, fp16=False)
+            transcribed_text = result["text"]
+            break  # transcribe only once, for speed
 
     cap.release()
-    out.release()
-
-    return output_file, transcribed_text.strip()
+    return transcribed_text.strip()
 
 # -----------------------------
-# Gradio interface
+# Gradio interface (text output only)
 # -----------------------------
 iface = gr.Interface(
     fn=process_video,
-    inputs=gr.Video(label="Upload
-    outputs=
-    title="
-    description="Upload a video. The system detects
+    inputs=gr.Video(label="Upload a video"),
+    outputs=gr.Textbox(label="Transcribed Speech"),
+    title="🎙️ Whisper Large - Face & Lip Detection",
+    description="Upload a video. The system detects lips and transcribes speech using Whisper Large."
 )
 
 if __name__ == "__main__":
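One behavioral note on this rewrite: the removed loop fed raw NumPy chunks from soundfile into `model.transcribe`, but whisper treats an array input as mono float32 sampled at 16 kHz, which a WAV extracted by ffmpeg generally is not, so each chunk would have needed resampling first. Passing the file path once, as the new code does, lets whisper invoke ffmpeg itself and resample correctly. A sketch of the pre-processing the old array-based approach would have required (`speech.wav` is a placeholder path):

import numpy as np
import soundfile as sf

audio, sr = sf.read("speech.wav")                        # sr is the file's native rate
audio = audio.mean(axis=1) if audio.ndim > 1 else audio  # down-mix to mono
audio = audio.astype(np.float32)
# Safe to hand to model.transcribe(audio, fp16=False) only if sr == 16000;
# otherwise resample to 16 kHz first, since whisper assumes 16 kHz arrays.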
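The final hunk cuts off at the `if __name__ == "__main__":` guard, so its body is not shown. On a Hugging Face Space the conventional body is a bare launch call; the following is an assumption, not part of the diff:

if __name__ == "__main__":
    iface.launch()  # assumed body; the diff truncates before this line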