farid678 committed on
Commit
c79f994
·
verified ·
1 Parent(s): d0e9cf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -53
app.py CHANGED
@@ -22,12 +22,13 @@ face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_fronta
22
  smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
23
 
24
  # -----------------------------
25
- # بارگذاری مدل Whisper Medium
26
  # -----------------------------
27
- model = whisper.load_model("medium")
 
28
 
29
  # -----------------------------
30
- # تابع تشخیص حرکت لب و صحبت کردن
31
  # -----------------------------
32
  def lip_aspect_ratio(landmarks, w, h):
33
  top_lip = landmarks[13]
@@ -37,46 +38,38 @@ def lip_aspect_ratio(landmarks, w, h):
37
  return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
38
 
39
  # -----------------------------
40
- # تابع پردازش ویدئو
41
  # -----------------------------
42
  def process_video(uploaded_video):
43
- # ذخیره ویدئو آپلود شده
44
  tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
45
  tmp_video.write(uploaded_video.read())
46
  tmp_video.close()
47
 
48
- # استخراج صدا
49
  audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
50
  subprocess.call(['ffmpeg', '-y', '-i', tmp_video.name, '-q:a', '0', '-map', 'a', audio_path],
51
  stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
52
 
 
53
  audio, sr = sf.read(audio_path)
54
  audio = audio.T.flatten() if audio.ndim > 1 else audio
55
 
56
- # آماده‌سازی ویدئو خروجی
57
  cap = cv2.VideoCapture(tmp_video.name)
58
- fps = cap.get(cv2.CAP_PROP_FPS)
59
- width, height = int(cap.get(3)), int(cap.get(4))
60
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
61
- output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
62
- out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
63
-
64
  har_history = deque(maxlen=5)
65
- audio_index = 0
66
  transcribed_text = ""
67
- chunk_size = int(sr * 2) # پردازش هر 2 ثانیه صدا
 
 
68
 
69
  while cap.isOpened():
70
  ret, frame = cap.read()
71
- if not ret: break
 
72
 
73
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
74
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)
75
-
76
- speaking_flag = False
77
-
78
  for (x, y, w, h) in faces:
79
- cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
80
  roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
81
  results = face_mesh.process(roi_rgb)
82
  smiles = smile_cascade.detectMultiScale(gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)
@@ -87,48 +80,27 @@ def process_video(uploaded_video):
87
  har = lip_aspect_ratio(landmarks, w, h)
88
  har_history.append(har)
89
  avg_har = np.mean(har_history)
90
- speaking_flag = avg_har > 0.3 or len(smiles) > 0
91
 
92
- # رسم نقاط لب
93
- for i in range(61,88):
94
- x_lip = int(x + landmarks[i].x * w)
95
- y_lip = int(y + landmarks[i].y * h)
96
- cv2.circle(frame, (x_lip, y_lip), 1, (0,255,0), -1)
97
-
98
- if speaking_flag:
99
- cv2.putText(frame, "Speaking...", (x, y-10),
100
- cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,255), 2)
101
-
102
- # پردازش صدا chunk-by-chunk
103
- if speaking_flag and audio_index < len(audio):
104
- chunk = audio[audio_index:audio_index+chunk_size]
105
- if len(chunk) > 0:
106
- result = model.transcribe(chunk, fp16=False)
107
- transcribed_text += result["text"] + " "
108
- audio_index += chunk_size
109
-
110
- # اضافه کردن زیرنویس
111
- if transcribed_text.strip():
112
- cv2.rectangle(frame, (0, height-50), (width, height), (0,0,0), -1)
113
- cv2.putText(frame, transcribed_text.strip(), (10, height-10),
114
- cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255,255,255), 2)
115
 
116
- out.write(frame)
 
 
 
 
117
 
118
  cap.release()
119
- out.release()
120
-
121
- return output_file, transcribed_text.strip()
122
 
123
  # -----------------------------
124
- # رابط Gradio
125
  # -----------------------------
126
  iface = gr.Interface(
127
  fn=process_video,
128
- inputs=gr.Video(label="Upload Video"),
129
- outputs=[gr.Video(label="Processed Video"), gr.Textbox(label="Transcribed Speech")],
130
- title="🎥 Real-time Face & Lip Detection + Whisper",
131
- description="Upload a video. The system detects faces and lips, starts transcription when the person speaks, and overlays subtitles on the video."
132
  )
133
 
134
  if __name__ == "__main__":
 
22
  smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
23
 
24
  # -----------------------------
25
+ # بارگذاری مدل Whisper Large
26
  # -----------------------------
27
+ print("🎧 Loading Whisper model (large)...")
28
+ model = whisper.load_model("large")
29
 
30
  # -----------------------------
31
+ # تابع تشخیص حرکت لب
32
  # -----------------------------
33
  def lip_aspect_ratio(landmarks, w, h):
34
  top_lip = landmarks[13]
 
38
  return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
39
 
40
  # -----------------------------
41
+ # تابع پردازش ویدئو و استخراج گفتار
42
  # -----------------------------
43
  def process_video(uploaded_video):
 
44
  tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
45
  tmp_video.write(uploaded_video.read())
46
  tmp_video.close()
47
 
48
+ # استخراج صدا از ویدئو
49
  audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
50
  subprocess.call(['ffmpeg', '-y', '-i', tmp_video.name, '-q:a', '0', '-map', 'a', audio_path],
51
  stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
52
 
53
+ # خواندن صدا
54
  audio, sr = sf.read(audio_path)
55
  audio = audio.T.flatten() if audio.ndim > 1 else audio
56
 
57
+ # تنظیم ویدئو برای پردازش چهره و لب
58
  cap = cv2.VideoCapture(tmp_video.name)
 
 
 
 
 
 
59
  har_history = deque(maxlen=5)
 
60
  transcribed_text = ""
61
+ speaking_flag = False
62
+
63
+ print("🎥 Analyzing lip movement and detecting speech...")
64
 
65
  while cap.isOpened():
66
  ret, frame = cap.read()
67
+ if not ret:
68
+ break
69
 
70
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
71
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)
 
 
 
72
  for (x, y, w, h) in faces:
 
73
  roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
74
  results = face_mesh.process(roi_rgb)
75
  smiles = smile_cascade.detectMultiScale(gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)
 
80
  har = lip_aspect_ratio(landmarks, w, h)
81
  har_history.append(har)
82
  avg_har = np.mean(har_history)
 
83
 
84
+ speaking_flag = avg_har > 0.3 or len(smiles) > 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ if speaking_flag:
87
+ print("🗣️ Detected speech — transcribing...")
88
+ result = model.transcribe(audio_path, fp16=False)
89
+ transcribed_text = result["text"]
90
+ break # فقط یکبار پردازش برای سرعت بیشتر
91
 
92
  cap.release()
93
+ return transcribed_text.strip()
 
 
94
 
95
  # -----------------------------
96
+ # رابط Gradio (فقط خروجی متن)
97
  # -----------------------------
98
  iface = gr.Interface(
99
  fn=process_video,
100
+ inputs=gr.Video(label="Upload a video"),
101
+ outputs=gr.Textbox(label="Transcribed Speech"),
102
+ title="🎙️ Whisper Large - Face & Lip Detection",
103
+ description="Upload a video. The system detects lips and transcribes speech using Whisper Large."
104
  )
105
 
106
  if __name__ == "__main__":