farid678 committed on
Commit
e59ca4a
·
verified ·
1 Parent(s): c79f994

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -26
app.py CHANGED
@@ -28,7 +28,7 @@ print("🎧 Loading Whisper model (large)...")
28
  model = whisper.load_model("large")
29
 
30
  # -----------------------------
31
- # تابع تشخیص حرکت لب
32
  # -----------------------------
33
  def lip_aspect_ratio(landmarks, w, h):
34
  top_lip = landmarks[13]
@@ -38,29 +38,34 @@ def lip_aspect_ratio(landmarks, w, h):
38
  return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
39
 
40
  # -----------------------------
41
- # تابع پردازش ویدئو و استخراج گفتار
42
  # -----------------------------
43
- def process_video(uploaded_video):
44
- tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
45
- tmp_video.write(uploaded_video.read())
46
- tmp_video.close()
47
-
48
- # استخراج صدا از ویدئو
 
 
 
 
 
49
  audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
50
- subprocess.call(['ffmpeg', '-y', '-i', tmp_video.name, '-q:a', '0', '-map', 'a', audio_path],
51
- stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
 
52
 
53
- # خواندن صدا
54
  audio, sr = sf.read(audio_path)
55
  audio = audio.T.flatten() if audio.ndim > 1 else audio
56
 
57
- # تنظیم ویدئو برای پردازش چهره و لب
58
- cap = cv2.VideoCapture(tmp_video.name)
59
  har_history = deque(maxlen=5)
 
60
  transcribed_text = ""
61
- speaking_flag = False
62
-
63
- print("🎥 Analyzing lip movement and detecting speech...")
64
 
65
  while cap.isOpened():
66
  ret, frame = cap.read()
@@ -69,6 +74,8 @@ def process_video(uploaded_video):
69
 
70
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
71
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)
 
 
72
  for (x, y, w, h) in faces:
73
  roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
74
  results = face_mesh.process(roi_rgb)
@@ -80,27 +87,31 @@ def process_video(uploaded_video):
80
  har = lip_aspect_ratio(landmarks, w, h)
81
  har_history.append(har)
82
  avg_har = np.mean(har_history)
83
-
84
  speaking_flag = avg_har > 0.3 or len(smiles) > 0
85
 
86
- if speaking_flag:
87
- print("🗣️ Detected speech transcribing...")
88
- result = model.transcribe(audio_path, fp16=False)
89
- transcribed_text = result["text"]
90
- break # فقط یکبار پردازش برای سرعت بیشتر
 
 
 
 
 
91
 
92
  cap.release()
93
  return transcribed_text.strip()
94
 
95
  # -----------------------------
96
- # رابط Gradio (فقط خروجی متن)
97
  # -----------------------------
98
  iface = gr.Interface(
99
  fn=process_video,
100
- inputs=gr.Video(label="Upload a video"),
101
  outputs=gr.Textbox(label="Transcribed Speech"),
102
- title="🎙️ Whisper Large - Face & Lip Detection",
103
- description="Upload a video. The system detects lips and transcribes speech using Whisper Large."
104
  )
105
 
106
  if __name__ == "__main__":
 
28
  model = whisper.load_model("large")
29
 
30
  # -----------------------------
31
+ # تابع تشخیص حرکت لب و صحبت کردن
32
  # -----------------------------
33
  def lip_aspect_ratio(landmarks, w, h):
34
  top_lip = landmarks[13]
 
38
  return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
39
 
40
  # -----------------------------
41
+ # تابع پردازش ویدئو
42
  # -----------------------------
43
+ def process_video(video_path):
44
+ # Gradio ممکن است مسیر فایل بدهد (str) یا فایل باینری
45
+ if isinstance(video_path, str):
46
+ tmp_video_path = video_path
47
+ else:
48
+ tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
49
+ tmp_video.write(video_path.read())
50
+ tmp_video.close()
51
+ tmp_video_path = tmp_video.name
52
+
53
+ # استخراج صدا
54
  audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
55
+ subprocess.call(
56
+ ['ffmpeg', '-y', '-i', tmp_video_path, '-q:a', '0', '-map', 'a', audio_path],
57
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
58
+ )
59
 
60
+ # بارگذاری صوت
61
  audio, sr = sf.read(audio_path)
62
  audio = audio.T.flatten() if audio.ndim > 1 else audio
63
 
64
+ cap = cv2.VideoCapture(tmp_video_path)
 
65
  har_history = deque(maxlen=5)
66
+ audio_index = 0
67
  transcribed_text = ""
68
+ chunk_size = int(sr * 2) # هر 2 ثانیه صدا
 
 
69
 
70
  while cap.isOpened():
71
  ret, frame = cap.read()
 
74
 
75
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
76
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)
77
+ speaking_flag = False
78
+
79
  for (x, y, w, h) in faces:
80
  roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
81
  results = face_mesh.process(roi_rgb)
 
87
  har = lip_aspect_ratio(landmarks, w, h)
88
  har_history.append(har)
89
  avg_har = np.mean(har_history)
 
90
  speaking_flag = avg_har > 0.3 or len(smiles) > 0
91
 
92
+ # فقط وقتی صحبت شروع شد، متن را استخراج کن
93
+ if speaking_flag and audio_index < len(audio):
94
+ chunk = audio[audio_index:audio_index+chunk_size]
95
+ if len(chunk) > 0:
96
+ # Whisper expects a file, so save chunk temporarily
97
+ tmp_chunk_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
98
+ sf.write(tmp_chunk_path, chunk, sr)
99
+ result = model.transcribe(tmp_chunk_path, fp16=False)
100
+ transcribed_text += result["text"] + " "
101
+ audio_index += chunk_size
102
 
103
  cap.release()
104
  return transcribed_text.strip()
105
 
106
  # -----------------------------
107
+ # رابط Gradio
108
  # -----------------------------
109
  iface = gr.Interface(
110
  fn=process_video,
111
+ inputs=gr.Video(label="Upload Video"),
112
  outputs=gr.Textbox(label="Transcribed Speech"),
113
+ title="🎥 Whisper Large + Face & Lip Detection",
114
+ description="Upload a video. The system detects when a person is speaking and transcribes the speech to text."
115
  )
116
 
117
  if __name__ == "__main__":