farid678 committed on
Commit
ce4aa6e
·
verified ·
1 Parent(s): e59ca4a

simple version

Browse files
Files changed (1) hide show
  1. app.py +27 -94
app.py CHANGED
@@ -1,117 +1,50 @@
1
  import gradio as gr
2
- import cv2
3
- import mediapipe as mp
4
- import whisper
5
- import numpy as np
6
  import subprocess
7
- import os
8
- from collections import deque
9
  import tempfile
10
- import soundfile as sf
11
-
12
- # -----------------------------
13
- # تنظیم MediaPipe
14
- # -----------------------------
15
- mp_face_mesh = mp.solutions.face_mesh
16
- face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
17
-
18
- # -----------------------------
19
- # بارگذاری مدل‌های Haar Cascade
20
- # -----------------------------
21
- face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
22
- smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_smile.xml")
23
-
24
- # -----------------------------
25
- # بارگذاری مدل Whisper Large
26
- # -----------------------------
27
- print("🎧 Loading Whisper model (large)...")
28
- model = whisper.load_model("large")
29
 
30
  # -----------------------------
31
- # تابع تشخیص حرکت لب و صحبت کردن
32
  # -----------------------------
33
- def lip_aspect_ratio(landmarks, w, h):
34
- top_lip = landmarks[13]
35
- bottom_lip = landmarks[14]
36
- left_corner = landmarks[61]
37
- right_corner = landmarks[291]
38
- return (bottom_lip.y - top_lip.y) / (right_corner.x - left_corner.x)
39
 
40
  # -----------------------------
41
- # تابع پردازش ویدئو
42
  # -----------------------------
43
- def process_video(video_path):
44
- # Gradio ممکن است مسیر فایل بدهد (str) یا فایل باینری
45
- if isinstance(video_path, str):
46
- tmp_video_path = video_path
47
- else:
48
- tmp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
49
- tmp_video.write(video_path.read())
50
- tmp_video.close()
51
  tmp_video_path = tmp_video.name
52
 
53
- # استخراج صدا
54
- audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
55
- subprocess.call(
56
- ['ffmpeg', '-y', '-i', tmp_video_path, '-q:a', '0', '-map', 'a', audio_path],
57
  stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
58
  )
59
 
60
- # بارگذاری صوت
61
- audio, sr = sf.read(audio_path)
62
- audio = audio.T.flatten() if audio.ndim > 1 else audio
63
-
64
- cap = cv2.VideoCapture(tmp_video_path)
65
- har_history = deque(maxlen=5)
66
- audio_index = 0
67
- transcribed_text = ""
68
- chunk_size = int(sr * 2) # هر 2 ثانیه صدا
69
-
70
- while cap.isOpened():
71
- ret, frame = cap.read()
72
- if not ret:
73
- break
74
-
75
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
76
- faces = face_cascade.detectMultiScale(gray, 1.3, 5)
77
- speaking_flag = False
78
-
79
- for (x, y, w, h) in faces:
80
- roi_rgb = cv2.cvtColor(frame[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
81
- results = face_mesh.process(roi_rgb)
82
- smiles = smile_cascade.detectMultiScale(gray[y:y+h, x:x+w], scaleFactor=1.7, minNeighbors=20)
83
-
84
- if results.multi_face_landmarks:
85
- for face_landmarks in results.multi_face_landmarks:
86
- landmarks = face_landmarks.landmark
87
- har = lip_aspect_ratio(landmarks, w, h)
88
- har_history.append(har)
89
- avg_har = np.mean(har_history)
90
- speaking_flag = avg_har > 0.3 or len(smiles) > 0
91
 
92
- # فقط وقتی صحبت شروع شد، متن را استخراج کن
93
- if speaking_flag and audio_index < len(audio):
94
- chunk = audio[audio_index:audio_index+chunk_size]
95
- if len(chunk) > 0:
96
- # Whisper expects a file, so save chunk temporarily
97
- tmp_chunk_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
98
- sf.write(tmp_chunk_path, chunk, sr)
99
- result = model.transcribe(tmp_chunk_path, fp16=False)
100
- transcribed_text += result["text"] + " "
101
- audio_index += chunk_size
102
 
103
- cap.release()
104
- return transcribed_text.strip()
105
 
106
  # -----------------------------
107
- # رابط Gradio
108
  # -----------------------------
109
  iface = gr.Interface(
110
- fn=process_video,
111
- inputs=gr.Video(label="Upload Video"),
112
- outputs=gr.Textbox(label="Transcribed Speech"),
113
- title="🎥 Whisper Large + Face & Lip Detection",
114
- description="Upload a video. The system detects when a person is speaking and transcribes the speech to text."
115
  )
116
 
117
  if __name__ == "__main__":
 
1
  import gradio as gr
 
 
 
 
2
  import subprocess
 
 
3
  import tempfile
4
+ import whisper
5
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  # -----------------------------
8
+ # Load the Whisper model once at import time
9
  # -----------------------------
10
+ print("🎧 Loading Whisper model (base)...")
11
+ model = whisper.load_model("base") # "small", "medium", or "large" can be used here instead
 
 
 
 
12
 
13
  # -----------------------------
14
+ # تابع پردازش ویدیو و تبدیل به متن
15
  # -----------------------------
16
def transcribe_video(video):
    """Transcribe the speech in an uploaded video to text.

    The audio track is extracted with ffmpeg into a temporary 16 kHz mono
    WAV file, which is then passed to the module-level Whisper ``model``.

    Args:
        video: The uploaded video, either a filesystem path (``str`` or
            ``os.PathLike``) or a binary file-like object exposing
            ``read()``. ``None`` means nothing was uploaded.

    Returns:
        The transcribed text with surrounding whitespace stripped, or the
        fallback message "متنی شناسایی نشد." when no video was supplied or
        the transcription is empty.

    Raises:
        RuntimeError: If ffmpeg cannot be started or exits with an error.
    """
    no_text_message = "متنی شناسایی نشد."
    if video is None:
        return no_text_message

    # Only files created here are tracked for deletion; a path handed in by
    # the caller is owned by the caller and must not be removed.
    temp_paths = []
    try:
        if isinstance(video, (str, os.PathLike)):
            video_path = os.fspath(video)
        else:
            # File-like upload: spill it to disk so ffmpeg can read it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
                temp_paths.append(tmp_video.name)
                tmp_video.write(video.read())
            video_path = tmp_video.name

        # Reserve a name for the WAV output and close the handle right away
        # so ffmpeg can overwrite the file (-y).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
            audio_path = tmp_audio.name
        temp_paths.append(audio_path)

        try:
            subprocess.run(
                ["ffmpeg", "-y", "-i", video_path, "-ar", "16000", "-ac", "1", audio_path],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )
        except (OSError, subprocess.CalledProcessError) as err:
            # OSError: ffmpeg is missing or not executable.
            # CalledProcessError: ffmpeg ran but failed (bad input, no audio stream, ...).
            raise RuntimeError("ffmpeg could not extract audio from the video") from err

        result = model.transcribe(audio_path, fp16=False)
        text = result["text"].strip()
    finally:
        # Best-effort cleanup that must not mask an exception raised above.
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                pass

    return text or no_text_message
 
38
 
39
  # -----------------------------
40
+ # Gradio user interface: a video upload wired to transcribe_video, with the result shown in a textbox
41
  # -----------------------------
42
  iface = gr.Interface(
43
+ fn=transcribe_video,
44
+ inputs=gr.Video(label="🎥 ویدیو را آپلود کنید"),
45
+ outputs=gr.Textbox(label="📝 متن استخراج‌شده"),
46
+ title="🎧 ویدیو به متن با Whisper",
47
+ description="ویدیوی خود را آپلود کنید تا گفتار آن به متن تبدیل شود."
48
  )
49
 
50
  if __name__ == "__main__":