Update app.py
Browse files
app.py
CHANGED
|
@@ -94,7 +94,7 @@ class OptimizedModelManager:
|
|
| 94 |
feature_extractor=self.processor.feature_extractor,
|
| 95 |
chunk_length_s=CHUNK_LENGTH,
|
| 96 |
device=device,
|
| 97 |
-
|
| 98 |
model_kwargs={
|
| 99 |
"use_flash_attention_2": True if hasattr(self.model.config, 'use_flash_attention_2') else False
|
| 100 |
}
|
|
@@ -201,6 +201,11 @@ SUPPORTED_AUDIO_FORMATS = ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wm
|
|
| 201 |
def fast_audio_preprocessing(file_path):
|
| 202 |
"""پردازش سریع فایل صوتی"""
|
| 203 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
# استفاده از librosa برای بارگذاری سریعتر
|
| 205 |
audio, sr = librosa.load(file_path, sr=16000, mono=True)
|
| 206 |
|
|
@@ -214,9 +219,13 @@ def fast_audio_preprocessing(file_path):
|
|
| 214 |
except Exception as e:
|
| 215 |
logger.error(f"Error in fast audio preprocessing: {e}")
|
| 216 |
# بازگشت به روش قدیمی در صورت خطا
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
def extract_audio_from_video_fast(video_path, output_path):
|
| 222 |
"""استخراج سریع صدا از ویدیو"""
|
|
@@ -240,9 +249,48 @@ def extract_audio_from_video_fast(video_path, output_path):
|
|
| 240 |
return False
|
| 241 |
|
| 242 |
def parallel_chunk_processing(audio_chunks, pipe, task, language):
|
| 243 |
-
"""پردازش موازی چانکها
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
def chunks_to_srt(chunks):
|
| 248 |
"""تبدیل سریع چانکها به SRT"""
|
|
@@ -315,8 +363,15 @@ def process_audio_file_optimized(file_path, task="transcribe", language="auto",
|
|
| 315 |
start_time = time.time()
|
| 316 |
pipe = model_manager.get_model()
|
| 317 |
|
|
|
|
|
|
|
| 318 |
# پردازش سریع صدا
|
| 319 |
audio, sr = fast_audio_preprocessing(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
inputs = {"array": audio, "sampling_rate": sr}
|
| 321 |
|
| 322 |
# تنظیمات generation
|
|
@@ -378,8 +433,8 @@ def process_audio_file_optimized(file_path, task="transcribe", language="auto",
|
|
| 378 |
if return_timestamps:
|
| 379 |
return {
|
| 380 |
"text": result['text'],
|
| 381 |
-
"chunks": result
|
| 382 |
-
"srt": chunks_to_srt(result
|
| 383 |
}
|
| 384 |
else:
|
| 385 |
return {"text": result['text']}
|
|
|
|
| 94 |
feature_extractor=self.processor.feature_extractor,
|
| 95 |
chunk_length_s=CHUNK_LENGTH,
|
| 96 |
device=device,
|
| 97 |
+
dtype=torch.float16 if torch.cuda.is_available() else torch.float32, # استفاده از dtype به جای torch_dtype
|
| 98 |
model_kwargs={
|
| 99 |
"use_flash_attention_2": True if hasattr(self.model.config, 'use_flash_attention_2') else False
|
| 100 |
}
|
|
|
|
| 201 |
def fast_audio_preprocessing(file_path):
|
| 202 |
"""پردازش سریع فایل صوتی"""
|
| 203 |
try:
|
| 204 |
+
# غیرفعال کردن کش librosa برای جلوگیری از خطا
|
| 205 |
+
import librosa.cache
|
| 206 |
+
librosa.cache.clear()
|
| 207 |
+
librosa.cache.set_cache(None)
|
| 208 |
+
|
| 209 |
# استفاده از librosa برای بارگذاری سریعتر
|
| 210 |
audio, sr = librosa.load(file_path, sr=16000, mono=True)
|
| 211 |
|
|
|
|
| 219 |
except Exception as e:
|
| 220 |
logger.error(f"Error in fast audio preprocessing: {e}")
|
| 221 |
# بازگشت به روش قدیمی در صورت خطا
|
| 222 |
+
try:
|
| 223 |
+
with open(file_path, "rb") as f:
|
| 224 |
+
inputs = f.read()
|
| 225 |
+
return ffmpeg_read(inputs, 16000), 16000
|
| 226 |
+
except Exception as ffmpeg_error:
|
| 227 |
+
logger.error(f"FFmpeg fallback also failed: {ffmpeg_error}")
|
| 228 |
+
raise Exception("Both librosa and ffmpeg audio processing failed")
|
| 229 |
|
| 230 |
def extract_audio_from_video_fast(video_path, output_path):
|
| 231 |
"""استخراج سریع صدا از ویدیو"""
|
|
|
|
| 249 |
return False
|
| 250 |
|
| 251 |
def parallel_chunk_processing(audio_chunks, pipe, task, language):
    """Run the ASR pipeline over pre-cut audio chunks and align timestamps.

    NOTE(review): despite the name, chunks are handled sequentially, one
    pipeline call per chunk. Each chunk arrives as ``(samples, start_offset)``;
    the offset is added to every returned timestamp pair so they line up with
    the original, uncut audio. A chunk that raises is replaced by an empty
    placeholder result so the overall run continues.
    """
    processed = []

    for segment, offset in audio_chunks:
        try:
            payload = {"array": segment, "sampling_rate": 16000}

            gen_opts = {
                "task": task,
                "do_sample": False,
                "num_beams": 1,
                "use_cache": True,
            }

            # Only force a language when the caller asked for a known one;
            # "auto" leaves detection to the model.
            if language != "auto" and language in SUPPORTED_LANGUAGES:
                gen_opts["language"] = f"<|{language}|>"

            outcome = pipe(
                payload,
                batch_size=BATCH_SIZE,
                generate_kwargs=gen_opts,
                return_timestamps=True,
            )

            # Shift every timestamp pair by this chunk's offset into the
            # full recording.
            for piece in outcome.get('chunks') or []:
                ts = piece.get('timestamp')
                if ts:
                    piece['timestamp'] = (ts[0] + offset, ts[1] + offset)

            processed.append(outcome)

        except Exception as e:
            logger.error(f"Error processing chunk: {e}")
            # Best-effort: keep going with an empty result for the failed chunk.
            processed.append({"text": "", "chunks": []})

    return processed
|
| 294 |
|
| 295 |
def chunks_to_srt(chunks):
|
| 296 |
"""تبدیل سریع چانکها به SRT"""
|
|
|
|
| 363 |
start_time = time.time()
|
| 364 |
pipe = model_manager.get_model()
|
| 365 |
|
| 366 |
+
logger.info(f"Starting audio processing for: {file_path}")
|
| 367 |
+
|
| 368 |
# پردازش سریع صدا
|
| 369 |
audio, sr = fast_audio_preprocessing(file_path)
|
| 370 |
+
logger.info(f"Audio loaded: {len(audio)} samples at {sr}Hz")
|
| 371 |
+
|
| 372 |
+
if audio is None:
|
| 373 |
+
raise Exception("Audio preprocessing returned None")
|
| 374 |
+
|
| 375 |
inputs = {"array": audio, "sampling_rate": sr}
|
| 376 |
|
| 377 |
# تنظیمات generation
|
|
|
|
| 433 |
if return_timestamps:
|
| 434 |
return {
|
| 435 |
"text": result['text'],
|
| 436 |
+
"chunks": result.get('chunks', []),
|
| 437 |
+
"srt": chunks_to_srt(result.get('chunks', []))
|
| 438 |
}
|
| 439 |
else:
|
| 440 |
return {"text": result['text']}
|