danicor committed on
Commit
61872a3
·
verified ·
1 Parent(s): 9f0e614

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +406 -0
app.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import time
4
+ import json
5
+ from pathlib import Path
6
+ import uuid
7
+ import logging
8
+
9
+ import torch
10
+ import yt_dlp as youtube_dl
11
+ from flask import Flask, request, jsonify
12
+ from transformers import pipeline
13
+ from transformers.pipelines.audio_utils import ffmpeg_read
14
+ import ffmpeg
15
+
16
# Logging: INFO-level root configuration plus the module logger used by
# every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application object that the @app.route decorators attach to.
app = Flask(__name__)

# Service settings
MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8  # forwarded to the pipeline as batch_size
FILE_LIMIT_MB = 1000
MAX_FILE_SIZE = FILE_LIMIT_MB * 1024 * 1024  # same limit expressed in bytes
YT_LENGTH_LIMIT_S = 3600  # longest accepted YouTube video, in seconds (1 hour)

# Inference device: CUDA device index 0 when available, otherwise the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"
logger.info(f"Using device: {device}")
32
+
33
# Build the Whisper speech-recognition pipeline once, at import time. A load
# failure is logged and re-raised so the module does not finish importing
# without a usable model.
try:
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        device=device,
        chunk_length_s=30,
    )
except Exception as load_error:
    logger.error(f"Error loading Whisper model: {load_error}")
    raise
else:
    logger.info("Whisper model loaded successfully")
45
+
46
# Languages accepted by the API, keyed by Whisper language code (100 entries).
# "yue" (Cantonese) is the language token introduced with whisper-large-v3,
# which is the model configured in MODEL_NAME.
SUPPORTED_LANGUAGES = {
    "af": "afrikaans", "am": "amharic", "ar": "arabic", "as": "assamese", "az": "azerbaijani",
    "ba": "bashkir", "be": "belarusian", "bg": "bulgarian", "bn": "bengali", "bo": "tibetan",
    "br": "breton", "bs": "bosnian", "ca": "catalan", "cs": "czech", "cy": "welsh",
    "da": "danish", "de": "german", "el": "greek", "en": "english", "es": "spanish",
    "et": "estonian", "eu": "basque", "fa": "persian", "fi": "finnish", "fo": "faroese",
    "fr": "french", "gl": "galician", "gu": "gujarati", "ha": "hausa", "haw": "hawaiian",
    "he": "hebrew", "hi": "hindi", "hr": "croatian", "ht": "haitian creole", "hu": "hungarian",
    "hy": "armenian", "id": "indonesian", "is": "icelandic", "it": "italian", "ja": "japanese",
    "jw": "javanese", "ka": "georgian", "kk": "kazakh", "km": "khmer", "kn": "kannada",
    "ko": "korean", "la": "latin", "lb": "luxembourgish", "ln": "lingala", "lo": "lao",
    "lt": "lithuanian", "lv": "latvian", "mg": "malagasy", "mi": "maori", "mk": "macedonian",
    "ml": "malayalam", "mn": "mongolian", "mr": "marathi", "ms": "malay", "mt": "maltese",
    "my": "myanmar", "ne": "nepali", "nl": "dutch", "nn": "nynorsk", "no": "norwegian",
    "oc": "occitan", "pa": "punjabi", "pl": "polish", "ps": "pashto", "pt": "portuguese",
    "ro": "romanian", "ru": "russian", "sa": "sanskrit", "sd": "sindhi", "si": "sinhala",
    "sk": "slovak", "sl": "slovenian", "sn": "shona", "so": "somali", "sq": "albanian",
    "sr": "serbian", "su": "sundanese", "sv": "swedish", "sw": "swahili", "ta": "tamil",
    "te": "telugu", "tg": "tajik", "th": "thai", "tk": "turkmen", "tl": "tagalog",
    "tr": "turkish", "tt": "tatar", "uk": "ukrainian", "ur": "urdu", "uz": "uzbek",
    "vi": "vietnamese", "yi": "yiddish", "yo": "yoruba", "yue": "cantonese", "zh": "chinese",
}

# Upload extensions (lower-case, with the leading dot) accepted by the
# endpoints. Video files get their audio track extracted before transcription;
# audio files are handed to the transcription step as they are.
SUPPORTED_VIDEO_FORMATS = ['.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv', '.webm', '.m4v', '.3gp']
SUPPORTED_AUDIO_FORMATS = ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma']
73
+
74
def extract_audio_from_video(video_path, output_path):
    """Extract the audio track of a video file with ffmpeg.

    The audio is written to ``output_path`` as 16-bit PCM (``pcm_s16le``),
    mono, 16 kHz; an existing output file is overwritten.

    Args:
        video_path: Path of the input media file.
        output_path: Path of the audio file to create.

    Returns:
        True when ffmpeg completed without raising, False otherwise.
        Failures are logged and never propagated to the caller.
    """
    try:
        (
            ffmpeg
            .input(video_path)
            .output(output_path, acodec='pcm_s16le', ac=1, ar=16000)
            .overwrite_output()
            .run(quiet=True)
        )
    except Exception as e:
        # quiet=True captures ffmpeg's output instead of printing it, so the
        # exception text alone says little. When the exception carries the
        # captured stderr (as ffmpeg.Error does), include it in the log.
        stderr = getattr(e, "stderr", None)
        if isinstance(stderr, bytes):
            stderr = stderr.decode("utf-8", errors="replace")
        if stderr:
            logger.error(f"Error extracting audio: {e}\n{stderr.strip()}")
        else:
            logger.error(f"Error extracting audio: {e}")
        return False
    return True
88
+
89
def _format_srt_timestamp(seconds):
    """Render an offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Convert to whole milliseconds first: taking ``seconds % 1`` and then
    # truncating loses a millisecond to float noise (2.3 s became ",299").
    total_ms = int(round(float(seconds) * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def chunks_to_srt(chunks):
    """Convert timestamped transcription chunks to SRT subtitle text.

    Args:
        chunks: Iterable of dicts with a ``'timestamp'`` pair
            ``(start, end)`` in seconds and a ``'text'`` string. Either
            element of the pair may be None (the ASR pipeline can leave the
            end of the final chunk open); a missing start is treated as 0
            and a missing end as equal to the start.

    Returns:
        The SRT document as one string: entries numbered from 1, each
        followed by a blank line. An empty input yields ``""``.
    """
    entries = []
    for index, chunk in enumerate(chunks, 1):
        start_time, end_time = chunk['timestamp']
        if start_time is None:
            start_time = 0.0
        if end_time is None:
            end_time = start_time
        entries.append(
            f"{index}\n"
            f"{_format_srt_timestamp(start_time)} --> {_format_srt_timestamp(end_time)}\n"
            f"{chunk['text']}\n\n"
        )
    return "".join(entries)
108
+
109
def download_youtube_audio(yt_url, output_path):
    """Download the audio of a single YouTube video.

    The video's metadata is fetched first so that videos longer than
    ``YT_LENGTH_LIMIT_S`` are rejected before anything is downloaded.

    Args:
        yt_url: URL of the video.
        output_path: yt-dlp output template (``outtmpl``) for the download.

    Raises:
        Exception: If metadata extraction fails, the video exceeds the
            length limit, or the download fails. The underlying yt-dlp
            error is attached as ``__cause__``.
    """
    try:
        # noplaylist keeps a "video + playlist" URL on the single video, so
        # the duration checked below belongs to what is actually downloaded.
        with youtube_dl.YoutubeDL({"noplaylist": True}) as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise Exception(f"YouTube extraction error: {str(err)}") from err

    # "duration" can be absent or None (e.g. live streams); treat it as 0
    # rather than letting the comparison below raise TypeError.
    file_length_s = (info or {}).get("duration") or 0
    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
        raise Exception(f"Video too long. Maximum: {yt_length_limit_hms}, got: {file_length_hms}")

    ydl_opts = {
        "outtmpl": output_path,
        "format": "bestaudio[ext=m4a]/bestaudio/best",
        "noplaylist": True,
        # NOTE(review): the three keys below mirror command-line flags; the
        # YoutubeDL Python API appears to ignore them unless an
        # FFmpegExtractAudio post-processor is configured - confirm. They are
        # kept unchanged so the downloaded file format stays as it was.
        "extractaudio": True,
        "audioformat": "wav",
        "audioquality": "192K",
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # download() reports failures as DownloadError; ExtractorError is
            # still caught for backward compatibility.
            raise Exception(f"YouTube download error: {str(err)}") from err
138
+
139
def process_audio_file(file_path, task="transcribe", language="auto", return_timestamps=False):
    """Run the Whisper pipeline on an audio file.

    Args:
        file_path: Path of the audio file to decode and transcribe.
        task: Value passed to generation as ``task`` (the endpoints allow
            ``"transcribe"`` and ``"translate"``).
        language: A key of ``SUPPORTED_LANGUAGES`` to force that language.
            ``"auto"`` or any unknown value leaves the language unset.
        return_timestamps: When true, chunk timestamps are requested and an
            SRT rendering is added to the result.

    Returns:
        ``{"text": ...}``, plus ``"chunks"`` and ``"srt"`` when
        ``return_timestamps`` is true.

    Raises:
        Exception: Any failure, re-raised with an "Audio processing error"
            message; the original exception is attached as ``__cause__``.
    """
    try:
        sampling_rate = pipe.feature_extractor.sampling_rate

        # Decode straight from the read so the raw file bytes (uploads may be
        # large) are not kept alive for the whole inference.
        with open(file_path, "rb") as f:
            audio = ffmpeg_read(f.read(), sampling_rate)
        inputs = {"array": audio, "sampling_rate": sampling_rate}

        generate_kwargs = {"task": task}
        if language != "auto" and language in SUPPORTED_LANGUAGES:
            # The language is passed in Whisper's token form, e.g. "<|en|>".
            generate_kwargs["language"] = f"<|{language}|>"

        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs=generate_kwargs,
            return_timestamps=return_timestamps,
        )

        if not return_timestamps:
            return {"text": result['text']}

        chunks = result['chunks']
        return {
            "text": result['text'],
            "chunks": chunks,
            "srt": chunks_to_srt(chunks),
        }

    except Exception as e:
        # logger.exception keeps the traceback; "from e" keeps the cause.
        logger.exception(f"Error processing audio: {e}")
        raise Exception(f"Audio processing error: {str(e)}") from e
170
+
171
@app.route('/health', methods=['GET'])
def health_check():
    """Report that the service is up, with its model, device and language codes."""
    payload = {
        "status": "healthy",
        "model": MODEL_NAME,
        "device": str(device),
        "supported_languages": list(SUPPORTED_LANGUAGES),
    }
    return jsonify(payload)
180
+
181
@app.route('/languages', methods=['GET'])
def get_supported_languages():
    """Return the language-code table together with the number of entries."""
    language_count = len(SUPPORTED_LANGUAGES)
    return jsonify(
        supported_languages=SUPPORTED_LANGUAGES,
        total_count=language_count,
    )
188
+
189
class _TranscribeInputError(Exception):
    """Request input was rejected; carries the JSON error text and HTTP status."""

    def __init__(self, message, status=400):
        super().__init__(message)
        self.message = message
        self.status = status


def _audio_path_for(media_path, file_extension, temp_dir):
    """Return a path to transcribe: extracted WAV for video files, else the input."""
    if file_extension not in SUPPORTED_VIDEO_FORMATS:
        return media_path
    audio_path = os.path.join(temp_dir, "extracted_audio.wav")
    if not extract_audio_from_video(media_path, audio_path):
        raise _TranscribeInputError("Failed to extract audio from video", 500)
    return audio_path


def _save_uploaded_file(upload, temp_dir):
    """Validate an uploaded file, store it in temp_dir and return its audio path."""
    if not upload.filename:
        raise _TranscribeInputError("No file selected")

    # Measure the upload by seeking to its end, then rewind for save().
    upload.seek(0, os.SEEK_END)
    file_size = upload.tell()
    upload.seek(0)
    if file_size > MAX_FILE_SIZE:
        raise _TranscribeInputError(f"File too large. Maximum size: {FILE_LIMIT_MB}MB")

    file_extension = Path(upload.filename).suffix.lower()
    if (file_extension not in SUPPORTED_VIDEO_FORMATS
            and file_extension not in SUPPORTED_AUDIO_FORMATS):
        raise _TranscribeInputError(f"Unsupported file format: {file_extension}")

    # Only the extension of the client-supplied name is reused on disk.
    saved_path = os.path.join(temp_dir, f"upload{file_extension}")
    upload.save(saved_path)
    return _audio_path_for(saved_path, file_extension, temp_dir)


def _fetch_youtube_audio(youtube_url, temp_dir):
    """Download a YouTube video's audio into temp_dir and return the file path."""
    if not youtube_url:
        raise _TranscribeInputError("YouTube URL is required")

    output_template = os.path.join(temp_dir, "youtube_audio.%(ext)s")
    try:
        download_youtube_audio(youtube_url, output_template)
    except Exception as e:
        raise _TranscribeInputError(str(e)) from e

    # The real extension is chosen by the downloader, so locate the file by
    # its name prefix.
    for name in os.listdir(temp_dir):
        if name.startswith("youtube_audio"):
            return os.path.join(temp_dir, name)
    # Without this the unexpanded template path would be opened later and
    # fail with a misleading "file not found" processing error.
    raise _TranscribeInputError("YouTube download produced no audio file", 500)


def _download_remote_file(audio_url, temp_dir):
    """Stream a remote audio/video URL into temp_dir and return its audio path."""
    if not audio_url:
        raise _TranscribeInputError("Audio URL is required")

    import requests
    from urllib.parse import urlparse

    # NOTE(review): audio_url is client-supplied and fetched from the server,
    # so it can target internal hosts (SSRF). Consider restricting the
    # destinations allowed here if this service is exposed publicly.
    try:
        # The context manager closes the streamed connection on every path.
        with requests.get(audio_url, stream=True, timeout=30) as response:
            response.raise_for_status()

            # Use only the URL path so a query string ("a.mp4?token=x") does
            # not end up inside the extension.
            file_extension = Path(urlparse(audio_url).path).suffix.lower()
            if not file_extension:
                content_type = response.headers.get('content-type', '')
                if 'audio' in content_type:
                    file_extension = '.mp3'
                elif 'video' in content_type:
                    file_extension = '.mp4'
                else:
                    file_extension = '.mp3'  # default

            download_path = os.path.join(temp_dir, f"download{file_extension}")
            bytes_written = 0
            with open(download_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    bytes_written += len(chunk)
                    # Apply the same size limit as direct uploads.
                    if bytes_written > MAX_FILE_SIZE:
                        raise _TranscribeInputError(
                            f"File too large. Maximum size: {FILE_LIMIT_MB}MB"
                        )
                    f.write(chunk)
    except requests.RequestException as e:
        raise _TranscribeInputError(f"Failed to download file: {str(e)}") from e

    return _audio_path_for(download_path, file_extension, temp_dir)


@app.route('/transcribe', methods=['POST'])
def transcribe_endpoint():
    """Transcribe or translate one input supplied as form data.

    Form fields:
        task: ``transcribe`` (default) or ``translate``.
        language: ``auto`` (default) or a key of ``SUPPORTED_LANGUAGES``.
        return_timestamps: ``true`` to include chunks and SRT output.
        file / youtube_url / audio_url: the input; the first one present,
            in that order, is used.

    Returns:
        A JSON response. On success it contains ``success``, ``task``,
        ``language``, ``return_timestamps`` and the transcription result.
        Rejected input yields ``{"error": ...}`` with status 400 (500 when
        audio extraction or the YouTube download produced no usable file);
        any other failure yields status 500.
    """
    try:
        task = request.form.get('task', 'transcribe')
        language = request.form.get('language', 'auto')
        return_timestamps = request.form.get('return_timestamps', 'false').lower() == 'true'

        if task not in ['transcribe', 'translate']:
            return jsonify({"error": "Task must be 'transcribe' or 'translate'"}), 400

        if language != 'auto' and language not in SUPPORTED_LANGUAGES:
            return jsonify({"error": f"Language '{language}' not supported"}), 400

        # All intermediate files live in a directory removed on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            if 'file' in request.files:
                temp_file_path = _save_uploaded_file(request.files['file'], temp_dir)
            elif 'youtube_url' in request.form:
                temp_file_path = _fetch_youtube_audio(request.form.get('youtube_url'), temp_dir)
            elif 'audio_url' in request.form:
                temp_file_path = _download_remote_file(request.form.get('audio_url'), temp_dir)
            else:
                return jsonify({"error": "No input provided. Use 'file', 'youtube_url', or 'audio_url'"}), 400

            result = process_audio_file(temp_file_path, task, language, return_timestamps)

            return jsonify({
                "success": True,
                "task": task,
                "language": language,
                "return_timestamps": return_timestamps,
                **result
            })

    except _TranscribeInputError as e:
        return jsonify({"error": e.message}), e.status
    except Exception as e:
        logger.error(f"Transcription error: {e}")
        return jsonify({"error": str(e)}), 500
308
+
309
def _transcribe_batch_item(upload, idx, task, language, return_timestamps):
    """Validate, store and transcribe one uploaded file of a batch.

    Returns the dict produced by ``process_audio_file``. Every failure is
    raised as an exception whose message becomes that file's error text.
    """
    if not upload.filename:
        raise ValueError("No file selected")

    # Same per-file limit as the single-file endpoint: measure, then rewind.
    upload.seek(0, os.SEEK_END)
    file_size = upload.tell()
    upload.seek(0)
    if file_size > MAX_FILE_SIZE:
        raise ValueError(f"File too large. Maximum size: {FILE_LIMIT_MB}MB")

    file_extension = Path(upload.filename).suffix.lower()
    is_video = file_extension in SUPPORTED_VIDEO_FORMATS
    if not is_video and file_extension not in SUPPORTED_AUDIO_FORMATS:
        raise ValueError(f"Unsupported file format: {file_extension}")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, f"batch_{idx}{file_extension}")
        upload.save(temp_file_path)

        if is_video:
            audio_path = os.path.join(temp_dir, f"batch_{idx}_audio.wav")
            if not extract_audio_from_video(temp_file_path, audio_path):
                raise RuntimeError("Failed to extract audio from video")
            temp_file_path = audio_path

        return process_audio_file(temp_file_path, task, language, return_timestamps)


@app.route('/batch_transcribe', methods=['POST'])
def batch_transcribe_endpoint():
    """Transcribe or translate up to 10 uploaded files in one request.

    Form fields:
        files: the uploads (1 to 10).
        task, language, return_timestamps: as for ``/transcribe``; they are
            validated the same way and apply to every file.

    Returns:
        A JSON response with ``success``, ``batch_size`` and ``results``,
        one entry per file in upload order. A file that fails gets
        ``"success": False`` and an ``"error"`` message without aborting
        the rest of the batch. Request-level problems yield
        ``{"error": ...}`` with status 400, unexpected failures status 500.
    """
    try:
        files = request.files.getlist('files')
        task = request.form.get('task', 'transcribe')
        language = request.form.get('language', 'auto')
        return_timestamps = request.form.get('return_timestamps', 'false').lower() == 'true'

        if not files:
            return jsonify({"error": "No files provided"}), 400

        if len(files) > 10:  # Limit batch size
            return jsonify({"error": "Maximum 10 files per batch"}), 400

        # Reject bad parameters once, up front, exactly like /transcribe,
        # instead of failing every file or silently ignoring the language.
        if task not in ['transcribe', 'translate']:
            return jsonify({"error": "Task must be 'transcribe' or 'translate'"}), 400

        if language != 'auto' and language not in SUPPORTED_LANGUAGES:
            return jsonify({"error": f"Language '{language}' not supported"}), 400

        results = []
        for idx, file in enumerate(files):
            try:
                result = _transcribe_batch_item(file, idx, task, language, return_timestamps)
            except Exception as e:
                # Record the failure for this file and continue with the rest.
                logger.error(f"Batch item error ({file.filename}): {e}")
                results.append({
                    "filename": file.filename,
                    "success": False,
                    "error": str(e)
                })
            else:
                results.append({
                    "filename": file.filename,
                    "success": True,
                    **result
                })

        return jsonify({
            "success": True,
            "batch_size": len(files),
            "results": results
        })

    except Exception as e:
        logger.error(f"Batch transcription error: {e}")
        return jsonify({"error": str(e)}), 500
371
+
372
# Extension hooks for future plugins
class ExtensionManager:
    """Registry of named hook points and the callbacks attached to them.

    ``hooks`` maps each known hook name to the list of callbacks registered
    for it, in registration order.
    """

    def __init__(self):
        self.hooks = {
            'before_transcription': [],
            'after_transcription': [],
            'before_translation': [],
            'after_translation': []
        }

    def register_hook(self, hook_name, callback):
        """Attach ``callback`` to ``hook_name``.

        Args:
            hook_name: One of the keys of ``self.hooks``.
            callback: Callable taking the hook data and returning the
                (possibly modified) data.

        Returns:
            True if the callback was registered. False if ``hook_name`` is
            unknown; the callback is then dropped and a warning is logged so
            a misspelled hook name does not go unnoticed.
        """
        if hook_name not in self.hooks:
            logger.warning(f"Ignoring callback for unknown hook '{hook_name}'")
            return False
        self.hooks[hook_name].append(callback)
        return True

    def run_hooks(self, hook_name, data):
        """Pass ``data`` through every callback registered for ``hook_name``.

        Each callback receives the value returned by the previous one. A
        callback that raises is logged with its traceback and skipped; the
        data it was given is handed on unchanged. An unknown hook name
        returns ``data`` untouched.
        """
        for callback in self.hooks.get(hook_name, []):
            try:
                data = callback(data)
            except Exception as e:
                logger.exception(f"Hook error in {hook_name}: {e}")
        return data


# Global extension manager shared by the whole module
extension_manager = ExtensionManager()
396
+
397
@app.route('/extensions/hooks', methods=['GET'])
def get_extension_hooks():
    """List the hook names known to the global extension manager."""
    hook_names = list(extension_manager.hooks)
    return jsonify(
        available_hooks=hook_names,
        description="Extension hooks for plugins like CSS customization, myCred integration, etc.",
    )
404
+
405
# Script entry point: start the Flask server on all network interfaces
# (0.0.0.0), port 7860, with debug mode off. Not executed on import.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)