Pragmaticl committed on
Commit
ce4fb96
·
verified ·
1 Parent(s): 63396da

Upload 2 files

Browse files
Files changed (2) hide show
  1. app (1) (22).py +261 -0
  2. app (1) (23).py +456 -0
app (1) (22).py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import zipfile
4
+ import tempfile
5
+ from pathlib import Path
6
+ from faster_whisper import WhisperModel
7
+ import librosa
8
+ import soundfile as sf
9
+ import pandas as pd
10
+ import numpy as np
11
+ from typing import List, Tuple
12
+ import shutil
13
+
14
# Initialise the Whisper model once at import time so every request reuses it.
# Runs the "large-v3-turbo" checkpoint on CPU with int8 compute.
model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")
16
+
17
def extract_audio_files(input_file: str, temp_dir: str) -> List[str]:
    """Collect the audio files contained in an upload.

    If ``input_file`` is a ZIP archive (extension matched case-insensitively)
    it is extracted into ``temp_dir`` and every audio file found beneath that
    directory is returned. Otherwise ``input_file`` itself is returned when
    its extension is one of the supported audio formats.

    Args:
        input_file: Path of the uploaded audio file or ZIP archive.
        temp_dir: Directory the archive is extracted into.

    Returns:
        Sorted list of audio file paths; empty when nothing usable is found.

    Raises:
        zipfile.BadZipFile: If a ``.zip`` upload is not a valid archive.
    """
    audio_extensions = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'}
    suffix = Path(input_file).suffix.lower()

    if suffix != '.zip':
        # Single-file upload: use it in place, nothing is copied.
        return [input_file] if suffix in audio_extensions else []

    with zipfile.ZipFile(input_file, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    audio_files = []
    for root, dirs, files in os.walk(temp_dir):
        # Archives created on macOS carry resource-fork metadata such as
        # "__MACOSX/._song.wav": audio extension, but not decodable audio.
        dirs[:] = [d for d in dirs if d != '__MACOSX']
        for file in files:
            if file.startswith('._'):
                continue
            if Path(file).suffix.lower() in audio_extensions:
                audio_files.append(os.path.join(root, file))

    # os.walk order depends on the filesystem; sort for reproducible output.
    return sorted(audio_files)
35
+
36
def transcribe_with_timestamps(audio_path: str) -> List[dict]:
    """Run the module-level Whisper model on one file and return its segments.

    Transcription uses beam size 5 with the VAD filter enabled (minimum
    silence of 500 ms). Each returned dict holds the segment's ``start`` and
    ``end`` as reported by the model and its whitespace-stripped ``text``.
    """
    vad_options = dict(min_silence_duration_ms=500)
    segment_stream, _info = model.transcribe(
        audio_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters=vad_options,
    )

    # Iterating the returned segments is what builds the result list.
    return [
        {'start': piece.start, 'end': piece.end, 'text': piece.text.strip()}
        for piece in segment_stream
    ]
54
+
55
def merge_short_segments(segments: List[dict], min_duration: float = 2.0) -> List[dict]:
    """Merge segments shorter than ``min_duration`` into their neighbours.

    Walks the segments in order, extending the current segment with the
    following ones until it lasts at least ``min_duration``. A short segment
    left over at the very end has no successor, so it is folded into the
    previous merged segment instead of being emitted on its own.

    Args:
        segments: Dicts with ``start``, ``end`` and ``text`` keys, in
            chronological order. The input dicts are not modified.
        min_duration: Minimum length of an output segment, in the same unit
            as ``start``/``end``.

    Returns:
        A new list of merged segment dicts; empty if ``segments`` is empty.
        A lone segment is returned as-is even when it is short.
    """
    if not segments:
        return []

    merged = []
    current = segments[0].copy()

    for seg in segments[1:]:
        if current['end'] - current['start'] < min_duration:
            # Still too short: absorb the next segment.
            current['end'] = seg['end']
            current['text'] = current['text'] + ' ' + seg['text']
        else:
            merged.append(current)
            current = seg.copy()

    if merged and current['end'] - current['start'] < min_duration:
        # Trailing short segment: append it to the previous one.
        previous = merged[-1]
        previous['end'] = current['end']
        previous['text'] = previous['text'] + ' ' + current['text']
    else:
        merged.append(current)

    return merged
76
+
77
def cut_audio_by_timestamps(audio_path: str, segments: List[dict], output_dir: str, base_name: str) -> List[dict]:
    """Write one WAV clip per segment and describe each clip.

    The source is loaded once with ``sr=None``; every segment's ``start`` and
    ``end`` is multiplied by the returned rate to obtain the slice bounds.
    Clips are named ``<base_name>_<NNNNN>.wav`` with a 1-based, zero-padded
    index and written into ``output_dir``.

    Returns:
        One dict per segment with ``audio_path`` (the written file),
        ``transcription`` (the segment text) and ``file_name``
        (``audio/<clip name>``).
    """
    samples, rate = librosa.load(audio_path, sr=None)

    def _export(number: int, segment: dict) -> dict:
        # Slice the waveform for this segment and write it to disk.
        clip_name = f"{base_name}_{number:05d}.wav"
        clip_path = os.path.join(output_dir, clip_name)
        first = int(segment['start'] * rate)
        last = int(segment['end'] * rate)
        sf.write(clip_path, samples[first:last], rate)
        return {
            'audio_path': clip_path,
            'transcription': segment['text'],
            'file_name': f"audio/{clip_name}",
        }

    return [_export(number, segment) for number, segment in enumerate(segments, start=1)]
101
+
102
def save_to_parquet(records: List[dict], output_dir: str, max_size_mb: int = 500) -> List[str]:
    """Store clip records in Parquet, splitting the output when it is too large.

    Each record's ``audio_path`` file is read and embedded as bytes in an
    ``audio`` column next to ``transcription`` and ``file_name``.

    Args:
        records: Dicts with ``audio_path``, ``transcription`` and
            ``file_name`` keys.
        output_dir: Existing directory the Parquet file(s) are written to.
        max_size_mb: Size threshold (MiB) above which the data is split.

    Returns:
        Paths of the Parquet files written: ``dataset.parquet`` when the data
        fits in one file, otherwise ``dataset_partNNN.parquet`` parts. Empty
        list when ``records`` is empty (nothing is written).
    """
    if not records:
        # An empty frame has no 'audio_path' column; nothing to store.
        return []

    df = pd.DataFrame(records)
    df['audio'] = [Path(path).read_bytes() for path in df['audio_path']]
    df = df[['audio', 'transcription', 'file_name']]

    # Write everything once to measure the real on-disk size.
    temp_path = os.path.join(output_dir, 'temp.parquet')
    df.to_parquet(temp_path, engine='pyarrow')
    file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

    if file_size_mb <= max_size_mb:
        # The measurement file already is the final dataset: just rename it.
        output_path = os.path.join(output_dir, 'dataset.parquet')
        os.replace(temp_path, output_path)
        return [output_path]

    os.remove(temp_path)

    num_parts = int(np.ceil(file_size_mb / max_size_mb))
    # Ceiling division so that no trailing part ends up empty.
    chunk_size = int(np.ceil(len(df) / num_parts))

    parquet_files = []
    for part, start_idx in enumerate(range(0, len(df), chunk_size), start=1):
        output_path = os.path.join(output_dir, f'dataset_part{part:03d}.parquet')
        df.iloc[start_idx:start_idx + chunk_size].to_parquet(output_path, engine='pyarrow')
        parquet_files.append(output_path)

    return parquet_files
143
+
144
def process_audio(input_file):
    """Turn an uploaded audio file or ZIP into a downloadable dataset ZIP.

    Pipeline: collect audio files, transcribe each, merge short segments,
    cut one WAV clip per segment, store the clips in Parquet, and bundle the
    clips (under ``audio/``) together with the Parquet file(s) into a ZIP.

    Args:
        input_file: Path of the uploaded file, or ``None`` if nothing was
            uploaded.

    Returns:
        ``(zip_path, summary)`` on success, or ``(None, message)`` when there
        is no upload, no audio file, or no transcribed segment.
    """
    if input_file is None:
        return None, "Vui lòng upload file audio hoặc file zip!"

    with tempfile.TemporaryDirectory() as temp_dir:
        extract_dir = os.path.join(temp_dir, 'extracted')
        audio_output_dir = os.path.join(temp_dir, 'audio')
        final_output_dir = os.path.join(temp_dir, 'output')
        for directory in (extract_dir, audio_output_dir, final_output_dir):
            os.makedirs(directory, exist_ok=True)

        audio_files = extract_audio_files(input_file, extract_dir)
        if not audio_files:
            return None, "Không tìm thấy file audio nào!"

        all_records = []
        used_names = set()

        for audio_file in audio_files:
            # Files from different folders of a ZIP may share a stem; give
            # each a unique prefix so their clips do not overwrite each other.
            stem = Path(audio_file).stem
            base_name = stem
            duplicate = 1
            while base_name in used_names:
                duplicate += 1
                base_name = f"{stem}_{duplicate}"
            used_names.add(base_name)

            segments = transcribe_with_timestamps(audio_file)
            merged_segments = merge_short_segments(segments, min_duration=2.0)
            records = cut_audio_by_timestamps(
                audio_file,
                merged_segments,
                audio_output_dir,
                base_name
            )
            all_records.extend(records)

        if not all_records:
            # Nothing was transcribed (e.g. silence only): no dataset to build.
            return None, "Không phát hiện được đoạn giọng nói nào trong audio!"

        parquet_files = save_to_parquet(all_records, final_output_dir)

        # The ZIP must outlive temp_dir so Gradio can serve it. A fresh
        # directory per request keeps concurrent requests from overwriting
        # each other's output while preserving the download file name.
        download_dir = tempfile.mkdtemp(prefix='dataset_')
        final_zip = os.path.join(download_dir, 'dataset_output.zip')
        with zipfile.ZipFile(final_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(audio_output_dir):
                for file in files:
                    zipf.write(os.path.join(root, file), os.path.join('audio', file))

            for pq_file in parquet_files:
                zipf.write(pq_file, os.path.basename(pq_file))

        summary = f"""
✅ Xử lý thành công!
- Số file audio đầu vào: {len(audio_files)}
- Số segment đã tạo: {len(all_records)}
- Số file parquet: {len(parquet_files)}
- File zip đầu ra: dataset_output.zip
"""

        return final_zip, summary
221
+
222
# Build the Gradio interface
with gr.Blocks(title="Audio Transcription & Dataset Creator") as app:
    # Page header describing the three pipeline steps
    gr.Markdown("""
# 🎙️ Audio Transcription & Dataset Creator
Upload file audio hoặc file zip chứa nhiều file audio.
Hệ thống sẽ:
1. Transcribe bằng Whisper Large-v3-Turbo
2. Cắt audio theo timestamps (gộp câu ngắn)
3. Tạo dataset Parquet chuẩn với audio bytes
""")

    with gr.Row():
        # Left column: upload widget and the button that starts processing
        with gr.Column():
            input_file = gr.File(
                label="Upload Audio File hoặc ZIP",
                file_types=['.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac', '.zip']
            )
            process_btn = gr.Button("🚀 Bắt đầu xử lý", variant="primary")

        # Right column: resulting ZIP download and the status/summary text
        with gr.Column():
            output_file = gr.File(label="📦 Tải về Dataset ZIP")
            status_text = gr.Textbox(label="📊 Trạng thái", lines=8)

    # process_audio returns (zip path or None, message), matching the outputs
    process_btn.click(
        fn=process_audio,
        inputs=input_file,
        outputs=[output_file, status_text]
    )

    # Notes about the produced dataset layout
    gr.Markdown("""
### 📝 Ghi chú:
- Dataset Parquet sẽ được chia nhỏ nếu > 500MB
- Cột `audio`: audio bytes (binary)
- Cột `transcription`: văn bản transcription
- Cột `file_name`: đường dẫn dạng `audio/filename_00001.wav`
- Các câu ngắn (< 2s) sẽ được gộp lại
""")

# Start the web app only when the file is run as a script
if __name__ == "__main__":
    app.launch()
app (1) (23).py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import zipfile
4
+ import tempfile
5
+ from pathlib import Path
6
+ from faster_whisper import WhisperModel
7
+ import librosa
8
+ import soundfile as sf
9
+ import pandas as pd
10
+ import numpy as np
11
+ from typing import List, Dict
12
+ import shutil
13
+ import threading
14
+ import time
15
+ from datetime import datetime
16
+ import json
17
+ import traceback
18
+
19
# Initialise the Whisper model once at import time so every task reuses it.
model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")

# Task registry and history storage
TASKS = {}  # task_id -> task info dict
TASK_LOCK = threading.Lock()  # held while TASKS is read or modified
STORAGE_DIR = "task_storage"  # holds <task_id>.json files and per-task folders
os.makedirs(STORAGE_DIR, exist_ok=True)
27
+
28
class TaskStatus:
    """Display strings for the states a task can be in.

    The values are stored verbatim in a task's ``status`` field.
    """
    WAITING = "⏳ Đang chờ"  # assigned when the task is created
    PROCESSING = "🔄 Đang xử lý"  # assigned while the background worker runs
    SUCCESS = "✅ Thành công"  # assigned after the output ZIP has been written
    ERROR = "❌ Lỗi"  # assigned when processing fails or finds no audio
33
+
34
def extract_audio_files(input_file: str, temp_dir: str) -> List[str]:
    """Collect the audio files contained in an upload.

    If ``input_file`` is a ZIP archive (extension matched case-insensitively)
    it is extracted into ``temp_dir`` and every audio file found beneath that
    directory is returned. Otherwise ``input_file`` itself is returned when
    its extension is one of the supported audio formats.

    Args:
        input_file: Path of the uploaded audio file or ZIP archive.
        temp_dir: Directory the archive is extracted into.

    Returns:
        Sorted list of audio file paths; empty when nothing usable is found.

    Raises:
        zipfile.BadZipFile: If a ``.zip`` upload is not a valid archive.
    """
    audio_extensions = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'}
    suffix = Path(input_file).suffix.lower()

    if suffix != '.zip':
        # Single-file upload: use it in place, nothing is copied.
        return [input_file] if suffix in audio_extensions else []

    with zipfile.ZipFile(input_file, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    audio_files = []
    for root, dirs, files in os.walk(temp_dir):
        # Archives created on macOS carry resource-fork metadata such as
        # "__MACOSX/._song.wav": audio extension, but not decodable audio.
        dirs[:] = [d for d in dirs if d != '__MACOSX']
        for file in files:
            if file.startswith('._'):
                continue
            if Path(file).suffix.lower() in audio_extensions:
                audio_files.append(os.path.join(root, file))

    # os.walk order depends on the filesystem; sort for reproducible output.
    return sorted(audio_files)
52
+
53
def transcribe_with_timestamps(audio_path: str) -> List[dict]:
    """Run the module-level Whisper model on one file and return its segments.

    Transcription uses beam size 5 with the VAD filter enabled (minimum
    silence of 500 ms). Each returned dict holds the segment's ``start`` and
    ``end`` as reported by the model and its whitespace-stripped ``text``.
    """
    vad_options = dict(min_silence_duration_ms=500)
    segment_stream, _info = model.transcribe(
        audio_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters=vad_options,
    )

    # Iterating the returned segments is what builds the result list.
    return [
        {'start': piece.start, 'end': piece.end, 'text': piece.text.strip()}
        for piece in segment_stream
    ]
71
+
72
def merge_short_segments(segments: List[dict], min_duration: float = 2.0) -> List[dict]:
    """Merge segments shorter than ``min_duration`` into their neighbours.

    Walks the segments in order, extending the current segment with the
    following ones until it lasts at least ``min_duration``. A short segment
    left over at the very end has no successor, so it is folded into the
    previous merged segment instead of being emitted on its own.

    Args:
        segments: Dicts with ``start``, ``end`` and ``text`` keys, in
            chronological order. The input dicts are not modified.
        min_duration: Minimum length of an output segment, in the same unit
            as ``start``/``end``.

    Returns:
        A new list of merged segment dicts; empty if ``segments`` is empty.
        A lone segment is returned as-is even when it is short.
    """
    if not segments:
        return []

    merged = []
    current = segments[0].copy()

    for seg in segments[1:]:
        if current['end'] - current['start'] < min_duration:
            # Still too short: absorb the next segment.
            current['end'] = seg['end']
            current['text'] = current['text'] + ' ' + seg['text']
        else:
            merged.append(current)
            current = seg.copy()

    if merged and current['end'] - current['start'] < min_duration:
        # Trailing short segment: append it to the previous one.
        previous = merged[-1]
        previous['end'] = current['end']
        previous['text'] = previous['text'] + ' ' + current['text']
    else:
        merged.append(current)

    return merged
92
+
93
def cut_audio_by_timestamps(audio_path: str, segments: List[dict], output_dir: str, base_name: str) -> List[dict]:
    """Write one WAV clip per segment and describe each clip.

    The source is loaded once with ``sr=None``; every segment's ``start`` and
    ``end`` is multiplied by the returned rate to obtain the slice bounds.
    Clips are named ``<base_name>_<NNNNN>.wav`` with a 1-based, zero-padded
    index and written into ``output_dir``.

    Returns:
        One dict per segment with ``audio_path`` (the written file),
        ``transcription`` (the segment text) and ``file_name``
        (``audio/<clip name>``).
    """
    samples, rate = librosa.load(audio_path, sr=None)

    def _export(number: int, segment: dict) -> dict:
        # Slice the waveform for this segment and write it to disk.
        clip_name = f"{base_name}_{number:05d}.wav"
        clip_path = os.path.join(output_dir, clip_name)
        first = int(segment['start'] * rate)
        last = int(segment['end'] * rate)
        sf.write(clip_path, samples[first:last], rate)
        return {
            'audio_path': clip_path,
            'transcription': segment['text'],
            'file_name': f"audio/{clip_name}",
        }

    return [_export(number, segment) for number, segment in enumerate(segments, start=1)]
117
+
118
def save_to_parquet(records: List[dict], output_dir: str, max_size_mb: int = 500) -> List[str]:
    """Store clip records in Parquet shards named ``train-XXXXX-of-YYYYY``.

    Each record's ``audio_path`` file is read and embedded as bytes in an
    ``audio`` column next to ``transcription`` and ``file_name``.

    Args:
        records: Dicts with ``audio_path``, ``transcription`` and
            ``file_name`` keys.
        output_dir: Existing directory the Parquet file(s) are written to.
        max_size_mb: Size threshold (MiB) above which the data is split.

    Returns:
        Paths of the Parquet files written: a single
        ``train-00000-of-00001.parquet`` when the data fits in one file,
        otherwise several shards. Empty list when ``records`` is empty
        (nothing is written).
    """
    if not records:
        # An empty frame has no 'audio_path' column; nothing to store.
        return []

    df = pd.DataFrame(records)
    df['audio'] = [Path(path).read_bytes() for path in df['audio_path']]
    df = df[['audio', 'transcription', 'file_name']]

    # Write everything once to measure the real on-disk size.
    temp_path = os.path.join(output_dir, 'temp.parquet')
    df.to_parquet(temp_path, engine='pyarrow')
    file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

    if file_size_mb <= max_size_mb:
        # The measurement file already is the final dataset: just rename it.
        output_path = os.path.join(output_dir, 'train-00000-of-00001.parquet')
        os.replace(temp_path, output_path)
        return [output_path]

    os.remove(temp_path)

    num_parts = int(np.ceil(file_size_mb / max_size_mb))
    # Ceiling division so that no trailing shard ends up empty.
    chunk_size = int(np.ceil(len(df) / num_parts))
    starts = range(0, len(df), chunk_size)
    # The "-of-" total must match the number of shards actually written.
    total = len(starts)

    parquet_files = []
    for i, start_idx in enumerate(starts):
        output_path = os.path.join(output_dir, f'train-{i:05d}-of-{total:05d}.parquet')
        df.iloc[start_idx:start_idx + chunk_size].to_parquet(output_path, engine='pyarrow')
        parquet_files.append(output_path)

    return parquet_files
157
+
158
def update_task_status(task_id: str, status: str, details: dict = None):
    """Set a known task's status and write the task to its JSON file.

    Under ``TASK_LOCK``: updates ``status`` and ``updated_at``, merges
    ``details`` into the task when given, then dumps the task dict to
    ``<STORAGE_DIR>/<task_id>.json``. Unknown task ids are ignored.
    """
    with TASK_LOCK:
        if task_id not in TASKS:
            return

        entry = TASKS[task_id]
        entry['status'] = status
        entry['updated_at'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if details:
            # Applied last, so keys in details override the fields set above.
            entry.update(details)

        # Persist the task to disk
        json_path = os.path.join(STORAGE_DIR, f"{task_id}.json")
        with open(json_path, 'w', encoding='utf-8') as handle:
            json.dump(entry, handle, ensure_ascii=False, indent=2)
170
+
171
def process_audio_background(task_id: str, input_file: str, original_filename: str):
    """Run the full dataset pipeline for one task and record its progress.

    Intended as a thread target: it never raises. Progress, the final
    result, or the error (message plus traceback) is reported through
    ``update_task_status``. On success the task gets ``output_zip`` pointing
    at ``<STORAGE_DIR>/<task_id>/dataset_output.zip``.

    Args:
        task_id: Id of the task being processed.
        input_file: Path of the uploaded audio file or ZIP archive.
        original_filename: Name of the upload; not used by the pipeline
            itself (kept for the caller's signature).
    """
    work_dirs = ()
    try:
        update_task_status(task_id, TaskStatus.PROCESSING, {
            'progress': 'Đang giải nén và phát hiện file audio...'
        })

        task_dir = os.path.join(STORAGE_DIR, task_id)
        extract_dir = os.path.join(task_dir, 'extracted')
        audio_output_dir = os.path.join(task_dir, 'audio')
        final_output_dir = os.path.join(task_dir, 'output')
        work_dirs = (extract_dir, audio_output_dir, final_output_dir)

        os.makedirs(task_dir, exist_ok=True)
        for directory in work_dirs:
            os.makedirs(directory, exist_ok=True)

        # Extract the upload and list its audio files
        audio_files = extract_audio_files(input_file, extract_dir)

        if not audio_files:
            update_task_status(task_id, TaskStatus.ERROR, {
                'error': 'Không tìm thấy file audio nào trong file tải lên!'
            })
            return

        update_task_status(task_id, TaskStatus.PROCESSING, {
            'progress': f'Tìm thấy {len(audio_files)} file audio. Đang transcribe...',
            'total_files': len(audio_files)
        })

        all_records = []
        used_names = set()

        for idx, audio_file in enumerate(audio_files):
            update_task_status(task_id, TaskStatus.PROCESSING, {
                'progress': f'Đang xử lý file {idx+1}/{len(audio_files)}: {Path(audio_file).name}'
            })

            # Files from different folders of a ZIP may share a stem; give
            # each a unique prefix so their clips do not overwrite each other.
            stem = Path(audio_file).stem
            base_name = stem
            duplicate = 1
            while base_name in used_names:
                duplicate += 1
                base_name = f"{stem}_{duplicate}"
            used_names.add(base_name)

            segments = transcribe_with_timestamps(audio_file)
            merged_segments = merge_short_segments(segments, min_duration=2.0)
            records = cut_audio_by_timestamps(
                audio_file,
                merged_segments,
                audio_output_dir,
                base_name
            )
            all_records.extend(records)

        if not all_records:
            # Nothing was transcribed (e.g. silence only): report it plainly
            # instead of failing later with an opaque traceback.
            update_task_status(task_id, TaskStatus.ERROR, {
                'error': 'Không phát hiện được đoạn giọng nói nào trong các file audio!'
            })
            return

        update_task_status(task_id, TaskStatus.PROCESSING, {
            'progress': f'Đã tạo {len(all_records)} segments. Đang lưu vào Parquet...'
        })

        parquet_files = save_to_parquet(all_records, final_output_dir)

        # Bundle clips (under audio/) and Parquet files straight from where
        # they were written; no intermediate copy of the audio is needed.
        zip_path = os.path.join(task_dir, 'dataset_output.zip')
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(audio_output_dir):
                for file in files:
                    zipf.write(os.path.join(root, file), os.path.join('audio', file))

            for pq_file in parquet_files:
                zipf.write(pq_file, os.path.basename(pq_file))

        zip_size_mb = os.path.getsize(zip_path) / (1024 * 1024)

        update_task_status(task_id, TaskStatus.SUCCESS, {
            'progress': 'Hoàn thành!',
            'input_files': len(audio_files),
            'total_segments': len(all_records),
            'parquet_files': len(parquet_files),
            'output_zip': zip_path,
            'zip_size_mb': round(zip_size_mb, 2)
        })

    except Exception as e:
        # Thread boundary: surface any failure through the task record.
        error_msg = f"{str(e)}\n\n{traceback.format_exc()}"
        update_task_status(task_id, TaskStatus.ERROR, {
            'error': error_msg
        })
    finally:
        # Task storage is persistent; only the ZIP is referenced afterwards,
        # so drop the intermediate folders to avoid keeping the audio 3x.
        for directory in work_dirs:
            shutil.rmtree(directory, ignore_errors=True)
271
+
272
def submit_task(input_file):
    """Register a new task for an upload and start it on a daemon thread.

    Args:
        input_file: Path of the uploaded file, or ``None`` if nothing was
            uploaded.

    Returns:
        ``(message, task_id)``; ``task_id`` is ``""`` when no file was given.
    """
    if input_file is None:
        return "❌ Vui lòng upload file audio hoặc file zip!", ""

    import uuid  # local import: only needed to build the task id

    # A millisecond timestamp alone can repeat for simultaneous submissions,
    # which would make two tasks share one record and one folder. The random
    # suffix keeps ids unique while the timestamp keeps them readable.
    task_id = f"task_{int(time.time() * 1000)}_{uuid.uuid4().hex[:8]}"
    original_filename = Path(input_file).name
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    task_info = {
        'task_id': task_id,
        'status': TaskStatus.WAITING,
        'created_at': now,
        'updated_at': now,
        'original_filename': original_filename,
        'progress': 'Task đã được tạo, đang chờ xử lý...'
    }

    with TASK_LOCK:
        TASKS[task_id] = task_info

    # Run the pipeline in the background so the UI call returns immediately
    thread = threading.Thread(
        target=process_audio_background,
        args=(task_id, input_file, original_filename),
        daemon=True
    )
    thread.start()

    return f"✅ Task {task_id} đã được tạo và đang xử lý trong background!", task_id
301
+
302
def load_all_tasks():
    """Load tasks saved in ``STORAGE_DIR`` that are not yet in ``TASKS``.

    Every ``<task_id>.json`` file whose id is unknown is parsed and added to
    ``TASKS`` under ``TASK_LOCK``. Files that cannot be read, are not valid
    JSON, or do not contain a JSON object are skipped, so one damaged file
    cannot break startup or the history view.
    """
    suffix = '.json'
    with TASK_LOCK:
        for file in os.listdir(STORAGE_DIR):
            if not file.endswith(suffix):
                continue

            # Strip only the trailing extension (str.replace would also
            # remove ".json" occurring elsewhere in the name).
            task_id = file[:-len(suffix)]
            if task_id in TASKS:
                continue

            try:
                with open(os.path.join(STORAGE_DIR, file), 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError):
                # ValueError covers json.JSONDecodeError and decoding errors,
                # e.g. a file left half-written by an interrupted process.
                continue

            if isinstance(data, dict):
                TASKS[task_id] = data
311
+
312
def get_task_list():
    """Return dropdown choices for all known tasks, newest first.

    Tasks saved on disk are loaded first. Each choice is a
    ``(label, task_id)`` tuple whose label reads
    ``"<task_id> - <status> - <original_filename>"``; ordering is by the
    ``created_at`` string, descending.
    """
    load_all_tasks()
    with TASK_LOCK:
        newest_first = sorted(
            TASKS.values(),
            key=lambda x: x['created_at'],
            reverse=True,
        )
        choices = []
        for entry in newest_first:
            label = f"{entry['task_id']} - {entry['status']} - {entry['original_filename']}"
            choices.append((label, entry['task_id']))
    return choices
322
+
323
def get_task_info(task_id):
    """Build the Markdown report for a task and locate its download.

    Returns:
        ``(markdown, zip_path)``. ``zip_path`` is the task's ``output_zip``
        when the task succeeded and that file exists, otherwise ``None``.
        A placeholder message is returned when ``task_id`` is empty or
        unknown.
    """
    if not task_id:
        return "Chọn task để xem thông tin", None

    load_all_tasks()

    with TASK_LOCK:
        task = TASKS.get(task_id)
        if task is None:
            return "Task không tồn tại!", None

        state = task['status']

        # Header shared by every status
        report = f"""
## 📋 Thông tin Task: {task_id}

**Trạng thái:** {task['status']}
**File gốc:** {task.get('original_filename', 'N/A')}
**Thời gian tạo:** {task['created_at']}
**Cập nhật lần cuối:** {task['updated_at']}

---

### 📊 Chi tiết

**Tiến trình:** {task.get('progress', 'N/A')}
"""

        download = None

        if state == TaskStatus.SUCCESS:
            report += f"""
**Số file audio đầu vào:** {task.get('input_files', 'N/A')}
**Tổng số segments:** {task.get('total_segments', 'N/A')}
**Số file Parquet:** {task.get('parquet_files', 'N/A')}
**Kích thước ZIP:** {task.get('zip_size_mb', 'N/A')} MB
"""
            # Offer the ZIP only if it is still present on disk
            candidate = task.get('output_zip')
            if candidate and os.path.exists(candidate):
                download = candidate

        elif state == TaskStatus.ERROR:
            report += f"""
**Lỗi:**
```
{task.get('error', 'Unknown error')}
```
"""

        return report, download
371
+
372
def refresh_task_list():
    """Rebuild the task dropdown, selecting the first listed task if any."""
    choices = get_task_list()
    if choices:
        selected = choices[0][1]
    else:
        selected = None
    return gr.Dropdown(choices=choices, value=selected)
376
+
377
# Load previously saved tasks at startup
load_all_tasks()

# Compute the dropdown's initial choices once instead of re-scanning the
# storage directory for each of the `choices` and `value` arguments.
_initial_task_choices = get_task_list()

# Build the Gradio interface
with gr.Blocks(title="Audio Transcription & Dataset Creator", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
# 🎙️ Audio Transcription & Dataset Creator with Background Processing
Upload file audio hoặc file zip - Hệ thống xử lý trong background và lưu lịch sử
""")

    with gr.Tabs():
        # Upload tab: choose a file and submit it as a background task
        with gr.Tab("📤 Upload & Submit"):
            gr.Markdown("### Tải lên file và submit task")

            with gr.Row():
                with gr.Column():
                    input_file = gr.File(
                        label="Upload Audio File hoặc ZIP",
                        file_types=['.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac', '.zip']
                    )
                    submit_btn = gr.Button("🚀 Submit Task", variant="primary", size="lg")

                with gr.Column():
                    submit_status = gr.Textbox(label="📋 Trạng thái Submit", lines=3)
                    # Hidden field receiving the id returned by submit_task
                    current_task_id = gr.Textbox(label="Task ID", visible=False)

            gr.Markdown("""
### ℹ️ Hướng dẫn:
1. Upload file audio hoặc ZIP chứa nhiều file audio
2. Click "Submit Task" - task sẽ chạy trong background
3. Chuyển sang tab "History" để xem tiến trình và tải kết quả
""")

        # History tab: inspect tasks and download finished datasets
        with gr.Tab("📜 History"):
            gr.Markdown("### Xem lại lịch sử tasks và tải kết quả")

            with gr.Row():
                refresh_btn = gr.Button("🔄 Refresh", size="sm")
                task_dropdown = gr.Dropdown(
                    label="Chọn Task",
                    choices=_initial_task_choices,
                    value=_initial_task_choices[0][1] if _initial_task_choices else None,
                    interactive=True
                )

            task_info_display = gr.Markdown("Chọn task để xem thông tin")

            download_btn = gr.File(label="📦 Tải về Dataset ZIP")

            # Re-query the selected task every 3 seconds
            timer = gr.Timer(3)

    # Event handlers
    submit_btn.click(
        fn=submit_task,
        inputs=input_file,
        outputs=[submit_status, current_task_id]
    )

    refresh_btn.click(
        fn=refresh_task_list,
        outputs=task_dropdown
    )

    task_dropdown.change(
        fn=get_task_info,
        inputs=task_dropdown,
        outputs=[task_info_display, download_btn]
    )

    timer.tick(
        fn=get_task_info,
        inputs=task_dropdown,
        outputs=[task_info_display, download_btn]
    )

# Start the web app only when the file is run as a script
if __name__ == "__main__":
    app.launch()