# DatasetChecker / analysis / standard.py
# (Hugging Face Hub viewer header removed — uploader: archivartaunik,
#  commit a7d909c "Update analysis/standard.py"; the raw lines were not valid Python.)
"""Standard analysis with batch mode support."""
import os
import time
import tempfile
import re
import gradio as gr
import soundfile as sf
from google import genai
import utils
from core.state import get_global_results, set_global_results
from core.cache import get_cached_dataset, cache_dataset
from core.comparison import select_best_model_result
from ui.dashboard import generate_dashboard_outputs
from gemini_api import GeminiIntegrator, BatchTask, DEFAULT_TRANSCRIPTION_PROMPT
from hf_asr import is_hf_asr_model, get_hf_asr_client, HF_BATCH_SIZE
def sanitize_filename(name):
    """Return a filesystem-safe, lowercase identifier derived from *name*.

    Non-word characters become underscores, runs of whitespace/hyphens
    collapse to a single underscore, and empty/None input falls back to
    the literal "results".
    """
    if not name:
        return "results"
    # First pass: neutralize anything that is not word/space/hyphen.
    cleaned = re.sub(r'[^\w\s-]', '_', name)
    cleaned = cleaned.strip().lower()
    # Second pass: unify separators into single underscores.
    return re.sub(r'[-\s]+', '_', cleaned)
def run_analysis(
    api_key: str,
    dataset_name: str,
    model_name: str,
    limit_files: int,
    temperature: float,
    thinking_budget: int,
    similarity_threshold: int,
    batch_mode: bool = False,
    recheck_problematic: bool = False,
    progress=gr.Progress()
):
    """Entry point: dispatch a dataset analysis run to the right backend.

    Routes to HF ASR, Gemini batch, Gemini recheck, or fresh Gemini sync
    analysis depending on the model name and flags.

    Args:
        api_key: Gemini API key (required for Gemini models, unused for HF ASR).
        dataset_name: HF dataset identifier passed to utils.load_hf_dataset.
        model_name: model id; HF ASR models are detected via is_hf_asr_model,
            and "thinking" in the name enables a thinking budget.
        limit_files: max files to process (0 / falsy means "all").
        temperature: sampling temperature for Gemini generation.
        thinking_budget: token budget for thinking models (0 disables).
        similarity_threshold: score cutoff for correct/incorrect status.
        batch_mode: use the Gemini batch API instead of per-file sync calls.
        recheck_problematic: only re-transcribe below-threshold records.
        progress: Gradio progress callback.

    Returns:
        Dashboard outputs tuple from generate_dashboard_outputs().

    Raises:
        gr.Error: on missing API key or any backend failure.
    """
    # Gradio number widgets may deliver strings or floats; normalize defensively.
    limit_files = int(float(limit_files)) if limit_files else 0
    thinking_budget = int(float(thinking_budget)) if thinking_budget else 0
    similarity_threshold = int(float(similarity_threshold)) if similarity_threshold else 90
    temperature = float(temperature)
    # ---------------------------------------------------------
    # HUGGING FACE ASR MODE
    # ---------------------------------------------------------
    if is_hf_asr_model(model_name):
        return _run_hf_asr_analysis(
            model_name, dataset_name, limit_files,
            similarity_threshold, recheck_problematic, progress
        )
    # For Gemini models, API key is required
    if not api_key:
        raise gr.Error("Калі ласка, увядзіце Gemini API ключ.")
    try:
        gemini_tool = GeminiIntegrator(api_key=api_key)
        config_args = {"temperature": temperature}
        use_thinking = "thinking" in model_name
        if use_thinking and thinking_budget > 0:
            # FIX: the google-genai SDK's ThinkingConfig field is named
            # 'thinking_budget', not 'budget_tokens' — the old key made
            # GenerateContentConfig(**config_args) fail validation.
            config_args["thinking_config"] = {
                "include_thoughts": True,
                "thinking_budget": thinking_budget
            }
        gen_config = genai.types.GenerateContentConfig(**config_args)
        # ---------------------------------------------------------
        # BATCH MODE
        # ---------------------------------------------------------
        if batch_mode:
            return _run_batch_analysis(
                gemini_tool, model_name, dataset_name, limit_files,
                similarity_threshold, recheck_problematic, progress
            )
        # ---------------------------------------------------------
        # STANDARD SYNC MODE
        # ---------------------------------------------------------
        if recheck_problematic:
            return _run_recheck_analysis(
                gemini_tool, model_name, dataset_name, limit_files,
                similarity_threshold, gen_config, progress
            )
        return _run_fresh_analysis(
            gemini_tool, model_name, dataset_name, limit_files,
            similarity_threshold, gen_config, progress
        )
    except gr.Error:
        # Already user-facing — don't re-wrap into "Памылка: ...".
        raise
    except Exception as e:
        raise gr.Error(f"Памылка: {e}")
def _run_batch_analysis(
    gemini_tool, model_name, dataset_name, limit_files,
    similarity_threshold, recheck_problematic, progress
):
    """Run batch processing mode.

    Steps: (1) load/cache the dataset, (2) build one BatchTask per target
    audio file (dumping in-memory arrays to temp WAVs when no on-disk path
    exists), (3) submit all tasks through gemini_tool.run_batch, (4) map the
    returned transcriptions back onto the global results list and rescore.

    Returns the dashboard outputs tuple from generate_dashboard_outputs().
    """
    global_results = get_global_results()
    ds = None
    # 1. Prepare Data
    if recheck_problematic:
        if not global_results:
            gr.Warning("Няма вынікаў для пераправеркі.")
            return generate_dashboard_outputs(similarity_threshold)
        # Recheck loads the FULL dataset (limit=None) so audio can be
        # recovered for any previously-analyzed record.
        limit = None
        cached_ds = get_cached_dataset(dataset_name, limit)
        if cached_ds:
            ds = cached_ds
        else:
            progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
            ds = utils.load_hf_dataset(dataset_name, limit=limit)
            cache_dataset(dataset_name, limit, ds)
    else:
        # Fresh run: honor the user-provided file limit (0 means "all").
        limit = int(limit_files) if limit_files > 0 else None
        cached_ds = get_cached_dataset(dataset_name, limit)
        if cached_ds:
            ds = cached_ds
        else:
            progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
            ds = utils.load_hf_dataset(dataset_name, limit=limit)
            cache_dataset(dataset_name, limit, ds)
        # Init results if fresh run: placeholder entries with "pending" status,
        # index-aligned with the dataset.
        progress(0.1, desc="Ініцыялізацыя спісу...")
        new_results = []
        for idx, item in enumerate(ds):
            # The reference transcript column name varies across datasets.
            ref_text = item.get('sentence') or item.get('text') or item.get('transcription') or item.get('transcript') or ""
            new_results.append({
                "id": idx,
                "path": item['audio']['path'],
                "ref_text": ref_text,
                "hyp_text": "",
                "score": 0,
                "audio_array": item['audio']['array'],
                "sampling_rate": item['audio']['sampling_rate'],
                "model_used": model_name,
                "verification_status": "pending"
            })
        set_global_results(new_results)
        global_results = get_global_results()
    # 2. Prepare Tasks
    tasks = []
    # Temp dir holds WAV dumps for items whose audio exists only in memory;
    # it is cleaned up (best-effort) on every exit path below.
    tmp_dir_obj = tempfile.TemporaryDirectory()
    tmp_dir = tmp_dir_obj.name

    def prepare_task(idx, row_data, audio_ref):
        # Build a BatchTask for one record; returns None when audio is empty.
        key = f"task_{idx}"
        fpath = audio_ref['audio']['path']
        # Verify file existence or dump numpy to WAV
        if not fpath or not os.path.exists(fpath):
            audio_arr = audio_ref['audio']['array']
            sr = audio_ref['audio']['sampling_rate']
            if len(audio_arr) == 0:
                return None
            clean_name = sanitize_filename(f"audio_{idx}")
            dump_path = os.path.join(tmp_dir, f"{clean_name}.wav")
            sf.write(dump_path, audio_arr, int(sr), format='WAV')
            fpath = dump_path
        return BatchTask(key=key, path=fpath, mime_type="audio/wav")

    progress(0.2, desc="Падрыхтоўка задач для пакетнага рэжыму...")
    task_map_idx = {}  # task_key -> result_index
    if recheck_problematic:
        # Problematic = below threshold and not manually confirmed correct.
        target_indices = [
            i for i, r in enumerate(global_results)
            if r['score'] < similarity_threshold
            and r.get('verification_status') != 'correct'
        ]
        if limit_files > 0:
            target_indices = target_indices[:limit_files]
        if not target_indices:
            gr.Info("Няма праблемных файлаў для пераправеркі.")
            try: tmp_dir_obj.cleanup()
            except: pass
            return generate_dashboard_outputs(similarity_threshold)
        # Index dataset items by full path, basename, and position so stored
        # results can be matched back to their source audio.
        ds_map = {}
        for di, d_item in enumerate(ds):
            p = d_item['audio']['path']
            if p:
                ds_map[p] = d_item
                ds_map[os.path.basename(p)] = d_item
            ds_map[di] = d_item
        for global_res_idx in target_indices:
            res = global_results[global_res_idx]
            path = res.get('path', '')
            # Lookup order: full path -> basename -> stored id.
            item = ds_map.get(path) or ds_map.get(os.path.basename(path))
            if not item and res.get('id') is not None:
                try: item = ds[int(res.get('id'))]
                except: pass
            if item:
                t = prepare_task(global_res_idx, res, item)
                if t:
                    tasks.append(t)
                    task_map_idx[t.key] = global_res_idx
    else:
        # Fresh run: one task per dataset item, index-aligned with results.
        for idx, res in enumerate(global_results):
            item = ds[idx]
            t = prepare_task(idx, res, item)
            if t:
                tasks.append(t)
                task_map_idx[t.key] = idx
    if not tasks:
        gr.Warning("Не знойдзена задач для выканання (магчыма, адсутнічае аўдыя).")
        try: tmp_dir_obj.cleanup()
        except: pass
        return generate_dashboard_outputs(similarity_threshold)
    # 3. Execute Batch
    progress(0.3, desc=f"Запуск пакетнай апрацоўкі ({len(tasks)} файлаў). Гэта зойме час...")
    prompt = DEFAULT_TRANSCRIPTION_PROMPT
    try:
        batch_results = gemini_tool.run_batch(tasks, model_name, prompt)
    except Exception as e:
        try: tmp_dir_obj.cleanup()
        except: pass
        raise gr.Error(f"Batch failed: {e}")
    progress(0.9, desc="Апрацоўка вынікаў...")
    # 4. Map Results: rescore each returned transcription and update the
    # shared results list in place.
    for key, text in batch_results.items():
        if key in task_map_idx:
            idx = task_map_idx[key]
            if idx < len(global_results):
                ref_text = global_results[idx]['ref_text']
                score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, text)
                global_results[idx].update({
                    "hyp_text": text,
                    "score": score,
                    "norm_ref": norm_ref,
                    "norm_hyp": norm_hyp,
                    "verification_status": "correct" if score >= similarity_threshold else "incorrect",
                    # "batch_" prefix marks this hypothesis as batch-mode output.
                    "model_used": f"batch_{model_name}"
                })
                # Also record under the per-model history used by comparisons.
                if 'model_results' not in global_results[idx]:
                    global_results[idx]['model_results'] = {}
                global_results[idx]['model_results'][model_name] = {
                    "hyp_text": text,
                    "score": score,
                    "norm_ref": norm_ref,
                    "norm_hyp": norm_hyp
                }
    try: tmp_dir_obj.cleanup()
    except: pass
    return generate_dashboard_outputs(similarity_threshold)
def _run_recheck_analysis(
    gemini_tool, model_name, dataset_name, limit_files,
    similarity_threshold, gen_config, progress
):
    """Run recheck of problematic files.

    Re-transcribes (synchronously, one file at a time) every stored result
    that scored below the threshold and was not manually marked correct,
    records the new hypothesis under model_results, and keeps the best
    result across all models tried so far.

    Returns the dashboard outputs tuple from generate_dashboard_outputs().
    """
    global_results = get_global_results()
    if not global_results:
        gr.Warning("Няма вынікаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    # Identify problematic records: below threshold and not confirmed correct.
    target_indices = [
        i for i, r in enumerate(global_results)
        if r['score'] < similarity_threshold
        and r.get('verification_status') != 'correct'
    ]
    if limit_files > 0:
        target_indices = target_indices[:limit_files]
    if not target_indices:
        gr.Info("Няма праблемных файлаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    # Load the FULL dataset (limit=None) so audio can be recovered for
    # records whose in-memory audio was dropped.
    limit = None
    cached_ds = get_cached_dataset(dataset_name, limit)
    if cached_ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
        ds = cached_ds
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.05, desc=f"Датасет закэшаваны")
    # Build audio map keyed by both basename and full path for flexible lookup.
    audio_map = {}
    for item in ds:
        path = item['audio']['path']
        if path:
            fname = os.path.basename(path)
            audio_map[fname] = item
            audio_map[path] = item
    progress(0.1, desc=f"Пераправерка {len(target_indices)} файлаў...")
    for j, idx in enumerate(target_indices):
        progress(0.1 + (j + 1) / len(target_indices) * 0.9, desc=f"Праверка {j+1}/{len(target_indices)}")
        result = global_results[idx]
        audio_data = result.get('audio_array')
        sampling_rate = result.get('sampling_rate')
        ref_text = result.get('ref_text', "")
        # If audio is missing, try to fetch it from the dataset by path,
        # then basename, then stored id.
        if audio_data is None or len(audio_data) == 0:
            path = result.get('path', '')
            item = audio_map.get(path) or audio_map.get(os.path.basename(path))
            # Fallback: try to find by ID if path lookup failed
            if not item:
                rec_id = result.get('id')
                if rec_id is not None:
                    try:
                        rec_id = int(rec_id)
                        if 0 <= rec_id < len(ds):
                            item = ds[rec_id]
                    except:
                        pass
            if item:
                audio_data = item['audio']['array']
                sampling_rate = item['audio']['sampling_rate']
                # Cache the recovered audio back onto the stored result.
                global_results[idx]['audio_array'] = audio_data
                global_results[idx]['sampling_rate'] = sampling_rate
            else:
                print(f"Problematic Recheck: Skipping index {idx}, path '{path}', id {result.get('id')}: Audio not found in dataset.")
                continue
        hyp_text = gemini_tool.transcribe_audio(model_name, audio_data, sampling_rate, config=gen_config)
        score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, hyp_text)
        print(f"🔄 Updated: {result.get('path')} | Score: {result.get('score')} -> {score} | Text: {hyp_text}")
        # Save this model's result into the per-model history.
        if 'model_results' not in global_results[idx]:
            global_results[idx]['model_results'] = {}
        global_results[idx]['model_results'][model_name] = {
            "hyp_text": hyp_text,
            "score": score,
            "norm_ref": norm_ref,
            "norm_hyp": norm_hyp
        }
        # Keep whichever model's attempt scores best overall.
        best_model, best_result = select_best_model_result(
            global_results[idx]['model_results'],
            similarity_threshold
        )
        if best_result:
            global_results[idx].update({
                "hyp_text": best_result['hyp_text'],
                "score": best_result['score'],
                "norm_ref": best_result['norm_ref'],
                "norm_hyp": best_result['norm_hyp'],
                "model_used": best_model,
                "verification_status": "correct" if best_result['score'] >= similarity_threshold else "incorrect"
            })
    return generate_dashboard_outputs(similarity_threshold)
def _run_fresh_analysis(
    gemini_tool, model_name, dataset_name, limit_files,
    similarity_threshold, gen_config, progress
):
    """Transcribe every dataset item synchronously and store fresh results.

    Loads (or reuses a cached copy of) the dataset, calls the Gemini
    transcriber per file, scores each hypothesis against the reference
    text, and replaces the global results list with the new entries.
    """
    limit = int(limit_files) if limit_files > 0 else None
    ds = get_cached_dataset(dataset_name, limit)
    if ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.1, desc=f"Датасет закэшаваны для паўторнага выкарыстання")
    collected = []
    total = len(ds)
    for i, sample in enumerate(ds):
        progress((i + 1) / total, desc=f"Апрацоўка файла {i+1}/{total}")
        wave = sample['audio']['array']
        rate = sample['audio']['sampling_rate']
        # The reference transcript column name varies across dataset schemas.
        reference = (
            sample.get('sentence')
            or sample.get('text')
            or sample.get('transcription')
            or sample.get('transcript')
            or ""
        )
        hypothesis = gemini_tool.transcribe_audio(model_name, wave, rate, config=gen_config)
        score, norm_ref, norm_hyp = utils.calculate_similarity(reference, hypothesis)
        per_model = {
            "hyp_text": hypothesis,
            "score": score,
            "norm_ref": norm_ref,
            "norm_hyp": norm_hyp
        }
        collected.append({
            "id": i,
            "path": sample['audio']['path'],
            "ref_text": reference,
            "hyp_text": hypothesis,
            "score": score,
            "norm_ref": norm_ref,
            "norm_hyp": norm_hyp,
            "audio_array": wave,
            "sampling_rate": rate,
            "model_used": model_name,
            "verification_status": "correct" if score >= similarity_threshold else "incorrect",
            "model_results": {model_name: per_model}
        })
    set_global_results(collected)
    return generate_dashboard_outputs(similarity_threshold)
def _run_hf_asr_analysis(
    model_name: str,
    dataset_name: str,
    limit_files: int,
    similarity_threshold: int,
    recheck_problematic: bool,
    progress
):
    """Dispatch analysis through a Hugging Face ASR Space client.

    Connects to the HF Space for *model_name*, then delegates to the
    recheck or fresh HF analysis path. Raises gr.Error if the connection
    cannot be established.
    """
    get_global_results()
    try:
        hf_client = get_hf_asr_client(model_name)
        progress(0.05, desc=f"Падключэнне да HF Space: {model_name}...")
    except Exception as e:
        raise gr.Error(f"Памылка падключэння да HF: {e}")
    runner = _run_hf_recheck_analysis if recheck_problematic else _run_hf_fresh_analysis
    return runner(
        hf_client, model_name, dataset_name, limit_files,
        similarity_threshold, progress
    )
def _run_hf_fresh_analysis(
    hf_client, model_name, dataset_name, limit_files,
    similarity_threshold, progress
):
    """Run fresh analysis using HF ASR with batch processing.

    Loads (or reuses a cached copy of) the dataset, transcribes it in
    batches of HF_BATCH_SIZE via hf_client.transcribe_batch, scores each
    successful transcription, and stores the results globally.

    Returns the dashboard outputs tuple from generate_dashboard_outputs().
    """
    limit = int(limit_files) if limit_files > 0 else None
    cached_ds = get_cached_dataset(dataset_name, limit)
    if cached_ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
        ds = cached_ds
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.1, desc=f"Датасет закэшаваны для паўторнага выкарыстання")
    # Pre-collect all items with their data so batching is a simple slice.
    all_items = []
    for idx, item in enumerate(ds):
        audio_data = item['audio']['array']
        sampling_rate = item['audio']['sampling_rate']
        # The reference transcript column name varies across dataset schemas.
        ref_text = item.get('sentence') or item.get('text') or item.get('transcription') or item.get('transcript') or ""
        all_items.append({
            "idx": idx,
            "path": item['audio']['path'],
            "audio_data": audio_data,
            "sampling_rate": sampling_rate,
            "ref_text": ref_text
        })
    total_items = len(all_items)
    results = [None] * total_items  # Pre-allocate so out-of-order fills keep dataset order
    # Process in batches of HF_BATCH_SIZE
    batch_size = HF_BATCH_SIZE
    num_batches = (total_items + batch_size - 1) // batch_size
    for batch_num in range(num_batches):
        # Delay between batches to avoid rate limiting (skip for first batch)
        if batch_num > 0:
            print(f"⏳ Чакаем 5с перад наступным пакетам...")
            time.sleep(5)
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, total_items)
        batch_items = all_items[start_idx:end_idx]
        progress_val = 0.1 + (batch_num / num_batches) * 0.9
        progress(progress_val, desc=f"Пакет {batch_num + 1}/{num_batches}: апрацоўка {len(batch_items)} файлаў (HF ASR)...")
        # Prepare batch for transcription: (key, audio_array, sampling_rate)
        batch_audio = [
            (item["idx"], item["audio_data"], item["sampling_rate"])
            for item in batch_items
        ]
        # Send batch to HF ASR (retry logic is inside transcribe_batch)
        transcriptions = hf_client.transcribe_batch(batch_audio)
        # Process results - only save successful transcriptions
        transcribed_count = 0
        for item in batch_items:
            idx = item["idx"]
            ref_text = item["ref_text"]
            hyp_text = transcriptions.get(idx, "")
            # Only record result if transcription was successful
            if hyp_text:
                score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, hyp_text)
                transcribed_count += 1
                results[idx] = {
                    "id": idx,
                    "path": item["path"],
                    "ref_text": ref_text,
                    "hyp_text": hyp_text,
                    "score": score,
                    "norm_ref": norm_ref,
                    "norm_hyp": norm_hyp,
                    "audio_array": item["audio_data"],
                    "sampling_rate": item["sampling_rate"],
                    "model_used": model_name,
                    "verification_status": "correct" if score >= similarity_threshold else "incorrect",
                    "model_results": {
                        model_name: {
                            "hyp_text": hyp_text,
                            "score": score,
                            "norm_ref": norm_ref,
                            "norm_hyp": norm_hyp
                        }
                    }
                }
        # Items with no transcription remain None and are dropped below.
        print(f"✅ Пакет {batch_num + 1}/{num_batches} завершаны: {transcribed_count}/{len(batch_items)} транскрыбавана")
    # FIX: drop None placeholders left by failed transcriptions. Every
    # consumer of the global results (dashboard, recheck paths) indexes
    # r['score'] on each entry and would crash on a None element.
    results = [r for r in results if r is not None]
    set_global_results(results)
    return generate_dashboard_outputs(similarity_threshold)
def _run_hf_recheck_analysis(
    hf_client, model_name, dataset_name, limit_files,
    similarity_threshold, progress
):
    """Run recheck of problematic files using HF ASR with batch processing.

    Re-transcribes below-threshold records in batches of HF_BATCH_SIZE,
    recovering missing audio from the dataset where needed, records each
    new hypothesis under model_results, and keeps the best result across
    all models tried so far.

    Returns the dashboard outputs tuple from generate_dashboard_outputs().
    """
    global_results = get_global_results()
    if not global_results:
        gr.Warning("Няма вынікаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    # Identify problematic records: below threshold and not confirmed correct.
    target_indices = [
        i for i, r in enumerate(global_results)
        if r['score'] < similarity_threshold
        and r.get('verification_status') != 'correct'
    ]
    if limit_files > 0:
        target_indices = target_indices[:limit_files]
    if not target_indices:
        gr.Info("Няма праблемных файлаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    # Load the FULL dataset (limit=None) so audio can be recovered for
    # records whose in-memory audio was dropped.
    limit = None
    cached_ds = get_cached_dataset(dataset_name, limit)
    if cached_ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
        ds = cached_ds
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.05, desc=f"Датасет закэшаваны")
    # Build audio map keyed by both basename and full path for flexible lookup.
    audio_map = {}
    for item in ds:
        path = item['audio']['path']
        if path:
            fname = os.path.basename(path)
            audio_map[fname] = item
            audio_map[path] = item
    # Collect all items to process, resolving missing audio up front so the
    # batching loop below only sees ready-to-send entries.
    items_to_process = []
    for idx in target_indices:
        result = global_results[idx]
        audio_data = result.get('audio_array')
        sampling_rate = result.get('sampling_rate')
        ref_text = result.get('ref_text', "")
        # If audio is missing (None or empty sequence), try the dataset by
        # path, then basename, then stored id.
        if audio_data is None or (hasattr(audio_data, '__len__') and len(audio_data) == 0):
            path = result.get('path', '')
            item = audio_map.get(path) or audio_map.get(os.path.basename(path))
            # Fallback: try to find by ID if path lookup failed
            if not item:
                rec_id = result.get('id')
                if rec_id is not None:
                    try:
                        rec_id = int(rec_id)
                        if 0 <= rec_id < len(ds):
                            item = ds[rec_id]
                    except:
                        pass
            if item:
                audio_data = item['audio']['array']
                sampling_rate = item['audio']['sampling_rate']
                # Cache the recovered audio back onto the stored result.
                global_results[idx]['audio_array'] = audio_data
                global_results[idx]['sampling_rate'] = sampling_rate
            else:
                print(f"HF Recheck: Skipping index {idx}, path '{path}': Audio not found.")
                continue
        items_to_process.append({
            "idx": idx,
            "audio_data": audio_data,
            "sampling_rate": sampling_rate,
            "ref_text": ref_text
        })
    if not items_to_process:
        gr.Info("Няма файлаў з аўдыя для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    # Process in batches of HF_BATCH_SIZE
    batch_size = HF_BATCH_SIZE
    total_items = len(items_to_process)
    num_batches = (total_items + batch_size - 1) // batch_size
    progress(0.1, desc=f"Пераправерка {total_items} файлаў у {num_batches} пакетах (HF ASR)...")
    for batch_num in range(num_batches):
        # Delay between batches to avoid rate limiting (skip for first batch)
        if batch_num > 0:
            print(f"⏳ Чакаем 5с перад наступным пакетам...")
            time.sleep(5)
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, total_items)
        batch_items = items_to_process[start_idx:end_idx]
        progress_val = 0.1 + (batch_num / num_batches) * 0.9
        progress(progress_val, desc=f"Пакет {batch_num + 1}/{num_batches}: апрацоўка {len(batch_items)} файлаў...")
        # Prepare batch for transcription: (key, audio_array, sampling_rate)
        batch_audio = [
            (item["idx"], item["audio_data"], item["sampling_rate"])
            for item in batch_items
        ]
        # Send batch to HF ASR (retry logic is inside transcribe_batch)
        transcriptions = hf_client.transcribe_batch(batch_audio)
        # Process results - only save successful transcriptions
        transcribed_count = 0
        for item in batch_items:
            idx = item["idx"]
            ref_text = item["ref_text"]
            hyp_text = transcriptions.get(idx, "")
            if not hyp_text:
                continue
            transcribed_count += 1
            score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, hyp_text)
            print(f"🔄 HF Updated: {global_results[idx].get('path')} | Score: {global_results[idx].get('score')} -> {score}")
            # Save this model's result into the per-model history.
            if 'model_results' not in global_results[idx]:
                global_results[idx]['model_results'] = {}
            global_results[idx]['model_results'][model_name] = {
                "hyp_text": hyp_text,
                "score": score,
                "norm_ref": norm_ref,
                "norm_hyp": norm_hyp
            }
            # Keep whichever model's attempt scores best overall.
            best_model, best_result = select_best_model_result(
                global_results[idx]['model_results'],
                similarity_threshold
            )
            if best_result:
                global_results[idx].update({
                    "hyp_text": best_result['hyp_text'],
                    "score": best_result['score'],
                    "norm_ref": best_result['norm_ref'],
                    "norm_hyp": best_result['norm_hyp'],
                    "model_used": best_model,
                    "verification_status": "correct" if best_result['score'] >= similarity_threshold else "incorrect"
                })
        print(f"✅ Пакет {batch_num + 1}/{num_batches} завершаны: {transcribed_count}/{len(batch_items)} транскрыбавана")
    return generate_dashboard_outputs(similarity_threshold)