"""Standard analysis with batch mode support."""
import os
import time
import tempfile
import re
import gradio as gr
import soundfile as sf
from google import genai

import utils
from core.state import get_global_results, set_global_results
from core.cache import get_cached_dataset, cache_dataset
from core.comparison import select_best_model_result
from ui.dashboard import generate_dashboard_outputs
from gemini_api import GeminiIntegrator, BatchTask, DEFAULT_TRANSCRIPTION_PROMPT
from hf_asr import is_hf_asr_model, get_hf_asr_client, HF_BATCH_SIZE


def sanitize_filename(name):
    """Sanitize string to be used as a filename."""
    if not name:
        return "results"
    # Replace non-alphanumeric with underscore
    s = re.sub(r'[^\w\s-]', '_', name).strip().lower()
    # Replace whitespace with underscore
    s = re.sub(r'[-\s]+', '_', s)
    return s


def run_analysis(
    api_key: str,
    dataset_name: str,
    model_name: str,
    limit_files: int,
    temperature: float,
    thinking_budget: int,
    similarity_threshold: int,
    batch_mode: bool = False,
    recheck_problematic: bool = False,
    progress=gr.Progress()
):
    global_results = get_global_results()
    
    # Robust type conversion for Gradio inputs
    limit_files = int(float(limit_files)) if limit_files else 0
    thinking_budget = int(float(thinking_budget)) if thinking_budget else 0
    similarity_threshold = int(float(similarity_threshold)) if similarity_threshold else 90
    temperature = float(temperature)

    # ---------------------------------------------------------
    # HUGGING FACE ASR MODE
    # ---------------------------------------------------------
    if is_hf_asr_model(model_name):
        return _run_hf_asr_analysis(
            model_name, dataset_name, limit_files,
            similarity_threshold, recheck_problematic, progress
        )

    # For Gemini models, API key is required
    if not api_key:
        raise gr.Error("Калі ласка, увядзіце Gemini API ключ.")

    try:
        gemini_tool = GeminiIntegrator(api_key=api_key)

        config_args = {"temperature": temperature}
        use_thinking = "thinking" in model_name

        if use_thinking and thinking_budget > 0:
            config_args["thinking_config"] = {
                "include_thoughts": True,
                "budget_tokens": thinking_budget
            }

        gen_config = genai.types.GenerateContentConfig(**config_args)

        # ---------------------------------------------------------
        # BATCH MODE
        # ---------------------------------------------------------
        if batch_mode:
            return _run_batch_analysis(
                gemini_tool, model_name, dataset_name, limit_files,
                similarity_threshold, recheck_problematic, progress
            )

        # ---------------------------------------------------------
        # STANDARD SYNC MODE
        # ---------------------------------------------------------
        if recheck_problematic:
            return _run_recheck_analysis(
                gemini_tool, model_name, dataset_name, limit_files,
                similarity_threshold, gen_config, progress
            )
        else:
            return _run_fresh_analysis(
                gemini_tool, model_name, dataset_name, limit_files,
                similarity_threshold, gen_config, progress
            )

    except Exception as e:
        raise gr.Error(f"Памылка: {e}")


def _run_batch_analysis(
    gemini_tool, model_name, dataset_name, limit_files,
    similarity_threshold, recheck_problematic, progress
):
    """Run batch processing mode."""
    global_results = get_global_results()
    ds = None
    
    # 1. Prepare Data
    if recheck_problematic:
        if not global_results:
            gr.Warning("Няма вынікаў для пераправеркі.")
            return generate_dashboard_outputs(similarity_threshold)
        
        # Load full dataset for audio fallback
        limit = None 
        cached_ds = get_cached_dataset(dataset_name, limit)
        if cached_ds:
            ds = cached_ds
        else:
            progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
            ds = utils.load_hf_dataset(dataset_name, limit=limit)
            cache_dataset(dataset_name, limit, ds)
    else:
        limit = int(limit_files) if limit_files > 0 else None
        cached_ds = get_cached_dataset(dataset_name, limit)
        if cached_ds:
            ds = cached_ds
        else:
            progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
            ds = utils.load_hf_dataset(dataset_name, limit=limit)
            cache_dataset(dataset_name, limit, ds)
        
        # Init results if fresh run
        progress(0.1, desc="Ініцыялізацыя спісу...")
        new_results = []
        for idx, item in enumerate(ds):
            ref_text = item.get('sentence') or item.get('text') or item.get('transcription') or item.get('transcript') or ""
            new_results.append({
                "id": idx,
                "path": item['audio']['path'],
                "ref_text": ref_text,
                "hyp_text": "",
                "score": 0,
                "audio_array": item['audio']['array'],
                "sampling_rate": item['audio']['sampling_rate'],
                "model_used": model_name,
                "verification_status": "pending"
            })
        set_global_results(new_results)
        global_results = get_global_results()

    # 2. Prepare Tasks
    tasks = []
    tmp_dir_obj = tempfile.TemporaryDirectory()
    tmp_dir = tmp_dir_obj.name
    
    def prepare_task(idx, row_data, audio_ref):
        key = f"task_{idx}"
        fpath = audio_ref['audio']['path']
        
        # Verify file existence or dump numpy to WAV
        if not fpath or not os.path.exists(fpath):
            audio_arr = audio_ref['audio']['array']
            sr = audio_ref['audio']['sampling_rate']
            if len(audio_arr) == 0:
                return None
            
            clean_name = sanitize_filename(f"audio_{idx}")
            dump_path = os.path.join(tmp_dir, f"{clean_name}.wav")
            sf.write(dump_path, audio_arr, int(sr), format='WAV')
            fpath = dump_path
        
        return BatchTask(key=key, path=fpath, mime_type="audio/wav")

    progress(0.2, desc="Падрыхтоўка задач для пакетнага рэжыму...")
    task_map_idx = {}  # task_key -> result_index
    
    if recheck_problematic:
        # Identification logic
        target_indices = [
             i for i, r in enumerate(global_results) 
             if r['score'] < similarity_threshold 
             and r.get('verification_status') != 'correct'
        ]
        if limit_files > 0:
            target_indices = target_indices[:limit_files]
        
        if not target_indices:
             gr.Info("Няма праблемных файлаў для пераправеркі.")
             try: tmp_dir_obj.cleanup() 
             except: pass
             return generate_dashboard_outputs(similarity_threshold)

        # Create DS Map
        ds_map = {}
        for di, d_item in enumerate(ds):
            p = d_item['audio']['path']
            if p:
                ds_map[p] = d_item
                ds_map[os.path.basename(p)] = d_item
            ds_map[di] = d_item

        for global_res_idx in target_indices:
            res = global_results[global_res_idx]
            path = res.get('path', '')
            
            # Try finding item
            item = ds_map.get(path) or ds_map.get(os.path.basename(path))
            if not item and res.get('id') is not None:
                 try: item = ds[int(res.get('id'))]
                 except: pass

            if item:
                t = prepare_task(global_res_idx, res, item)
                if t:
                    tasks.append(t)
                    task_map_idx[t.key] = global_res_idx
    else:
        # Tasks for all
        for idx, res in enumerate(global_results):
            item = ds[idx] 
            t = prepare_task(idx, res, item)
            if t:
                tasks.append(t)
                task_map_idx[t.key] = idx

    if not tasks:
        gr.Warning("Не знойдзена задач для выканання (магчыма, адсутнічае аўдыя).")
        try: tmp_dir_obj.cleanup() 
        except: pass
        return generate_dashboard_outputs(similarity_threshold)

    # 3. Execute Batch
    progress(0.3, desc=f"Запуск пакетнай апрацоўкі ({len(tasks)} файлаў). Гэта зойме час...")
    prompt = DEFAULT_TRANSCRIPTION_PROMPT
    
    try:
        batch_results = gemini_tool.run_batch(tasks, model_name, prompt)
    except Exception as e:
        try: tmp_dir_obj.cleanup() 
        except: pass
        raise gr.Error(f"Batch failed: {e}")
    
    progress(0.9, desc="Апрацоўка вынікаў...")
    
    # 4. Map Results
    for key, text in batch_results.items():
        if key in task_map_idx:
            idx = task_map_idx[key]
            if idx < len(global_results):
                ref_text = global_results[idx]['ref_text']
                score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, text)
                
                global_results[idx].update({
                    "hyp_text": text,
                    "score": score,
                    "norm_ref": norm_ref,
                    "norm_hyp": norm_hyp,
                    "verification_status": "correct" if score >= similarity_threshold else "incorrect",
                    "model_used": f"batch_{model_name}"
                })
                
                if 'model_results' not in global_results[idx]:
                    global_results[idx]['model_results'] = {}
                global_results[idx]['model_results'][model_name] = {
                    "hyp_text": text,
                    "score": score,
                    "norm_ref": norm_ref,
                    "norm_hyp": norm_hyp
                }

    try: tmp_dir_obj.cleanup() 
    except: pass
    
    return generate_dashboard_outputs(similarity_threshold)


def _run_recheck_analysis(
    gemini_tool, model_name, dataset_name, limit_files,
    similarity_threshold, gen_config, progress
):
    """Run recheck of problematic files."""
    global_results = get_global_results()
    
    if not global_results:
        gr.Warning("Няма вынікаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    
    # Identify problematic records
    target_indices = [
        i for i, r in enumerate(global_results) 
        if r['score'] < similarity_threshold 
        and r.get('verification_status') != 'correct'
    ]
    
    if limit_files > 0:
        target_indices = target_indices[:limit_files]
    
    if not target_indices:
        gr.Info("Няма праблемных файлаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)

    # Load dataset to get audio for files that might be missing it
    limit = None
    cached_ds = get_cached_dataset(dataset_name, limit)
    if cached_ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
        ds = cached_ds
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.05, desc=f"Датасет закэшаваны")
    
    # Build audio map by filename
    audio_map = {}
    for item in ds:
        path = item['audio']['path']
        if path:
            fname = os.path.basename(path)
            audio_map[fname] = item
            audio_map[path] = item

    progress(0.1, desc=f"Пераправерка {len(target_indices)} файлаў...")
    
    for j, idx in enumerate(target_indices):
        progress(0.1 + (j + 1) / len(target_indices) * 0.9, desc=f"Праверка {j+1}/{len(target_indices)}")
        
        result = global_results[idx]
        audio_data = result.get('audio_array')
        sampling_rate = result.get('sampling_rate')
        ref_text = result.get('ref_text', "")
        
        # If audio is missing, try to fetch from dataset
        if audio_data is None or len(audio_data) == 0:
            path = result.get('path', '')
            item = audio_map.get(path) or audio_map.get(os.path.basename(path))
            
            # Fallback: try to find by ID if path lookup failed
            if not item:
                rec_id = result.get('id')
                if rec_id is not None:
                    try:
                        rec_id = int(rec_id)
                        if 0 <= rec_id < len(ds):
                            item = ds[rec_id]
                    except:
                        pass

            if item:
                audio_data = item['audio']['array']
                sampling_rate = item['audio']['sampling_rate']
                global_results[idx]['audio_array'] = audio_data
                global_results[idx]['sampling_rate'] = sampling_rate
            else:
                print(f"Problematic Recheck: Skipping index {idx}, path '{path}', id {result.get('id')}: Audio not found in dataset.")
                continue

        hyp_text = gemini_tool.transcribe_audio(model_name, audio_data, sampling_rate, config=gen_config)
        score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, hyp_text)

        print(f"🔄 Updated: {result.get('path')} | Score: {result.get('score')} -> {score} | Text: {hyp_text}")

        # Save model result
        if 'model_results' not in global_results[idx]:
            global_results[idx]['model_results'] = {}
        
        global_results[idx]['model_results'][model_name] = {
            "hyp_text": hyp_text,
            "score": score,
            "norm_ref": norm_ref,
            "norm_hyp": norm_hyp
        }
        
        # Select best result from all models
        best_model, best_result = select_best_model_result(
            global_results[idx]['model_results'], 
            similarity_threshold
        )
        
        if best_result:
            global_results[idx].update({
                "hyp_text": best_result['hyp_text'],
                "score": best_result['score'],
                "norm_ref": best_result['norm_ref'],
                "norm_hyp": best_result['norm_hyp'],
                "model_used": best_model,
                "verification_status": "correct" if best_result['score'] >= similarity_threshold else "incorrect"
            })

    return generate_dashboard_outputs(similarity_threshold)


def _run_fresh_analysis(
    gemini_tool, model_name, dataset_name, limit_files,
    similarity_threshold, gen_config, progress
):
    """Run fresh analysis on all files."""
    limit = int(limit_files) if limit_files > 0 else None

    cached_ds = get_cached_dataset(dataset_name, limit)
    if cached_ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
        ds = cached_ds
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.1, desc=f"Датасет закэшаваны для паўторнага выкарыстання")

    results = []

    for idx, item in enumerate(ds):
        progress((idx + 1) / len(ds), desc=f"Апрацоўка файла {idx+1}/{len(ds)}")

        audio_data = item['audio']['array']
        sampling_rate = item['audio']['sampling_rate']
        ref_text = item.get('sentence') or item.get('text') or item.get('transcription') or item.get('transcript') or ""

        hyp_text = gemini_tool.transcribe_audio(model_name, audio_data, sampling_rate, config=gen_config)
        score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, hyp_text)

        results.append({
            "id": idx,
            "path": item['audio']['path'],
            "ref_text": ref_text,
            "hyp_text": hyp_text,
            "score": score,
            "norm_ref": norm_ref,
            "norm_hyp": norm_hyp,
            "audio_array": audio_data,
            "sampling_rate": sampling_rate,
            "model_used": model_name,
            "verification_status": "correct" if score >= similarity_threshold else "incorrect",
            "model_results": {
                model_name: {
                    "hyp_text": hyp_text,
                    "score": score,
                    "norm_ref": norm_ref,
                    "norm_hyp": norm_hyp
                }
            }
        })
    
    set_global_results(results)
    return generate_dashboard_outputs(similarity_threshold)


def _run_hf_asr_analysis(
    model_name: str,
    dataset_name: str,
    limit_files: int,
    similarity_threshold: int,
    recheck_problematic: bool,
    progress
):
    """Run analysis using Hugging Face ASR model."""
    global_results = get_global_results()
    
    try:
        hf_client = get_hf_asr_client(model_name)
        progress(0.05, desc=f"Падключэнне да HF Space: {model_name}...")
    except Exception as e:
        raise gr.Error(f"Памылка падключэння да HF: {e}")
    
    if recheck_problematic:
        return _run_hf_recheck_analysis(
            hf_client, model_name, dataset_name, limit_files,
            similarity_threshold, progress
        )
    else:
        return _run_hf_fresh_analysis(
            hf_client, model_name, dataset_name, limit_files,
            similarity_threshold, progress
        )


def _run_hf_fresh_analysis(
    hf_client, model_name, dataset_name, limit_files,
    similarity_threshold, progress
):
    """Run fresh analysis using HF ASR with batch processing."""
    limit = int(limit_files) if limit_files > 0 else None
    
    cached_ds = get_cached_dataset(dataset_name, limit)
    if cached_ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
        ds = cached_ds
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.1, desc=f"Датасет закэшаваны для паўторнага выкарыстання")
    
    # Pre-collect all items with their data
    all_items = []
    for idx, item in enumerate(ds):
        audio_data = item['audio']['array']
        sampling_rate = item['audio']['sampling_rate']
        ref_text = item.get('sentence') or item.get('text') or item.get('transcription') or item.get('transcript') or ""
        all_items.append({
            "idx": idx,
            "path": item['audio']['path'],
            "audio_data": audio_data,
            "sampling_rate": sampling_rate,
            "ref_text": ref_text
        })
    
    total_items = len(all_items)
    results = [None] * total_items  # Pre-allocate for correct ordering
    
    # Process in batches of HF_BATCH_SIZE (100)
    batch_size = HF_BATCH_SIZE
    num_batches = (total_items + batch_size - 1) // batch_size
    
    for batch_num in range(num_batches):
        # Delay between batches to avoid rate limiting (skip for first batch)
        if batch_num > 0:
            print(f"⏳ Чакаем 5с перад наступным пакетам...")
            time.sleep(5)
        
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, total_items)
        batch_items = all_items[start_idx:end_idx]
        
        progress_val = 0.1 + (batch_num / num_batches) * 0.9
        progress(progress_val, desc=f"Пакет {batch_num + 1}/{num_batches}: апрацоўка {len(batch_items)} файлаў (HF ASR)...")
        
        # Prepare batch for transcription: (key, audio_array, sampling_rate)
        batch_audio = [
            (item["idx"], item["audio_data"], item["sampling_rate"])
            for item in batch_items
        ]
        
        # Send batch to HF ASR (retry logic is inside transcribe_batch)
        transcriptions = hf_client.transcribe_batch(batch_audio)
        
        # Process results - only save successful transcriptions
        transcribed_count = 0
        for item in batch_items:
            idx = item["idx"]
            ref_text = item["ref_text"]
            hyp_text = transcriptions.get(idx, "")
            
            # Only record result if transcription was successful
            if hyp_text:
                score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, hyp_text)
                transcribed_count += 1
                
                results[idx] = {
                    "id": idx,
                    "path": item["path"],
                    "ref_text": ref_text,
                    "hyp_text": hyp_text,
                    "score": score,
                    "norm_ref": norm_ref,
                    "norm_hyp": norm_hyp,
                    "audio_array": item["audio_data"],
                    "sampling_rate": item["sampling_rate"],
                    "model_used": model_name,
                    "verification_status": "correct" if score >= similarity_threshold else "incorrect",
                    "model_results": {
                        model_name: {
                            "hyp_text": hyp_text,
                            "score": score,
                            "norm_ref": norm_ref,
                            "norm_hyp": norm_hyp
                        }
                    }
                }
            # Skip items with no transcription result
        
        print(f"✅ Пакет {batch_num + 1}/{num_batches} завершаны: {transcribed_count}/{len(batch_items)} транскрыбавана")
    
    set_global_results(results)
    return generate_dashboard_outputs(similarity_threshold)


def _run_hf_recheck_analysis(
    hf_client, model_name, dataset_name, limit_files,
    similarity_threshold, progress
):
    """Run recheck of problematic files using HF ASR with batch processing."""
    global_results = get_global_results()
    
    if not global_results:
        gr.Warning("Няма вынікаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    
    # Identify problematic records
    target_indices = [
        i for i, r in enumerate(global_results) 
        if r['score'] < similarity_threshold 
        and r.get('verification_status') != 'correct'
    ]
    
    if limit_files > 0:
        target_indices = target_indices[:limit_files]
    
    if not target_indices:
        gr.Info("Няма праблемных файлаў для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    
    # Load dataset to get audio for files that might be missing it
    limit = None
    cached_ds = get_cached_dataset(dataset_name, limit)
    if cached_ds is not None:
        progress(0, desc=f"Выкарыстоўваю закэшаваны датасет '{dataset_name}'...")
        ds = cached_ds
    else:
        progress(0, desc=f"Загрузка датасета '{dataset_name}'...")
        ds = utils.load_hf_dataset(dataset_name, limit=limit)
        cache_dataset(dataset_name, limit, ds)
        progress(0.05, desc=f"Датасет закэшаваны")
    
    # Build audio map by filename
    audio_map = {}
    for item in ds:
        path = item['audio']['path']
        if path:
            fname = os.path.basename(path)
            audio_map[fname] = item
            audio_map[path] = item
    
    # Collect all items to process with their audio data
    items_to_process = []
    for idx in target_indices:
        result = global_results[idx]
        audio_data = result.get('audio_array')
        sampling_rate = result.get('sampling_rate')
        ref_text = result.get('ref_text', "")
        
        # If audio is missing, try to fetch from dataset
        if audio_data is None or (hasattr(audio_data, '__len__') and len(audio_data) == 0):
            path = result.get('path', '')
            item = audio_map.get(path) or audio_map.get(os.path.basename(path))
            
            # Fallback: try to find by ID if path lookup failed
            if not item:
                rec_id = result.get('id')
                if rec_id is not None:
                    try:
                        rec_id = int(rec_id)
                        if 0 <= rec_id < len(ds):
                            item = ds[rec_id]
                    except:
                        pass
            
            if item:
                audio_data = item['audio']['array']
                sampling_rate = item['audio']['sampling_rate']
                global_results[idx]['audio_array'] = audio_data
                global_results[idx]['sampling_rate'] = sampling_rate
            else:
                print(f"HF Recheck: Skipping index {idx}, path '{path}': Audio not found.")
                continue
        
        items_to_process.append({
            "idx": idx,
            "audio_data": audio_data,
            "sampling_rate": sampling_rate,
            "ref_text": ref_text
        })
    
    if not items_to_process:
        gr.Info("Няма файлаў з аўдыя для пераправеркі.")
        return generate_dashboard_outputs(similarity_threshold)
    
    # Process in batches of HF_BATCH_SIZE (100)
    batch_size = HF_BATCH_SIZE
    total_items = len(items_to_process)
    num_batches = (total_items + batch_size - 1) // batch_size
    
    progress(0.1, desc=f"Пераправерка {total_items} файлаў у {num_batches} пакетах (HF ASR)...")
    
    for batch_num in range(num_batches):
        # Delay between batches to avoid rate limiting (skip for first batch)
        if batch_num > 0:
            print(f"⏳ Чакаем 5с перад наступным пакетам...")
            time.sleep(5)
        
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, total_items)
        batch_items = items_to_process[start_idx:end_idx]
        
        progress_val = 0.1 + (batch_num / num_batches) * 0.9
        progress(progress_val, desc=f"Пакет {batch_num + 1}/{num_batches}: апрацоўка {len(batch_items)} файлаў...")
        
        # Prepare batch for transcription: (key, audio_array, sampling_rate)
        batch_audio = [
            (item["idx"], item["audio_data"], item["sampling_rate"])
            for item in batch_items
        ]
        
        # Send batch to HF ASR (retry logic is inside transcribe_batch)
        transcriptions = hf_client.transcribe_batch(batch_audio)
        
        # Process results - only save successful transcriptions
        transcribed_count = 0
        for item in batch_items:
            idx = item["idx"]
            ref_text = item["ref_text"]
            hyp_text = transcriptions.get(idx, "")
            
            if not hyp_text:
                continue
            
            transcribed_count += 1
            score, norm_ref, norm_hyp = utils.calculate_similarity(ref_text, hyp_text)
            
            print(f"🔄 HF Updated: {global_results[idx].get('path')} | Score: {global_results[idx].get('score')} -> {score}")
            
            # Save model result
            if 'model_results' not in global_results[idx]:
                global_results[idx]['model_results'] = {}
            
            global_results[idx]['model_results'][model_name] = {
                "hyp_text": hyp_text,
                "score": score,
                "norm_ref": norm_ref,
                "norm_hyp": norm_hyp
            }
            
            # Select best result from all models
            best_model, best_result = select_best_model_result(
                global_results[idx]['model_results'], 
                similarity_threshold
            )
            
            if best_result:
                global_results[idx].update({
                    "hyp_text": best_result['hyp_text'],
                    "score": best_result['score'],
                    "norm_ref": best_result['norm_ref'],
                    "norm_hyp": best_result['norm_hyp'],
                    "model_used": best_model,
                    "verification_status": "correct" if best_result['score'] >= similarity_threshold else "incorrect"
                })
        
        print(f"✅ Пакет {batch_num + 1}/{num_batches} завершаны: {transcribed_count}/{len(batch_items)} транскрыбавана")
    
    return generate_dashboard_outputs(similarity_threshold)