"""
Interview phrases data loader.
Reads the transcripciones.json and builds the structured phrases list.
"""

import json
import logging
import os
import re
from django.conf import settings

# Module logger; the name is given explicitly rather than derived from __name__.
log = logging.getLogger("interview.phrases_data")

# Path to the transcriptions JSON: <settings.BASE_DIR>/data/transcripciones.json.
TRANSCRIPTIONS_FILE = os.path.join(settings.BASE_DIR, 'data', 'transcripciones.json')


def _numeric_id_key(phrase: dict) -> int:
    """Extract the numeric index from a phrase id for sorting."""
    match = re.search(r"(\d+)", phrase["id"])
    return int(match.group(1)) if match else 0


def setup_data() -> list[dict]:
    """
    Read transcripciones.json and return a list of phrase dicts with:
    {id, text, category, sentiment, original_file}

    The file is expected to contain a JSON object mapping a filename to its
    transcribed text. Category and sentiment are derived from the filename
    prefix; filenames with no known prefix get "unknown" / "neutral".

    Returns:
        The phrase dicts, in the order the entries appear in the file.
        An empty list is returned (and the problem is logged) when the file
        is missing, cannot be decoded as UTF-8 JSON, or does not hold a JSON
        object at the top level. Entries whose value is not a string are
        skipped with a warning instead of aborting the whole load.
    """
    # Filename prefix -> (category, sentiment).
    prefix_labels = {
        "preguntas_": ("questions", "neutral"),
        "positivas_": ("feedback", "positive"),
        "negativos_": ("feedback", "negative"),
    }

    # Open directly rather than checking for existence first: the file could
    # disappear between the check and the open.
    try:
        with open(TRANSCRIPTIONS_FILE, "r", encoding="utf-8") as f:
            transcriptions = json.load(f)
    except FileNotFoundError:
        log.error("Transcriptions file not found: %s", TRANSCRIPTIONS_FILE)
        return []
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # Same failure convention as a missing file: log and return nothing.
        log.error(
            "Transcriptions file could not be parsed: %s (%s)",
            TRANSCRIPTIONS_FILE,
            exc,
        )
        return []

    if not isinstance(transcriptions, dict):
        log.error(
            "Transcriptions file must contain a JSON object, got %s: %s",
            type(transcriptions).__name__,
            TRANSCRIPTIONS_FILE,
        )
        return []

    phrases = []

    # Process transcriptions
    for filename, text in transcriptions.items():
        if not isinstance(text, str):
            # e.g. null or a number in the JSON; .strip() would raise on it.
            log.warning("Skipping %s: transcription is not a string.", filename)
            continue

        category, sentiment = "unknown", "neutral"
        for prefix, labels in prefix_labels.items():
            if filename.startswith(prefix):
                category, sentiment = labels
                break

        phrases.append({
            "id": filename.replace(".mp4", ""),
            "text": text.strip(),
            "category": category,
            "sentiment": sentiment,
            "original_file": filename,
        })

    log.info("Loaded %d phrases from transcriptions.", len(phrases))
    return phrases