""" Interview phrases data loader. Reads the transcripciones.json and builds the structured phrases list. """ import json import logging import os import re from django.conf import settings log = logging.getLogger("interview.phrases_data") TRANSCRIPTIONS_FILE = os.path.join(settings.BASE_DIR, 'data', 'transcripciones.json') def _numeric_id_key(phrase: dict) -> int: """Extract the numeric index from a phrase id for sorting.""" match = re.search(r"(\d+)", phrase["id"]) return int(match.group(1)) if match else 0 def setup_data() -> list[dict]: """ Read transcripciones.json and return a list of phrase dicts with: {id, text, category, sentiment, original_file} """ if not os.path.exists(TRANSCRIPTIONS_FILE): log.error("Transcriptions file not found: %s", TRANSCRIPTIONS_FILE) return [] with open(TRANSCRIPTIONS_FILE, "r", encoding="utf-8") as f: transcriptions = json.load(f) phrases = [] # Process transcriptions for filename, text in transcriptions.items(): file_id = filename.replace(".mp4", "") category = "unknown" sentiment = "neutral" if filename.startswith("preguntas_"): category = "questions" elif filename.startswith("positivas_"): category = "feedback" sentiment = "positive" elif filename.startswith("negativos_"): category = "feedback" sentiment = "negative" phrases.append({ "id": file_id, "text": text.strip(), "category": category, "sentiment": sentiment, "original_file": filename, }) log.info("Loaded %d phrases from transcriptions.", len(phrases)) return phrases