| """ |
| Interview phrases data loader. |
| Reads the transcripciones.json and builds the structured phrases list. |
| """ |
|
|
| import json |
| import logging |
| import os |
| import re |
| from django.conf import settings |
|
|
# Logger dedicated to the interview phrases loader.
log = logging.getLogger("interview.phrases_data")

# Path of the transcriptions JSON: <BASE_DIR>/data/transcripciones.json.
TRANSCRIPTIONS_FILE = os.path.join(
    settings.BASE_DIR,
    "data",
    "transcripciones.json",
)
|
|
|
|
| def _numeric_id_key(phrase: dict) -> int: |
| """Extract the numeric index from a phrase id for sorting.""" |
| match = re.search(r"(\d+)", phrase["id"]) |
| return int(match.group(1)) if match else 0 |
|
|
|
|
def setup_data() -> list[dict]:
    """
    Read transcripciones.json and return the structured list of phrases.

    The file is expected to hold a JSON object mapping a media filename
    (e.g. ``preguntas_1.mp4``) to its transcribed text. Each entry becomes a
    dict with the keys ``id``, ``text``, ``category``, ``sentiment`` and
    ``original_file``.

    Category and sentiment are derived from the filename prefix:
      - ``preguntas_`` -> category "questions", sentiment "neutral"
      - ``positivas_`` -> category "feedback", sentiment "positive"
      - ``negativos_`` -> category "feedback", sentiment "negative"
      - anything else  -> category "unknown", sentiment "neutral"

    Returns:
        The phrases in the order they appear in the file. An empty list is
        returned (and the problem is logged) when the file is missing, is not
        valid JSON, or does not contain a JSON object at the top level.
        Entries whose text is not a string are skipped with a warning.
    """
    # Open directly instead of checking os.path.exists() first: the file could
    # disappear between the check and the open.
    try:
        with open(TRANSCRIPTIONS_FILE, "r", encoding="utf-8") as f:
            transcriptions = json.load(f)
    except FileNotFoundError:
        log.error("Transcriptions file not found: %s", TRANSCRIPTIONS_FILE)
        return []
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        log.error("Could not parse transcriptions file %s: %s", TRANSCRIPTIONS_FILE, exc)
        return []

    if not isinstance(transcriptions, dict):
        log.error(
            "Transcriptions file %s must contain a JSON object, got %s",
            TRANSCRIPTIONS_FILE,
            type(transcriptions).__name__,
        )
        return []

    # (filename prefix, category, sentiment)
    prefix_labels = (
        ("preguntas_", "questions", "neutral"),
        ("positivas_", "feedback", "positive"),
        ("negativos_", "feedback", "negative"),
    )

    phrases = []
    for filename, text in transcriptions.items():
        if not isinstance(text, str):
            log.warning("Skipping %s: transcription is not a string.", filename)
            continue

        category, sentiment = "unknown", "neutral"
        for prefix, prefix_category, prefix_sentiment in prefix_labels:
            if filename.startswith(prefix):
                category, sentiment = prefix_category, prefix_sentiment
                break

        phrases.append({
            # Strip only the trailing extension; str.replace would also remove
            # ".mp4" occurring in the middle of the name.
            "id": filename.removesuffix(".mp4"),
            "text": text.strip(),
            "category": category,
            "sentiment": sentiment,
            "original_file": filename,
        })

    log.info("Loaded %d phrases from transcriptions.", len(phrases))
    return phrases
|
|