testing / interview /phrases_data.py
Danielsz's picture
deploy
16e8be2
Raw
History Blame Contribute Delete
1.76 kB
"""
Interview phrases data loader.
Reads the transcripciones.json and builds the structured phrases list.
"""
import json
import logging
import os
import re
from django.conf import settings
# Module logger; the name is spelled out explicitly rather than taken from __name__.
log = logging.getLogger("interview.phrases_data")
# Location of the transcriptions JSON: <BASE_DIR>/data/transcripciones.json.
TRANSCRIPTIONS_FILE = os.path.join(settings.BASE_DIR, 'data', 'transcripciones.json')
def _numeric_id_key(phrase: dict) -> int:
"""Extract the numeric index from a phrase id for sorting."""
match = re.search(r"(\d+)", phrase["id"])
return int(match.group(1)) if match else 0
def setup_data() -> list[dict]:
    """
    Read transcripciones.json and build the list of phrase dicts.

    The loaded JSON is iterated with ``.items()``, so it is expected to be an
    object mapping each media filename to its transcribed text. Category and
    sentiment are derived from the filename prefix:

    * ``preguntas_`` -> category "questions", sentiment "neutral"
    * ``positivas_`` -> category "feedback",  sentiment "positive"
    * ``negativos_`` -> category "feedback",  sentiment "negative"
    * anything else  -> category "unknown",   sentiment "neutral"

    Returns:
        A list of dicts with keys ``id``, ``text``, ``category``,
        ``sentiment`` and ``original_file``, in the order the entries appear
        in the file. When the file does not exist an error is logged and an
        empty list is returned.
    """
    if not os.path.exists(TRANSCRIPTIONS_FILE):
        log.error("Transcriptions file not found: %s", TRANSCRIPTIONS_FILE)
        return []

    with open(TRANSCRIPTIONS_FILE, "r", encoding="utf-8") as f:
        transcriptions = json.load(f)

    # (prefix, category, sentiment); the first matching prefix wins.
    prefix_rules = (
        ("preguntas_", "questions", "neutral"),
        ("positivas_", "feedback", "positive"),
        ("negativos_", "feedback", "negative"),
    )

    phrases = []
    for filename, text in transcriptions.items():
        category, sentiment = "unknown", "neutral"
        for prefix, rule_category, rule_sentiment in prefix_rules:
            if filename.startswith(prefix):
                category, sentiment = rule_category, rule_sentiment
                break

        phrases.append({
            # Strip only a trailing ".mp4"; str.replace would also remove the
            # substring from the middle of a name (e.g. "a.mp4_b.mp4").
            "id": filename.removesuffix(".mp4"),
            "text": text.strip(),
            "category": category,
            "sentiment": sentiment,
            "original_file": filename,
        })

    log.info("Loaded %d phrases from transcriptions.", len(phrases))
    return phrases