File size: 7,383 Bytes
b7934cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
Profile Extractor - Uses LLM (Groq) to extract structured skills and experience from document text.
Returns JSON for dashboard: skills (by category/level) and experience (timeline).
"""
import json
import re
from typing import List, Dict, Any


# Prompt template for profile extraction. It is rendered with
# ``EXTRACT_PROMPT.format(text=...)``, so ``{text}`` is the only placeholder and
# every literal brace of the JSON example is doubled (``{{`` / ``}}``).
# The prompt text itself is in Spanish and is sent to the model verbatim.
EXTRACT_PROMPT = """Analiza el siguiente texto de CV/perfil profesional y extrae SOLO información que aparezca explícitamente.

Responde ÚNICAMENTE con un bloque JSON válido (sin markdown, sin texto antes o después), con esta estructura exacta:

{{
  "summary": {{
    "headline": "titular corto del perfil (opcional)",
    "estimated_seniority": "junior|mid|senior|lead|unknown",
    "total_years_experience": 0
  }},
  "skills": [
    {{ "name": "nombre del skill", "category": "technical" | "soft" | "tools" | "language", "level": "basic" | "intermediate" | "advanced", "evidence": "frase corta del documento (opcional)" }}
  ],
  "experience": [
    {{ "company": "nombre empresa", "role": "puesto", "start_date": "YYYY-MM o año", "end_date": "YYYY-MM o null si actual", "current": true/false, "location": "opcional", "description": "breve descripción opcional", "highlights": ["logro 1", "logro 2"] }}
  ]
}}

Reglas:
- skills: category "technical" = lenguajes, frameworks, bases de datos; "soft" = comunicación, liderazgo; "tools" = Herramientas (Git, Jira); "language" = idiomas.
- experience: start_date y end_date en formato "YYYY" o "YYYY-MM" si se puede inferir. Si es el trabajo actual, end_date puede ser null y current true.
- Extrae SOLO lo que esté en el texto. No inventes datos.
- Si no hay información para skills o experience, devuelve listas vacías [].
- El JSON debe ser válido (comillas dobles, sin comas finales).
- Si no puedes determinar seniority o años, usa \"unknown\" y 0.

TEXTO DEL DOCUMENTO:
---
{text}
---
Responde solo con el JSON, nada más."""

def _extract_json_candidate(text: str) -> str:
    """Best-effort: pull a single JSON object out of raw model output.

    Steps: unwrap the first Markdown code fence if present, keep the first
    balanced ``{...}`` object (falling back to the span from the first ``{`` to
    the last ``}`` when the braces never balance), then drop trailing commas.

    Args:
        text: Raw model output; may be empty or None.

    Returns:
        The candidate JSON text, or "" for empty input. The result is not
        guaranteed to be valid JSON; callers must still handle parse errors.
    """
    if not text:
        return ""
    s = text.strip()
    fence = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", s)
    if fence:
        s = fence.group(1).strip()

    # If there's extra text around the JSON, keep only the first object.
    start = s.find("{")
    if start != -1:
        end = _find_object_end(s, start)
        if end == -1:
            # Unbalanced (e.g. truncated) output: keep the widest span and let
            # the caller's JSON parser decide.
            end = s.rfind("}")
        if end > start:
            s = s[start : end + 1]

    return _strip_trailing_commas(s).strip()


def _find_object_end(s: str, start: int) -> int:
    """Return the index of the ``}`` closing the object opened at ``start``, or -1."""
    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(s)):
        ch = s[i]
        if in_string:
            # Braces inside string values must not affect nesting depth.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return i
    return -1


def _strip_trailing_commas(s: str) -> str:
    """Remove commas that directly precede ``}`` or ``]`` (a common LLM slip)."""
    # The first alternative consumes whole string literals so that a comma
    # inside a string value (e.g. "a, ]") is never rewritten.
    return re.sub(
        r'("(?:\\.|[^"\\])*")|,\s*([}\]])',
        lambda m: m.group(1) or m.group(2),
        s,
    )


def extract_profile_from_text(text: str, llm) -> Dict[str, Any]:
    """
    Call the LLM to extract a structured profile from document text.

    Args:
        text: Raw CV/profile text. It is stripped and only the first 12,000
            characters are sent, to stay within token limits.
        llm: Object exposing ``invoke(messages)``; per the original contract a
            LangChain ChatGroq instance (e.g. from CareerAssistant.llm).

    Returns:
        Dict with keys "summary" (dict), "skills" (list) and "experience"
        (list). All three keys are present on every path: blank input, a
        failed model call, unparsable output or a non-object JSON payload all
        yield the empty structure instead of raising.
    """
    empty: Dict[str, Any] = {"summary": {}, "skills": [], "experience": []}
    if not text or not text.strip():
        return empty

    # Limit size to avoid token limits (keep first ~12k chars)
    text_trimmed = text.strip()[:12000]
    prompt = EXTRACT_PROMPT.format(text=text_trimmed)

    try:
        from langchain_core.messages import HumanMessage

        response = llm.invoke([HumanMessage(content=prompt)])
        content = response.content if hasattr(response, "content") else str(response)
        data = json.loads(_extract_json_candidate(content))
    except Exception:
        # Deliberate best-effort: the dashboard shows an empty profile rather
        # than failing when the model call or the JSON parsing goes wrong.
        return empty

    if not isinstance(data, dict):
        return empty

    # Normalize: a missing, null or wrongly-typed section becomes its empty value.
    summary = data.get("summary")
    skills = data.get("skills")
    experience = data.get("experience")
    return {
        "summary": summary if isinstance(summary, dict) else {},
        "skills": skills if isinstance(skills, list) else [],
        "experience": experience if isinstance(experience, list) else [],
    }

# Prompt template for dashboard insights. It is rendered with
# ``INSIGHTS_PROMPT.format(profile_json=...)``, so ``{profile_json}`` is the only
# placeholder and the literal braces of the JSON example must be doubled;
# single braces make ``str.format`` raise KeyError.
INSIGHTS_PROMPT = """Eres un analista de carrera. Te paso un perfil ya extraído de documentos reales (skills + experiencia).

Tu tarea: generar insights accionables SIN inventar información.

Responde ÚNICAMENTE JSON válido (sin markdown), con esta estructura exacta:

{{
  "strengths": ["..."],
  "potential_gaps": ["..."],
  "role_suggestions": ["..."],
  "next_actions": ["..."]
}}

Reglas:
- Todo debe derivarse SOLO del perfil que recibes. Si falta info, dilo en el texto del insight (ej: "No hay evidencia de X en los documentos").
- Sé concreto y breve (bullets de 1 línea).
- No menciones que eres una IA.

PERFIL (JSON):
{profile_json}
"""


def generate_dashboard_insights(profile: Dict[str, Any], llm) -> Dict[str, Any]:
    """Ask the LLM for insight bullets derived from an extracted profile.

    The profile is serialized to JSON (truncated to 12,000 characters),
    inserted into INSIGHTS_PROMPT and sent through ``llm.invoke``.

    Returns:
        Dict with the keys "strengths", "potential_gaps", "role_suggestions"
        and "next_actions". Each value is a list of at most 12 non-empty,
        stripped strings. Any exception along the way produces four empty lists.
    """
    sections = ("strengths", "potential_gaps", "role_suggestions", "next_actions")
    try:
        from langchain_core.messages import HumanMessage

        serialized = json.dumps(profile or {}, ensure_ascii=False)[:12000]
        message = HumanMessage(content=INSIGHTS_PROMPT.format(profile_json=serialized))
        reply = llm.invoke([message])
        raw = reply.content if hasattr(reply, "content") else str(reply)
        parsed = json.loads(_extract_json_candidate(raw))

        insights = {}
        for section in sections:
            bullets = parsed.get(section) or []
            if not isinstance(bullets, list):
                # Anything that is not a list is treated as "no insights".
                bullets = []
            cleaned = [str(item).strip() for item in bullets if str(item).strip()]
            insights[section] = cleaned[:12]
        return insights
    except Exception:
        return {section: [] for section in sections}


def skills_by_category(skills: List[Dict]) -> Dict[str, int]:
    """Tally skills per lower-cased category (bar-chart input).

    Entries that are not dicts are ignored; a missing or empty category is
    counted under "other". Keys appear in first-seen order.
    """
    tally: Dict[str, int] = {}
    for entry in skills:
        if isinstance(entry, dict):
            key = (entry.get("category") or "other").lower()
            tally[key] = tally.get(key, 0) + 1
    return tally


def skills_by_level(skills: List[Dict]) -> Dict[str, int]:
    """Tally skills per proficiency level (chart input).

    The result always has the keys "basic", "intermediate" and "advanced".
    Entries that are not dicts are ignored; a missing, empty or unrecognised
    level is counted as "intermediate".
    """
    tally = dict.fromkeys(("basic", "intermediate", "advanced"), 0)
    for entry in skills:
        if isinstance(entry, dict):
            label = (entry.get("level") or "intermediate").lower()
            bucket = label if label in tally else "intermediate"
            tally[bucket] += 1
    return tally


def experience_for_timeline(experience: List[Dict]) -> List[Dict]:
    """
    Normalize experience entries so the timeline can plot them.

    Values are coerced to text before stripping, so non-string values such as
    an integer year (``2020``) no longer raise.

    Args:
        experience: Entries as produced by the extractor. Non-dict items are
            skipped and None is treated as an empty list.

    Returns:
        List of dicts with the keys company, role, start_date, end_date,
        current and description:
        - company / role default to "?" when missing or blank.
        - start_date defaults to "Unknown".
        - end_date is always text: "Actualidad" for a current job with no end
          date, "?" when unknown.
        - description is truncated to 200 characters.
    """
    out = []
    for e in experience or []:
        if not isinstance(e, dict):
            continue
        is_current = bool(e.get("current"))
        end = _as_text(e.get("end_date"))
        if not end:
            # Null and empty-string end dates are treated alike.
            end = "Actualidad" if is_current else "?"
        out.append({
            "company": _as_text(e.get("company"), "?"),
            "role": _as_text(e.get("role"), "?"),
            "start_date": _as_text(e.get("start_date"), "Unknown"),
            "end_date": end,
            "current": is_current,
            "description": _as_text(e.get("description"))[:200],
        })
    return out


def _as_text(value: Any, default: str = "") -> str:
    """Return ``value`` as stripped text, or ``default`` when it is falsy or blank."""
    text = str(value).strip() if value else ""
    return text or default