"""
Profile Extractor - Uses LLM (Groq) to extract structured skills and experience
from document text.

Returns JSON for dashboard: skills (by category/level) and experience (timeline).
"""
import json
import logging
import re
from collections import Counter
from typing import List, Dict, Any

logger = logging.getLogger(__name__)

# Upper bound (in characters) on any document/profile text embedded in a prompt,
# to stay clear of the model's token limit.
MAX_PROMPT_CHARS = 12000

# Maximum number of bullets kept per insight section.
MAX_INSIGHTS_PER_SECTION = 12

# NOTE: both prompts are rendered with str.format(), so every literal JSON
# brace must be doubled ({{ }}); only {text} / {profile_json} are placeholders.
EXTRACT_PROMPT = """Analiza el siguiente texto de CV/perfil profesional y extrae SOLO información que aparezca explícitamente.
Responde ÚNICAMENTE con un bloque JSON válido (sin markdown, sin texto antes o después), con esta estructura exacta:

{{
  "summary": {{
    "headline": "titular corto del perfil (opcional)",
    "estimated_seniority": "junior|mid|senior|lead|unknown",
    "total_years_experience": 0
  }},
  "skills": [
    {{
      "name": "nombre del skill",
      "category": "technical" | "soft" | "tools" | "language",
      "level": "basic" | "intermediate" | "advanced",
      "evidence": "frase corta del documento (opcional)"
    }}
  ],
  "experience": [
    {{
      "company": "nombre empresa",
      "role": "puesto",
      "start_date": "YYYY-MM o año",
      "end_date": "YYYY-MM o null si actual",
      "current": true/false,
      "location": "opcional",
      "description": "breve descripción opcional",
      "highlights": ["logro 1", "logro 2"]
    }}
  ]
}}

Reglas:
- skills: category "technical" = lenguajes, frameworks, bases de datos; "soft" = comunicación, liderazgo; "tools" = Herramientas (Git, Jira); "language" = idiomas.
- experience: start_date y end_date en formato "YYYY" o "YYYY-MM" si se puede inferir. Si es el trabajo actual, end_date puede ser null y current true.
- Extrae SOLO lo que esté en el texto. No inventes datos.
- Si no hay información para skills o experience, devuelve listas vacías [].
- El JSON debe ser válido (comillas dobles, sin comas finales).
- Si no puedes determinar seniority o años, usa "unknown" y 0.

TEXTO DEL DOCUMENTO:
---
{text}
---

Responde solo con el JSON, nada más."""

INSIGHTS_PROMPT = """Eres un analista de carrera.
Te paso un perfil ya extraído de documentos reales (skills + experiencia).

Tu tarea: generar insights accionables SIN inventar información.

Responde ÚNICAMENTE JSON válido (sin markdown), con esta estructura exacta:

{{
  "strengths": ["..."],
  "potential_gaps": ["..."],
  "role_suggestions": ["..."],
  "next_actions": ["..."]
}}

Reglas:
- Todo debe derivarse SOLO del perfil que recibes. Si falta info, dilo en el texto del insight (ej: "No hay evidencia de X en los documentos").
- Sé concreto y breve (bullets de 1 línea).
- No menciones que eres una IA.

PERFIL (JSON):
{profile_json}
"""

_INSIGHT_KEYS = ("strengths", "potential_gaps", "role_suggestions", "next_actions")

_FENCE_RE = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*```")
_TRAILING_COMMA_RE = re.compile(r",\s*([}\]])")

# String spellings treated as "false" / "missing" when the model returns text
# where a JSON boolean or null was expected.
_FALSE_STRINGS = frozenset({"", "false", "no", "0", "null", "none"})
_NULL_STRINGS = frozenset({"null", "none"})


def _empty_profile() -> Dict[str, Any]:
    """Return a fresh empty profile with the full set of keys."""
    return {"summary": {}, "skills": [], "experience": []}


def _empty_insights() -> Dict[str, Any]:
    """Return a fresh empty insights structure with the full set of keys."""
    return {key: [] for key in _INSIGHT_KEYS}


def _clean_str(value: Any, default: str = "") -> str:
    """Coerce a possibly non-string / missing value to stripped text.

    Falsy values and whitespace-only strings yield ``default``.
    """
    text = str(value).strip() if value else ""
    return text or default


def _as_bool(value: Any) -> bool:
    """Interpret a JSON-ish flag, treating strings such as "false" as False."""
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_STRINGS
    return bool(value)


def _as_list(value: Any) -> list:
    """Return ``value`` if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _extract_json_candidate(text: str) -> str:
    """Best-effort: pull a JSON object from model output.

    Unwraps a Markdown code fence if present, keeps the span from the first
    ``{`` to the last ``}``, and removes trailing commas only when the
    candidate does not already parse (so valid JSON whose string values
    happen to contain ``", }"`` is left intact).

    Returns the candidate string, or "" for empty input. The result is not
    guaranteed to be valid JSON.
    """
    if not text:
        return ""

    candidate = text.strip()
    fence = _FENCE_RE.search(candidate)
    if fence:
        candidate = fence.group(1).strip()

    # If there's extra text around the object, keep the outermost {...} span.
    start = candidate.find("{")
    end = candidate.rfind("}")
    if start != -1 and end > start:
        candidate = candidate[start:end + 1]
    candidate = candidate.strip()

    try:
        json.loads(candidate)
    except ValueError:
        # Trailing commas are a common LLM mistake; repair only when needed.
        candidate = _TRAILING_COMMA_RE.sub(r"\1", candidate).strip()
    return candidate


def _load_json_object(content: str) -> Dict[str, Any]:
    """Parse model output into a dict.

    Raises:
        ValueError: if no valid JSON could be parsed (``json.JSONDecodeError``
            is a subclass) or the top-level value is not an object.
    """
    data = json.loads(_extract_json_candidate(content))
    if not isinstance(data, dict):
        raise ValueError("expected a JSON object at the top level")
    return data


def _invoke_llm(llm, prompt: str) -> str:
    """Send ``prompt`` to ``llm`` and return the response text.

    Wraps the prompt in a LangChain ``HumanMessage`` when ``langchain_core``
    is importable; otherwise passes the plain string to ``llm.invoke``.
    """
    try:
        from langchain_core.messages import HumanMessage
    except ImportError:
        payload = prompt
    else:
        payload = [HumanMessage(content=prompt)]

    response = llm.invoke(payload)
    content = response.content if hasattr(response, "content") else response
    return content if isinstance(content, str) else str(content)


def extract_profile_from_text(text: str, llm) -> Dict[str, Any]:
    """
    Call LLM to extract structured profile (skills + experience) from document text.

    llm: LangChain ChatGroq instance (e.g. from CareerAssistant.llm).

    Returns a dict with "summary" (dict), "skills" (list) and "experience"
    (list). On empty input or any LLM/parsing error the same keys are returned
    with empty values; errors are logged rather than raised.
    """
    if not text or not text.strip():
        return _empty_profile()

    # Limit size to avoid token limits (keep only the first MAX_PROMPT_CHARS).
    prompt = EXTRACT_PROMPT.format(text=text.strip()[:MAX_PROMPT_CHARS])
    try:
        data = _load_json_object(_invoke_llm(llm, prompt))
    except Exception:
        # Best-effort by design: the dashboard falls back to an empty profile.
        logger.warning("Profile extraction failed", exc_info=True)
        return _empty_profile()

    summary = data.get("summary")
    return {
        "summary": summary if isinstance(summary, dict) else {},
        "skills": _as_list(data.get("skills")),
        "experience": _as_list(data.get("experience")),
    }


def _clean_bullets(value: Any) -> List[str]:
    """Normalize one insight section to a capped list of non-empty strings."""
    bullets = []
    for item in _as_list(value):
        if item is None:
            # JSON null would otherwise be rendered as the literal "None".
            continue
        text = str(item).strip()
        if text:
            bullets.append(text)
    return bullets[:MAX_INSIGHTS_PER_SECTION]


def generate_dashboard_insights(profile: Dict[str, Any], llm) -> Dict[str, Any]:
    """Generate 'smart' insights based on extracted profile JSON.

    Returns a dict with the keys "strengths", "potential_gaps",
    "role_suggestions" and "next_actions", each a list of at most
    MAX_INSIGHTS_PER_SECTION non-empty strings. On any LLM/parsing error all
    lists are empty; errors are logged rather than raised.
    """
    try:
        profile_json = json.dumps(profile or {}, ensure_ascii=False)[:MAX_PROMPT_CHARS]
        prompt = INSIGHTS_PROMPT.format(profile_json=profile_json)
        data = _load_json_object(_invoke_llm(llm, prompt))
    except Exception:
        # Best-effort by design: the dashboard falls back to empty insights.
        logger.warning("Insight generation failed", exc_info=True)
        return _empty_insights()

    return {key: _clean_bullets(data.get(key)) for key in _INSIGHT_KEYS}


def skills_by_category(skills: List[Dict]) -> Dict[str, int]:
    """Count skills per category for bar chart.

    Categories are stripped and lower-cased; a missing or empty category is
    counted as "other". Non-dict entries are ignored.
    """
    counts = Counter(
        _clean_str(skill.get("category"), "other").lower()
        for skill in skills or []
        if isinstance(skill, dict)
    )
    return dict(counts)


def skills_by_level(skills: List[Dict]) -> Dict[str, int]:
    """Count skills per level for chart.

    Always returns the keys "basic", "intermediate" and "advanced". A missing
    or unrecognized level is counted as "intermediate". Non-dict entries are
    ignored.
    """
    counts = {"basic": 0, "intermediate": 0, "advanced": 0}
    for skill in skills or []:
        if not isinstance(skill, dict):
            continue
        level = _clean_str(skill.get("level"), "intermediate").lower()
        counts[level if level in counts else "intermediate"] += 1
    return counts


def experience_for_timeline(experience: List[Dict]) -> List[Dict]:
    """
    Normalize experience entries for timeline: ensure start_date/end_date for plotting.

    Returns list of dicts with company, role, start_date, end_date, current,
    description. Values are coerced to stripped strings (the model may return
    a bare year as a number). A missing start date becomes "Unknown"; a
    missing end date becomes "Actualidad" for a current job and "?" otherwise.
    The description is truncated to 200 characters. Non-dict entries are
    ignored.
    """
    timeline = []
    for entry in experience or []:
        if not isinstance(entry, dict):
            continue

        is_current = _as_bool(entry.get("current"))

        end = _clean_str(entry.get("end_date"))
        if end.lower() in _NULL_STRINGS:
            # The prompt template shows null inside a string, so models
            # sometimes emit the text "null" instead of JSON null.
            end = ""
        if not end:
            end = "Actualidad" if is_current else "?"

        timeline.append({
            "company": _clean_str(entry.get("company"), "?"),
            "role": _clean_str(entry.get("role"), "?"),
            "start_date": _clean_str(entry.get("start_date"), "Unknown"),
            "end_date": end,
            "current": is_current,
            "description": _clean_str(entry.get("description"))[:200],
        })
    return timeline