"""
Profile Extractor - Uses LLM (Groq) to extract structured skills and experience from document text.
Returns JSON for dashboard: skills (by category/level) and experience (timeline).
"""
import json
import re
from typing import List, Dict, Any
# Prompt for structured CV/profile extraction. Literal JSON braces are doubled
# ("{{" / "}}") so str.format substitutes only the {text} placeholder. The
# prompt body is Spanish on purpose (it instructs the LLM for Spanish-language
# CVs); it is a runtime string and must not be translated or reworded.
EXTRACT_PROMPT = """Analiza el siguiente texto de CV/perfil profesional y extrae SOLO información que aparezca explícitamente.
Responde ÚNICAMENTE con un bloque JSON válido (sin markdown, sin texto antes o después), con esta estructura exacta:
{{
"summary": {{
"headline": "titular corto del perfil (opcional)",
"estimated_seniority": "junior|mid|senior|lead|unknown",
"total_years_experience": 0
}},
"skills": [
{{ "name": "nombre del skill", "category": "technical" | "soft" | "tools" | "language", "level": "basic" | "intermediate" | "advanced", "evidence": "frase corta del documento (opcional)" }}
],
"experience": [
{{ "company": "nombre empresa", "role": "puesto", "start_date": "YYYY-MM o año", "end_date": "YYYY-MM o null si actual", "current": true/false, "location": "opcional", "description": "breve descripción opcional", "highlights": ["logro 1", "logro 2"] }}
]
}}
Reglas:
- skills: category "technical" = lenguajes, frameworks, bases de datos; "soft" = comunicación, liderazgo; "tools" = Herramientas (Git, Jira); "language" = idiomas.
- experience: start_date y end_date en formato "YYYY" o "YYYY-MM" si se puede inferir. Si es el trabajo actual, end_date puede ser null y current true.
- Extrae SOLO lo que esté en el texto. No inventes datos.
- Si no hay información para skills o experience, devuelve listas vacías [].
- El JSON debe ser válido (comillas dobles, sin comas finales).
- Si no puedes determinar seniority o años, usa \"unknown\" y 0.
TEXTO DEL DOCUMENTO:
---
{text}
---
Responde solo con el JSON, nada más."""
def _extract_json_candidate(text: str) -> str:
    """Best-effort: pull a JSON object from model output."""
    if not text:
        return ""
    candidate = text.strip()
    # Prefer the contents of a fenced ``` / ```json block when present.
    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", candidate)
    if fenced is not None:
        candidate = fenced.group(1).strip()
    # Narrow to the outermost {...} span, dropping any surrounding chatter.
    first, last = candidate.find("{"), candidate.rfind("}")
    if -1 < first < last:
        candidate = candidate[first : last + 1]
    # Drop trailing commas before a closing bracket (common LLM mistake).
    candidate = re.sub(r",\s*([}\]])", r"\1", candidate)
    return candidate.strip()
def extract_profile_from_text(text: str, llm) -> Dict[str, Any]:
    """
    Call the LLM to extract a structured profile (skills + experience) from document text.

    Args:
        text: Raw CV/profile text; only the first ~12k characters are sent to
            stay within the model's token limit.
        llm: LangChain chat-model instance (e.g. ChatGroq from
            CareerAssistant.llm) exposing ``invoke``.

    Returns:
        Dict with "summary" (dict), "skills" (list) and "experience" (list).
        On any failure the same structure is returned with empty values, so
        callers never need to special-case errors.
    """
    if not text or not text.strip():
        # Same shape as the success/error paths below (previously "summary"
        # was missing here, giving callers an inconsistent dict).
        return {"summary": {}, "skills": [], "experience": []}
    try:
        from langchain_core.messages import HumanMessage

        # Build the prompt inside the try so any formatting problem degrades
        # gracefully instead of propagating to the caller.
        prompt = EXTRACT_PROMPT.format(text=text.strip()[:12000])
        response = llm.invoke([HumanMessage(content=prompt)])
        content = response.content if hasattr(response, "content") else str(response)
        data = json.loads(_extract_json_candidate(content))
        summary = data.get("summary") or {}
        skills = data.get("skills") or []
        experience = data.get("experience") or []
        # The model occasionally returns the wrong container type; normalize.
        if not isinstance(summary, dict):
            summary = {}
        if not isinstance(skills, list):
            skills = []
        if not isinstance(experience, list):
            experience = []
        return {"summary": summary, "skills": skills, "experience": experience}
    except Exception:
        # Broad by design: transport, model and JSON errors all degrade to an
        # empty profile. (The old `(json.JSONDecodeError, Exception)` tuple
        # was redundant — JSONDecodeError is already an Exception subclass.)
        return {"summary": {}, "skills": [], "experience": []}
INSIGHTS_PROMPT = """Eres un analista de carrera. Te paso un perfil ya extraído de documentos reales (skills + experiencia).\n\nTu tarea: generar insights accionables SIN inventar información.\n\nResponde ÚNICAMENTE JSON válido (sin markdown), con esta estructura exacta:\n\n{\n \"strengths\": [\"...\"],\n \"potential_gaps\": [\"...\"],\n \"role_suggestions\": [\"...\"],\n \"next_actions\": [\"...\"]\n}\n\nReglas:\n- Todo debe derivarse SOLO del perfil que recibes. Si falta info, dilo en el texto del insight (ej: \"No hay evidencia de X en los documentos\").\n- Sé concreto y breve (bullets de 1 línea).\n- No menciones que eres una IA.\n\nPERFIL (JSON):\n{profile_json}\n"""
def generate_dashboard_insights(profile: Dict[str, Any], llm) -> Dict[str, Any]:
    """Generate 'smart' insights based on extracted profile JSON."""
    sections = ("strengths", "potential_gaps", "role_suggestions", "next_actions")
    try:
        from langchain_core.messages import HumanMessage

        # Cap the serialized profile to stay within token limits.
        payload = json.dumps(profile or {}, ensure_ascii=False)[:12000]
        reply = llm.invoke([HumanMessage(content=INSIGHTS_PROMPT.format(profile_json=payload))])
        raw = reply.content if hasattr(reply, "content") else str(reply)
        parsed = json.loads(_extract_json_candidate(raw))
        result: Dict[str, Any] = {}
        for section in sections:
            items = parsed.get(section) or []
            if not isinstance(items, list):
                items = []
            # Keep at most 12 trimmed, non-empty bullet strings per section.
            result[section] = [str(item).strip() for item in items if str(item).strip()][:12]
        return result
    except Exception:
        # Any failure degrades to an empty (but correctly shaped) result.
        return {section: [] for section in sections}
def skills_by_category(skills: List[Dict]) -> Dict[str, int]:
    """Count skills per category for bar chart."""
    tally: Dict[str, int] = {}
    for entry in skills:
        # Ignore anything that is not a skill dict (defensive vs. LLM output).
        if isinstance(entry, dict):
            key = (entry.get("category") or "other").lower()
            tally[key] = 1 + tally.get(key, 0)
    return tally
def skills_by_level(skills: List[Dict]) -> Dict[str, int]:
    """Count skills per level for chart."""
    tally = {"basic": 0, "intermediate": 0, "advanced": 0}
    for entry in skills:
        if not isinstance(entry, dict):
            continue
        lvl = (entry.get("level") or "intermediate").lower()
        # Unknown/missing levels are bucketed as "intermediate".
        tally[lvl if lvl in tally else "intermediate"] += 1
    return tally
def experience_for_timeline(experience: List[Dict]) -> List[Dict]:
    """
    Normalize experience entries for timeline: ensure start_date/end_date for plotting.

    Args:
        experience: Raw entries from the LLM extraction; non-dict items are skipped.

    Returns:
        List of dicts with company, role, start_date, end_date, current, description.
        Values are coerced to strings, so the previous crash when the model
        emitted bare JSON numbers (e.g. 2020 instead of "2020") is fixed.
    """
    out: List[Dict] = []
    for e in experience:
        if not isinstance(e, dict):
            continue
        # str() guards against non-string values from the model's JSON.
        start = str(e.get("start_date") or "").strip() or "Unknown"
        end = e.get("end_date")
        if end is None and e.get("current"):
            end = "Actualidad"  # user-facing label for an ongoing role
        elif not end:
            end = "?"
        else:
            end = str(end).strip() or "?"
        out.append({
            "company": str(e.get("company") or "?").strip() or "?",
            "role": str(e.get("role") or "?").strip() or "?",
            "start_date": start,
            "end_date": end,
            "current": bool(e.get("current")),
            # Cap description length for compact tooltips.
            "description": str(e.get("description") or "").strip()[:200],
        })
    return out