cv_parser/src/parser_flow/CV_agent_flow.py
"""
Orchestrateur CV enrichi avec 3 phases :
Phase 1 : DΓ©coupage du CV en sections
Phase 2 : Extraction parallèle (8 agents existants)
Phase 3 : Analyse & Recommandation parallèle (5 nouveaux agents)
Produit un JSON en 2 parties : informations + recommandations.
"""
import json
import logging
import os
import yaml
import asyncio
from datetime import datetime
from typing import Dict, Any, List
from crewai import Agent, Task, Crew, Process
from src.config.app_config import get_small_llm, get_big_llm
logger = logging.getLogger(__name__)
#_____________________________________________________________________________________
# Logger configuration to capture agent verbosity in a file
verbose_logger = logging.getLogger("crewai_verbose")
verbose_logger.setLevel(logging.INFO)
# Create the log file (mode 'w' overwrites the previous one on each run)
file_handler = logging.FileHandler("agents_trace.log", mode='w', encoding='utf-8')
formatter = logging.Formatter('%(asctime)s - %(message)s')
file_handler.setFormatter(formatter)
verbose_logger.addHandler(file_handler)
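# Each agent created below attaches a step_callback that writes to this logger,
# so every intermediate agent step ends up in agents_trace.log.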
class CVAgentOrchestrator:
"""Orchestrateur multi-agents pour le parsing et l'analyse de CV."""
def __init__(self):
self.llm = get_small_llm()
self.big_llm = get_big_llm()
self.agents_config = self._load_yaml("agents.yaml")
self.tasks_config = self._load_yaml("tasks.yaml")
self.metiers_data = self._load_metiers()
self.skill_domain_map = self._load_skill_domain_map()
self._create_agents()
# ──────────────────────────────────────────────
    # Loading the configuration files
# ──────────────────────────────────────────────
def _load_yaml(self, filename: str) -> Dict:
base_path = os.path.dirname(os.path.dirname(__file__))
config_path = os.path.join(base_path, "config", filename)
with open(config_path, "r", encoding="utf-8") as f:
return yaml.safe_load(f)
def _load_metiers(self) -> List[Dict]:
"""Charge le rΓ©fΓ©rentiel de mΓ©tiers (sans les embeddings pour Γ©conomiser la mΓ©moire)."""
base_path = os.path.dirname(os.path.dirname(__file__))
metiers_path = os.path.join(base_path, "data", "metiers.json")
with open(metiers_path, "r", encoding="utf-8") as f:
data = json.load(f)
metiers = []
for m in data.get("metiers", []):
clean = {k: v for k, v in m.items() if k != "embedding"}
metiers.append(clean)
return metiers
def _load_skill_domain_map(self) -> Dict[str, List[str]]:
"""Charge le mapping compΓ©tences -> domaines."""
base_path = os.path.dirname(os.path.dirname(__file__))
map_path = os.path.join(base_path, "config", "skill_domain_map.json")
with open(map_path, "r", encoding="utf-8") as f:
return json.load(f)
# ──────────────────────────────────────────────
    # Agent creation
# ──────────────────────────────────────────────
def _create_agents(self):
def make_agent(name, llm_override=None):
return Agent(
config=self.agents_config[name],
llm=llm_override or self.llm,
allow_delegation=False,
verbose=True,
max_iter=1,
respect_context_window=True,
                # Log each agent step to the trace file
step_callback=lambda step: verbose_logger.info(f"Agent {name} Step: {step}"),
)
        # Phase 2: extraction agents (existing)
self.cv_splitter = make_agent("cv_splitter", llm_override=self.big_llm)
self.skills_extractor = make_agent("skills_extractor")
self.experience_extractor = make_agent("experience_extractor")
self.project_extractor = make_agent("project_extractor")
self.education_extractor = make_agent("education_extractor")
self.reconversion_detector = make_agent("reconversion_detector")
self.language_extractor = make_agent("language_extractor")
self.etudiant_detector = make_agent("etudiant_detector")
self.identity_extractor = make_agent("identity_extractor")
        # Phase 3: analysis and recommendation agents (new)
self.header_analyzer = make_agent("header_analyzer", llm_override=self.big_llm)
self.metier_matcher = make_agent("metier_matcher", llm_override=self.big_llm)
self.cv_quality_checker = make_agent("cv_quality_checker")
self.project_analyzer = make_agent("project_analyzer")
# ──────────────────────────────────────────────
    # PHASE 1: split the CV into sections
# ──────────────────────────────────────────────
async def split_cv_sections(self, cv_content: str, cv_raw_start: str = "") -> Dict[str, str]:
"""DΓ©coupe le CV en sections via l'agent cv_splitter."""
task_config = self.tasks_config["split_cv_task"].copy()
        # Escape curly braces in the CV content to avoid format() errors
safe_content = cv_content[:20000].replace("{", "{{").replace("}", "}}")
safe_raw = cv_raw_start[:2000].replace("{", "{{").replace("}", "}}")
task_config["description"] = task_config["description"].format(
cv_content=safe_content,
cv_raw_start=safe_raw,
)
task = Task(config=task_config, agent=self.cv_splitter)
crew = Crew(
agents=[self.cv_splitter],
tasks=[task],
process=Process.sequential,
verbose=False,
)
result = await crew.kickoff_async()
parsed = self._parse_json_output(result, default_structure={})
return parsed
# ──────────────────────────────────────────────
    # PHASE 2: parallel extraction (8 agents)
# ──────────────────────────────────────────────
async def extract_all_sections(
self, sections: Dict[str, str], cv_raw_start: str = "", file_name: str = ""
) -> Dict[str, Any]:
"""Exécute les 8 tÒches d'extraction en parallèle."""
def create_task_async(task_key, agent, **kwargs):
t_config = self.tasks_config[task_key].copy()
t_description = t_config["description"]
            # Avoid formatting errors if keys are missing or badly escaped (e.g. curly braces in the CV text)
try:
                # format_map() could be used for more flexibility if needed, but format() is the standard choice
t_config["description"] = t_description.format(**kwargs)
except KeyError as e:
logger.warning(f"KeyError formatting task '{task_key}': {e}. Falling back to manual replace.")
                # Safe manual fallback for the placeholders that are present
desc = t_description
for k, v in kwargs.items():
placeholder = "{" + k + "}"
if placeholder in desc:
desc = desc.replace(placeholder, str(v))
t_config["description"] = desc
except Exception as e:
logger.error(f"Unexpected error formatting task '{task_key}': {e}")
task = Task(config=t_config, agent=agent)
c = Crew(agents=[agent], tasks=[task], verbose=False)
return (task_key, c.kickoff_async())
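        # Each entry below is (task key in tasks.yaml, agent, placeholder values
        # injected into the task description).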
tasks_def = [
(
"skills_task",
self.skills_extractor,
{
"experiences": sections.get("experiences", ""),
"projects": sections.get("projects", ""),
"skills": sections.get("skills", ""),
"education": sections.get("education", ""),
},
),
(
"experience_task",
self.experience_extractor,
{"experiences": sections.get("experiences", "")},
),
(
"project_task",
self.project_extractor,
{"projects": sections.get("projects", "")},
),
(
"education_task",
self.education_extractor,
{"education": sections.get("education", "")},
),
(
"reconversion_task",
self.reconversion_detector,
{
"experiences": sections.get("experiences", ""),
"education": sections.get("education", ""),
},
),
(
"language_task",
self.language_extractor,
{
"languages": sections.get("languages", ""),
"cv_raw_start": cv_raw_start[:500],
},
),
(
"etudiant_task",
self.etudiant_detector,
{
"education": sections.get("education", ""),
"current_date": datetime.now().strftime("%Y-%m-%d"),
},
),
(
"identity_task",
self.identity_extractor,
{
"header": sections.get("header", ""),
"cv_raw_start": cv_raw_start[:1500],
"file_name": file_name,
},
),
]
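        # Run the eight crews concurrently; return_exceptions=True keeps one
        # failing task from aborting the whole extraction phase.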
task_coroutines = [
create_task_async(key, agent, **kwargs) for key, agent, kwargs in tasks_def
]
keys = [t[0] for t in task_coroutines]
coroutines = [t[1] for t in task_coroutines]
results_list = await asyncio.gather(*coroutines, return_exceptions=True)
results_map = {}
for key, result in zip(keys, results_list):
if isinstance(result, Exception):
logger.error(f"Task '{key}' failed: {result}")
else:
results_map[key] = result
return self._aggregate_extraction_results(results_map)
# ──────────────────────────────────────────────
    # PHASE 3: analysis & recommendation (5 agents)
# ──────────────────────────────────────────────
async def analyze_and_recommend(
self,
cv_full_text: str,
sections: Dict[str, str],
extraction: Dict[str, Any],
cv_raw_start: str = "",
) -> Dict[str, Any]:
"""ExΓ©cute les 4 tΓ’ches d'analyse en 2 Γ©tapes optimisΓ©es.
Γ‰tape 3a : header_analyzer seul (rapide, nΓ©cessaire pour tous les autres)
Étape 3b : 3 agents en parallèle (quality, metier, project)
"""
candidat = extraction.get("candidat", {})
        competences = candidat.get("compétences", {})
hard_skills = competences.get("hard_skills", [])
soft_skills = competences.get("soft_skills", [])
skills_with_context = competences.get("skills_with_context", [])
reconversion = candidat.get("reconversion", {})
        # Identify skill domains and methodologies
skill_domains = self._map_skills_to_domains(hard_skills)
methodologies = self._extract_methodologies(hard_skills, skill_domains)
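        # Illustrative shapes only (contents depend on skill_domain_map.json), e.g.
        # skill_domains -> {"devops": ["Docker", "Kubernetes"], ...} and methodologies -> ["Agile", "CI/CD"]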
        # Prepare the summaries for the prompts
        experiences_summary = json.dumps(
            candidat.get("expériences", []), ensure_ascii=False
        )[:3000]
projets = candidat.get("projets", {})
professional_projects = json.dumps(
projets.get("professional", []), ensure_ascii=False
)[:2000]
personal_projects = json.dumps(
projets.get("personal", []), ensure_ascii=False
)[:2000]
projects_summary = f"Pro: {professional_projects}\nPerso: {personal_projects}"
reconversion_data = json.dumps(reconversion, ensure_ascii=False) if reconversion else "{}"
        # Prepare the full job reference list (30 métiers)
metiers_reference = self._prepare_metiers_for_prompt()
        # Skills summary for the header analysis (fallback)
        skills_summary = ", ".join(hard_skills[:20]) if hard_skills else "Non identifiées"
def create_task_async(task_key, agent, **kwargs):
t_config = self.tasks_config[task_key].copy()
t_config["description"] = t_config["description"].format(**kwargs)
task = Task(config=t_config, agent=agent)
c = Crew(agents=[agent], tasks=[task], verbose=False)
return (task_key, c.kickoff_async())
        # Use the raw fitz text if provided, otherwise fall back to the beginning of the Markdown
raw_for_header = cv_raw_start[:2000] if cv_raw_start else cv_full_text[:2000]
header_section = sections.get("header", "")
safe_cv_raw = raw_for_header.replace("{", "{{").replace("}", "}}")
safe_header = header_section.replace("{", "{{").replace("}", "}}")
safe_skills = skills_summary.replace("{", "{{").replace("}", "}}")
header_data = {
"poste_vise": "Non identifiΓ©",
"niveau_seniorite": "non prΓ©cisΓ©",
"confiance": 0,
}
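        # --- Step 3a: run header_analyzer alone; its poste_vise feeds the three parallel tasks below ---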
try:
header_coroutine = create_task_async(
"poste_visΓ©_task",
self.header_analyzer,
cv_raw_start=safe_cv_raw,
header=safe_header,
skills_summary=safe_skills,
)
header_result = await header_coroutine[1]
if header_result:
header_data = self._parse_json_output(
header_result,
{
"poste_vise": "Non identifiΓ©",
"niveau_seniorite": "non prΓ©cisΓ©",
"confiance": 0,
},
)
logger.info(f"Header analyzer result: poste_vise='{header_data.get('poste_vise')}', confiance={header_data.get('confiance')}")
except Exception as e:
logger.error(f"Header analyzer failed: {e}", exc_info=True)
        poste_vise = header_data.get("poste_vise", "Non identifié")
        niveau_seniorite = header_data.get("niveau_seniorite", "non précisé")
        # --- Programmatic fallback when the LLM did not find the targeted job title ---
        if poste_vise == "Non identifié":
            logger.warning("Header analyzer returned 'Non identifié', trying fallback extraction...")
fallback = self._fallback_extract_poste_vise(
cv_full_text, header_section
)
if fallback:
poste_vise = fallback
header_data["poste_vise"] = fallback
header_data["source_detection"] = "fallback_programmatique"
header_data["confiance"] = 70
logger.info(f"Fallback found poste_vise: '{fallback}'")
        # Prepare the métier detail for the project_analyzer
metier_reference_detail = self._get_metier_reference_for_poste(poste_vise)
        # --- Step 3b: 3 agents in parallel ---
parallel_tasks = [
(
"cv_quality_task",
self.cv_quality_checker,
{
"cv_full_text": cv_full_text[:8000],
"cv_raw_start": safe_cv_raw,
"skills_with_context": json.dumps(
skills_with_context, ensure_ascii=False
)[:2000],
"experiences_summary": experiences_summary,
"projects_summary": projects_summary[:2000],
"niveau_seniorite": niveau_seniorite,
"reconversion_data": reconversion_data,
},
),
(
"metier_matching_task",
self.metier_matcher,
{
"poste_vise": poste_vise,
"hard_skills": json.dumps(hard_skills, ensure_ascii=False),
"soft_skills": json.dumps(soft_skills, ensure_ascii=False),
"skill_domains": json.dumps(skill_domains, ensure_ascii=False),
"methodologies": json.dumps(methodologies, ensure_ascii=False),
"experiences_summary": experiences_summary,
"projects_summary": projects_summary[:2000],
"reconversion_data": reconversion_data,
"metiers_reference": metiers_reference,
},
),
(
"project_analysis_task",
self.project_analyzer,
{
"poste_vise": poste_vise,
"metier_reference_detail": metier_reference_detail,
"experiences_summary": experiences_summary,
"professional_projects": professional_projects,
"personal_projects": personal_projects,
"reconversion_data": reconversion_data,
},
),
]
task_coroutines = [
create_task_async(key, agent, **kwargs) for key, agent, kwargs in parallel_tasks
]
keys = [t[0] for t in task_coroutines]
coroutines = [t[1] for t in task_coroutines]
results_list = await asyncio.gather(*coroutines, return_exceptions=True)
analysis_results = {}
for key, result in zip(keys, results_list):
if isinstance(result, Exception):
logger.error(f"Analysis task '{key}' failed: {result}")
else:
analysis_results[key] = result
return self._aggregate_recommendations(
analysis_results,
header_data,
poste_vise,
)
# ──────────────────────────────────────────────
    # Skill -> domain mapping
# ──────────────────────────────────────────────
def _map_skills_to_domains(self, hard_skills: List[str]) -> Dict[str, List[str]]:
"""Mappe les compΓ©tences du candidat Γ  leurs domaines mΓ©tier."""
result = {}
for skill in hard_skills:
skill_lower = skill.lower().strip()
for domain, domain_skills in self.skill_domain_map.items():
if skill_lower in domain_skills:
if domain not in result:
result[domain] = []
result[domain].append(skill)
break
return result
def _prepare_metiers_for_prompt(self) -> str:
"""PrΓ©pare le rΓ©fΓ©rentiel mΓ©tiers COMPLET (30 mΓ©tiers) pour le prompt."""
lines = []
for m in self.metiers_data:
mid = m.get("id", "?")
nom = m.get("nom", "?")
cat = m.get("categorie", "?")
comp = m.get("competences_techniques", [])
outils = m.get("outils_technologies", [])
soft = m.get("competences_soft", [])
niveau = m.get("niveau_etude", "?")
exp = m.get("experience_requise", "?")
lines.append(
f"[{mid}] {nom} ({cat})\n"
f" CompΓ©tences techniques: {', '.join(comp)}\n"
f" Outils: {', '.join(outils)}\n"
f" Soft skills: {', '.join(soft[:3])}\n"
f" Niveau: {niveau} | ExpΓ©rience: {exp}"
)
return "\n\n".join(lines)
def _get_metier_reference_for_poste(self, poste_vise: str) -> str:
"""Trouve les mΓ©tiers les plus proches du poste visΓ© pour contextualiser l'analyse de projets."""
        if not poste_vise or poste_vise == "Non identifié":
return "Aucun métier de référence spécifique. Analyser les projets selon leur qualité intrinsèque."
poste_lower = poste_vise.lower()
scored = []
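        # Simple keyword-overlap scoring: +3 when a keyword of the targeted job title
        # appears in the métier name, +2 in its id, +1 in its description, and +3 when
        # a métier-name keyword appears in the job title.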
for m in self.metiers_data:
nom_lower = m.get("nom", "").lower()
id_lower = m.get("id", "").lower()
desc_lower = m.get("description", "").lower()
score = 0
keywords = [w for w in poste_lower.replace("/", " ").replace("-", " ").split() if len(w) > 2]
for kw in keywords:
if kw in nom_lower:
score += 3
if kw in id_lower:
score += 2
if kw in desc_lower:
score += 1
nom_keywords = [w for w in nom_lower.replace("/", " ").replace("-", " ").split() if len(w) > 2]
for kw in nom_keywords:
if kw in poste_lower:
score += 3
if score > 0:
scored.append((score, m))
scored.sort(key=lambda x: -x[0])
if not scored:
return "Poste visé non trouvé dans le référentiel. Analyser les projets selon leur qualité intrinsèque."
lines = ["MΓ©tier(s) de rΓ©fΓ©rence les plus proches du poste visΓ© :"]
for _, m in scored[:3]:
mid = m.get("id")
nom = m.get("nom")
comp = m.get("competences_techniques", [])
outils = m.get("outils_technologies", [])
missions = m.get("missions_principales", [])
lines.append(
f"\n[{mid}] {nom}\n"
f" CompΓ©tences attendues: {', '.join(comp)}\n"
f" Outils attendus: {', '.join(outils)}\n"
f" Missions principales: {'; '.join(missions[:3])}"
)
return "\n".join(lines)
def _extract_methodologies(self, hard_skills: List[str], skill_domains: Dict[str, List[str]]) -> List[str]:
"""Extrait les mΓ©thodologies de travail du candidat."""
methodology_keywords = {
"agile", "scrum", "kanban", "devops", "ci/cd", "cicd", "tdd", "bdd",
"design thinking", "lean", "safe", "xp", "pair programming",
"code review", "sprint", "product owner", "scrum master",
"rgpd", "rgaa",
}
methodologies = []
for skill in hard_skills:
if skill.lower().strip() in methodology_keywords:
methodologies.append(skill)
if "gestion_projet" in skill_domains:
for skill in skill_domains["gestion_projet"]:
if skill not in methodologies:
methodologies.append(skill)
if "devops" in skill_domains:
for skill in skill_domains["devops"]:
s = skill.lower()
if any(kw in s for kw in ["ci", "cd", "github actions", "gitlab ci"]):
if skill not in methodologies:
methodologies.append(skill)
return methodologies
# ──────────────────────────────────────────────
    # Aggregation of the extraction results (Phase 2)
# ──────────────────────────────────────────────
def _aggregate_extraction_results(self, results_map: Dict[str, Any]) -> Dict[str, Any]:
"""Agrège les résultats d'extraction (identique au module existant)."""
def get_parsed(key, default=None):
if key not in results_map:
return default
return self._parse_json_output(results_map[key], default)
competences = get_parsed("skills_task", {"hard_skills": [], "soft_skills": []})
experiences = get_parsed("experience_task", [])
projets = get_parsed("project_task", {"professional": [], "personal": []})
formations = get_parsed("education_task", [])
reconversion = get_parsed("reconversion_task", {}).get(
"reconversion_analysis", {}
)
etudiant_data = get_parsed("etudiant_task", {}).get("etudiant_analysis", {})
latest_end_date = etudiant_data.get("latest_education_end_date")
if latest_end_date:
is_student_by_date = self._is_still_student(latest_end_date)
etudiant_data["is_etudiant"] = is_student_by_date
langues_raw = get_parsed("language_task", {})
if isinstance(competences, dict):
raw_skills = competences.get("hard_skills", [])
seen = set()
unique_skills = []
for skill in raw_skills:
key = (
str(skill).lower()
if not isinstance(skill, str)
else skill.lower()
)
if key not in seen:
seen.add(key)
unique_skills.append(skill)
competences["hard_skills"] = unique_skills
identity = get_parsed("identity_task", {})
return {
"candidat": {
"first_name": (
identity.get("first_name")
if isinstance(identity, dict)
else None
),
"compΓ©tences": competences,
"expΓ©riences": experiences,
"reconversion": reconversion,
"projets": projets,
"formations": formations,
"etudiant": etudiant_data,
"langues": (
langues_raw.get("langues", [])
if isinstance(langues_raw, dict)
else []
),
}
}
# ──────────────────────────────────────────────
    # Aggregation of the recommendations (Phase 3)
# ──────────────────────────────────────────────
def _aggregate_recommendations(
self,
analysis_results: Dict[str, Any],
header_data: Dict,
poste_vise: str,
) -> Dict[str, Any]:
"""Agrège les résultats d'analyse avec des recommandations orientées projets."""
def get_parsed(key, default=None):
if key not in analysis_results:
return default
return self._parse_json_output(analysis_results[key], default)
metier_data = get_parsed("metier_matching_task", {"postes_recommandes": []})
quality_data = get_parsed(
"cv_quality_task",
{"score_global": 0, "red_flags": [], "conseils_prioritaires": []},
)
project_data = get_parsed("project_analysis_task", {"analyse_projets": []})
        # ── Improvement advice ────────────────────────────────────────────────
conseils = []
        # 1. CV quality advice
if isinstance(quality_data, dict):
conseils.extend(quality_data.get("conseils_prioritaires", []))
        # 2. Projects to highlight
if isinstance(project_data, dict):
for item in (project_data.get("ordre_mise_en_avant", []) or [])[:3]:
if isinstance(item, dict) and item.get("raison"):
conseils.append(
f"Projet prioritaire #{item.get('rang', '?')} Γ  mettre en avant"
f" - '{item.get('titre', '?')}' : {item['raison']}"
)
return {
"header_analysis": header_data,
"postes_recommandes": (
metier_data.get("postes_recommandes", [])
if isinstance(metier_data, dict)
else []
),
"analyse_poste_vise": (
metier_data.get("analyse_poste_vise", "")
if isinstance(metier_data, dict)
else ""
),
"qualite_cv": quality_data,
"analyse_projets": (
project_data.get("analyse_projets", [])
if isinstance(project_data, dict)
else []
),
"ordre_mise_en_avant_projets": (
project_data.get("ordre_mise_en_avant", [])
if isinstance(project_data, dict)
else []
),
"coherence_globale_projets": (
project_data.get("coherence_globale", {})
if isinstance(project_data, dict)
else {}
),
"conseils_amelioration": conseils,
}
# ──────────────────────────────────────────────
    # Utilities
# ──────────────────────────────────────────────
def _fallback_extract_poste_vise(
self, cv_full_text: str, header_section: str
) -> str:
"""Extraction programmatique du poste visΓ© en fallback.
Cherche la ligne de titre dans l'en-tΓͺte du CV en filtrant les lignes
qui ne sont clairement PAS un titre de poste (email, tΓ©lΓ©phone, liens,
titres de section, compΓ©tences techniques).
"""
import re
        # Patterns that are NOT a job title
skip_patterns = [
r"^#{1,6}\s", # Titres markdown
r"@", # Email
r"^\+?\d[\d\s\-\.]{7,}", # TΓ©lΓ©phone
r"^http|^www\.|linkedin|github", # URLs/liens
r"^\*{1,3}[A-Z]", # Bold section headers
r"^(CONTACT|LIENS|STACK|LANGUES|CENTRES|EXPERIENCE|FORMATION|PROJET|COMPÉTENCES|EDUCATION)", # Section headings
r"^(Python|SQL|JavaScript|React|FastAPI|Docker|AWS|Git|CI)", # Skills
r"^(Ile-de-France|Paris|Lyon|Marseille|France)", # Locations
r"^\d{2}\s?\d{2}\s?\d{2}", # Phone numbers
r"^(FranΓ§ais|Anglais|Portugais|Espagnol)", # Languages
r"^(Langages|Frameworks|Analytics|DevOps|MΓ©thodologies|IA &|BI :)", # Skill categories
r"^(Blockchain|Jeux de rΓ΄le|RandonnΓ©e)", # Interests
r"^\s*$", # Empty lines
r"^[\*\-\|]", # List items and table separators
]
        # Keywords that DO indicate a job title
title_indicators = [
"dΓ©veloppeur", "developer", "ingΓ©nieur", "engineer", "chef de projet",
"data analyst", "data scientist", "data engineer", "consultant",
"architecte", "manager", "lead", "senior", "junior", "fullstack",
"full-stack", "full stack", "backend", "frontend", "devops",
"product", "project", "spΓ©cialiste", "expert", "analyste",
"mlops", "ai", "ia", "machine learning", "nlp", "deep learning",
]
def _has_title_indicator(text_lower: str) -> bool:
for indicator in title_indicators:
if len(indicator) <= 3:
if re.search(r"\b" + re.escape(indicator) + r"\b", text_lower):
return True
else:
if indicator in text_lower:
return True
return False
def _is_likely_title(line: str) -> bool:
stripped = line.strip().strip("#*_ ")
if len(line.split()) > 10:
return False
for pattern in skip_patterns:
if re.match(pattern, stripped, re.IGNORECASE):
return False
return _has_title_indicator(stripped.lower())
        # Search all sources, in priority order
sources = [
("header", header_section),
("cv_text", cv_full_text[:3000]),
]
for source_name, text in sources:
if not text:
continue
lines = text.split("\n")
for line in lines:
if _is_likely_title(line):
clean = line.strip().strip("#*_ ")
logger.info(f"Fallback: found title in {source_name}: '{clean}'")
return clean
return ""
def _is_still_student(self, date_str: str) -> bool:
"""DΓ©termine si le candidat est encore Γ©tudiant Γ  partir de la date de fin d'Γ©tudes."""
if not date_str:
return False
date_str = str(date_str).lower().strip()
ongoing_keywords = [
"present", "prΓ©sent", "current", "cours", "aujourd'hui", "now"
]
if any(keyword in date_str for keyword in ongoing_keywords):
return True
try:
now = datetime.now()
end_date = None
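            # Accepted formats: YYYY-MM-DD, YYYY-MM, MM/YYYY, MM/YY and bare YYYY
            # (a bare year is treated as ending on December 31).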
if len(date_str) == 10 and date_str[4] == "-" and date_str[7] == "-":
end_date = datetime.strptime(date_str, "%Y-%m-%d")
elif len(date_str) == 7 and date_str[4] == "-":
end_date = datetime.strptime(date_str, "%Y-%m")
elif "/" in date_str:
parts = date_str.split("/")
if len(parts) == 2:
_, y = parts
if len(y) == 4:
end_date = datetime.strptime(date_str, "%m/%Y")
elif len(y) == 2:
end_date = datetime.strptime(date_str, "%m/%y")
elif len(date_str) == 4 and date_str.isdigit():
end_date = datetime.strptime(date_str, "%Y")
end_date = end_date.replace(month=12, day=31)
if end_date:
return end_date >= now
return False
except (ValueError, IndexError):
logger.warning(f"Date parsing failed for: {date_str}")
return False
def _parse_json_output(self, crew_output, default_structure=None) -> Any:
"""Parse la sortie JSON d'un agent CrewAI avec nettoyage robuste."""
if crew_output is None:
return default_structure if default_structure is not None else {}
raw = crew_output.raw if hasattr(crew_output, "raw") else str(crew_output)
        # Extract the JSON block if it is wrapped in backticks
if "```json" in raw:
raw = raw.split("```json")[1].split("```")[0].strip()
elif "```" in raw:
parts = raw.split("```")
if len(parts) >= 3:
raw = parts[1].strip()
raw = raw.strip().lstrip("\ufeff")
def _try_parse(text: str):
"""Tente un parse direct puis un parse avec extraction du premier bloc JSON."""
try:
return json.loads(text)
except json.JSONDecodeError:
pass
for start_char, end_char in [("{", "}"), ("[", "]")]:
start_idx = text.find(start_char)
end_idx = text.rfind(end_char)
if start_idx != -1 and end_idx > start_idx:
try:
return json.loads(text[start_idx : end_idx + 1])
except json.JSONDecodeError:
pass
return None
        # Attempt 1: parse the text as-is (handles "JSON : {...}" as well as clean JSON)
result = _try_parse(raw)
if result is not None:
return result
        # Attempt 2: the LLM copied the {{ }} from the YAML expected_output.
        # ⚠️ Only substitute when {{ is detected; this avoids breaking a valid
        # compact JSON like {"inner": {"key": "val"}} into {"inner": {"key": "val"}
if "{{" in raw:
cleaned = raw.replace("{{", "{").replace("}}", "}")
result = _try_parse(cleaned)
if result is not None:
return result
logger.error(f"JSON Parse Error (after cleanup): {raw[:200]}")
return default_structure if default_structure is not None else {}
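

# Minimal usage sketch (illustrative only): it assumes a CV already converted to
# Markdown upstream, reachable LLM backends from app_config, and the YAML/JSON
# config files in place. The demo input and the output key names
# ("informations" / "recommandations") are placeholders, not the production entry point.
if __name__ == "__main__":
    async def _demo():
        orchestrator = CVAgentOrchestrator()
        cv_markdown = "..."  # Markdown produced by the upstream PDF conversion step
        sections = await orchestrator.split_cv_sections(cv_markdown)
        extraction = await orchestrator.extract_all_sections(sections)
        recommendations = await orchestrator.analyze_and_recommend(
            cv_markdown, sections, extraction
        )
        print(json.dumps(
            {"informations": extraction, "recommandations": recommendations},
            ensure_ascii=False, indent=2,
        ))

    asyncio.run(_demo())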