Spaces:
Sleeping
Sleeping
| """ | |
| Orchestrateur CV enrichi avec 3 phases : | |
| Phase 1 : DΓ©coupage du CV en sections | |
| Phase 2 : Extraction parallèle (8 agents existants) | |
| Phase 3 : Analyse & Recommandation parallèle (5 nouveaux agents) | |
| Produit un JSON en 2 parties : informations + recommandations. | |
| """ | |
| import json | |
| import logging | |
| import os | |
| import yaml | |
| import asyncio | |
| from datetime import datetime | |
| from typing import Dict, Any, List | |
| from crewai import Agent, Task, Crew, Process | |
| from src.config.app_config import get_small_llm, get_big_llm | |
logger = logging.getLogger(__name__)
#_____________________________________________________________________________________
# Dedicated logger that captures CrewAI agent verbosity into a trace file,
# separate from the module logger above.
verbose_logger = logging.getLogger("crewai_verbose")
verbose_logger.setLevel(logging.INFO)
# The trace file is truncated on every run (mode='w') so it only ever holds
# the latest execution.
file_handler = logging.FileHandler("agents_trace.log", mode='w', encoding='utf-8')
formatter = logging.Formatter('%(asctime)s - %(message)s')
file_handler.setFormatter(formatter)
verbose_logger.addHandler(file_handler)
class CVAgentOrchestrator:
    """Multi-agent orchestrator for CV parsing and analysis.

    Three-phase pipeline (see module docstring):
      Phase 1 - split the CV into sections,
      Phase 2 - parallel extraction (8 agents),
      Phase 3 - parallel analysis & recommendation.
    """

    def __init__(self):
        # Small LLM for the cheap extraction agents, big LLM for the harder
        # splitting/header/matching agents (assigned in _create_agents).
        self.llm = get_small_llm()
        self.big_llm = get_big_llm()
        # YAML-driven agent and task definitions.
        self.agents_config = self._load_yaml("agents.yaml")
        self.tasks_config = self._load_yaml("tasks.yaml")
        # Job referential (embeddings stripped) and skill -> domain mapping.
        self.metiers_data = self._load_metiers()
        self.skill_domain_map = self._load_skill_domain_map()
        self._create_agents()
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Chargement des configurations | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _load_yaml(self, filename: str) -> Dict: | |
| base_path = os.path.dirname(os.path.dirname(__file__)) | |
| config_path = os.path.join(base_path, "config", filename) | |
| with open(config_path, "r", encoding="utf-8") as f: | |
| return yaml.safe_load(f) | |
| def _load_metiers(self) -> List[Dict]: | |
| """Charge le rΓ©fΓ©rentiel de mΓ©tiers (sans les embeddings pour Γ©conomiser la mΓ©moire).""" | |
| base_path = os.path.dirname(os.path.dirname(__file__)) | |
| metiers_path = os.path.join(base_path, "data", "metiers.json") | |
| with open(metiers_path, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| metiers = [] | |
| for m in data.get("metiers", []): | |
| clean = {k: v for k, v in m.items() if k != "embedding"} | |
| metiers.append(clean) | |
| return metiers | |
| def _load_skill_domain_map(self) -> Dict[str, List[str]]: | |
| """Charge le mapping compΓ©tences -> domaines.""" | |
| base_path = os.path.dirname(os.path.dirname(__file__)) | |
| map_path = os.path.join(base_path, "config", "skill_domain_map.json") | |
| with open(map_path, "r", encoding="utf-8") as f: | |
| return json.load(f) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # CrΓ©ation des agents | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _create_agents(self): | |
| def make_agent(name, llm_override=None): | |
| return Agent( | |
| config=self.agents_config[name], | |
| llm=llm_override or self.llm, | |
| allow_delegation=False, | |
| verbose=True, | |
| max_iter=1, | |
| respect_context_window=True, | |
| # logs callbackagent | |
| step_callback=lambda step: verbose_logger.info(f"Agent {name} Step: {step}"), | |
| ) | |
| # Phase 2 : Agents d'extraction (existants) | |
| self.cv_splitter = make_agent("cv_splitter", llm_override=self.big_llm) | |
| self.skills_extractor = make_agent("skills_extractor") | |
| self.experience_extractor = make_agent("experience_extractor") | |
| self.project_extractor = make_agent("project_extractor") | |
| self.education_extractor = make_agent("education_extractor") | |
| self.reconversion_detector = make_agent("reconversion_detector") | |
| self.language_extractor = make_agent("language_extractor") | |
| self.etudiant_detector = make_agent("etudiant_detector") | |
| self.identity_extractor = make_agent("identity_extractor") | |
| # Phase 3 : Agents d'analyse et recommandation (nouveaux) | |
| self.header_analyzer = make_agent("header_analyzer", llm_override=self.big_llm) | |
| self.metier_matcher = make_agent("metier_matcher", llm_override=self.big_llm) | |
| self.cv_quality_checker = make_agent("cv_quality_checker") | |
| self.project_analyzer = make_agent("project_analyzer") | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # PHASE 1 : DΓ©coupage du CV en sections | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| async def split_cv_sections(self, cv_content: str, cv_raw_start: str = "") -> Dict[str, str]: | |
| """DΓ©coupe le CV en sections via l'agent cv_splitter.""" | |
| task_config = self.tasks_config["split_cv_task"].copy() | |
| # Γchapper les accolades dans le contenu CV pour Γ©viter les erreurs de format | |
| safe_content = cv_content[:20000].replace("{", "{{").replace("}", "}}") | |
| safe_raw = cv_raw_start[:2000].replace("{", "{{").replace("}", "}}") | |
| task_config["description"] = task_config["description"].format( | |
| cv_content=safe_content, | |
| cv_raw_start=safe_raw, | |
| ) | |
| task = Task(config=task_config, agent=self.cv_splitter) | |
| crew = Crew( | |
| agents=[self.cv_splitter], | |
| tasks=[task], | |
| process=Process.sequential, | |
| verbose=False, | |
| ) | |
| result = await crew.kickoff_async() | |
| parsed = self._parse_json_output(result, default_structure={}) | |
| return parsed | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # PHASE 2 : Extraction parallèle (8 agents) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
    async def extract_all_sections(
        self, sections: Dict[str, str], cv_raw_start: str = "", file_name: str = ""
    ) -> Dict[str, Any]:
        """Phase 2: run the 8 extraction tasks concurrently.

        Args:
            sections: CV sections produced by ``split_cv_sections``.
            cv_raw_start: raw text of the CV's beginning, extra context for the
                language/identity extractors (truncated per task below).
            file_name: original file name, passed to the identity extractor.

        Returns:
            Aggregated extraction payload (see ``_aggregate_extraction_results``).
        """
        def create_task_async(task_key, agent, **kwargs):
            t_config = self.tasks_config[task_key].copy()
            t_description = t_config["description"]
            # Guard against formatting errors when keys are missing or badly
            # escaped (e.g. stray braces inside the CV text).
            try:
                # format_map would be more flexible if ever needed, but format() is standard.
                t_config["description"] = t_description.format(**kwargs)
            except KeyError as e:
                logger.warning(f"KeyError formatting task '{task_key}': {e}. Falling back to manual replace.")
                # Safe manual fallback: substitute only the placeholders actually present.
                desc = t_description
                for k, v in kwargs.items():
                    placeholder = "{" + k + "}"
                    if placeholder in desc:
                        desc = desc.replace(placeholder, str(v))
                t_config["description"] = desc
            except Exception as e:
                # Description stays unformatted in this case; the task still runs.
                logger.error(f"Unexpected error formatting task '{task_key}': {e}")
            task = Task(config=t_config, agent=agent)
            # One single-task crew per agent so the kickoffs can run concurrently.
            c = Crew(agents=[agent], tasks=[task], verbose=False)
            return (task_key, c.kickoff_async())
        # (task key, agent, prompt variables) for each of the 8 extractors.
        tasks_def = [
            (
                "skills_task",
                self.skills_extractor,
                {
                    "experiences": sections.get("experiences", ""),
                    "projects": sections.get("projects", ""),
                    "skills": sections.get("skills", ""),
                    "education": sections.get("education", ""),
                },
            ),
            (
                "experience_task",
                self.experience_extractor,
                {"experiences": sections.get("experiences", "")},
            ),
            (
                "project_task",
                self.project_extractor,
                {"projects": sections.get("projects", "")},
            ),
            (
                "education_task",
                self.education_extractor,
                {"education": sections.get("education", "")},
            ),
            (
                "reconversion_task",
                self.reconversion_detector,
                {
                    "experiences": sections.get("experiences", ""),
                    "education": sections.get("education", ""),
                },
            ),
            (
                "language_task",
                self.language_extractor,
                {
                    "languages": sections.get("languages", ""),
                    "cv_raw_start": cv_raw_start[:500],
                },
            ),
            (
                "etudiant_task",
                self.etudiant_detector,
                {
                    "education": sections.get("education", ""),
                    "current_date": datetime.now().strftime("%Y-%m-%d"),
                },
            ),
            (
                "identity_task",
                self.identity_extractor,
                {
                    "header": sections.get("header", ""),
                    "cv_raw_start": cv_raw_start[:1500],
                    "file_name": file_name,
                },
            ),
        ]
        task_coroutines = [
            create_task_async(key, agent, **kwargs) for key, agent, kwargs in tasks_def
        ]
        keys = [t[0] for t in task_coroutines]
        coroutines = [t[1] for t in task_coroutines]
        # return_exceptions=True: one failed extractor must not sink the others.
        results_list = await asyncio.gather(*coroutines, return_exceptions=True)
        results_map = {}
        for key, result in zip(keys, results_list):
            if isinstance(result, Exception):
                # Failed tasks are simply omitted; aggregation uses defaults.
                logger.error(f"Task '{key}' failed: {result}")
            else:
                results_map[key] = result
        return self._aggregate_extraction_results(results_map)
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # PHASE 3 : Analyse & Recommandation (5 agents) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
    async def analyze_and_recommend(
        self,
        cv_full_text: str,
        sections: Dict[str, str],
        extraction: Dict[str, Any],
        cv_raw_start: str = "",
    ) -> Dict[str, Any]:
        """Phase 3: run the 4 analysis tasks in 2 optimized steps.

        Step 3a: header_analyzer alone (fast, its output feeds the others).
        Step 3b: 3 agents in parallel (quality, metier, project).
        """
        candidat = extraction.get("candidat", {})
        competences = candidat.get("compétences", {})
        hard_skills = competences.get("hard_skills", [])
        soft_skills = competences.get("soft_skills", [])
        skills_with_context = competences.get("skills_with_context", [])
        reconversion = candidat.get("reconversion", {})
        # Identify the candidate's skill domains and methodologies.
        skill_domains = self._map_skills_to_domains(hard_skills)
        methodologies = self._extract_methodologies(hard_skills, skill_domains)
        # Prepare prompt summaries (truncated to keep prompt sizes bounded).
        experiences_summary = json.dumps(
            candidat.get("expériences", []), ensure_ascii=False
        )[:3000]
        projets = candidat.get("projets", {})
        professional_projects = json.dumps(
            projets.get("professional", []), ensure_ascii=False
        )[:2000]
        personal_projects = json.dumps(
            projets.get("personal", []), ensure_ascii=False
        )[:2000]
        projects_summary = f"Pro: {professional_projects}\nPerso: {personal_projects}"
        reconversion_data = json.dumps(reconversion, ensure_ascii=False) if reconversion else "{}"
        # Prepare the FULL job referential (30 jobs) for the matcher prompt.
        metiers_reference = self._prepare_metiers_for_prompt()
        # Skills digest for the header analysis (fallback context).
        skills_summary = ", ".join(hard_skills[:20]) if hard_skills else "Non identifiées"
        def create_task_async(task_key, agent, **kwargs):
            # Same single-task-crew pattern as phase 2, without format fallback:
            # phase-3 inputs are JSON-dumped or pre-escaped below.
            t_config = self.tasks_config[task_key].copy()
            t_config["description"] = t_config["description"].format(**kwargs)
            task = Task(config=t_config, agent=agent)
            c = Crew(agents=[agent], tasks=[task], verbose=False)
            return (task_key, c.kickoff_async())
        # Use the raw fitz text when provided, otherwise fall back to the start
        # of the Markdown conversion.
        raw_for_header = cv_raw_start[:2000] if cv_raw_start else cv_full_text[:2000]
        header_section = sections.get("header", "")
        # Escape braces so str.format() on the task description cannot break.
        safe_cv_raw = raw_for_header.replace("{", "{{").replace("}", "}}")
        safe_header = header_section.replace("{", "{{").replace("}", "}}")
        safe_skills = skills_summary.replace("{", "{{").replace("}", "}}")
        # Defaults used when the header analyzer fails or returns nothing.
        header_data = {
            "poste_vise": "Non identifié",
            "niveau_seniorite": "non précisé",
            "confiance": 0,
        }
        try:
            # --- Step 3a: header analysis, awaited alone ---
            header_coroutine = create_task_async(
                "poste_visé_task",
                self.header_analyzer,
                cv_raw_start=safe_cv_raw,
                header=safe_header,
                skills_summary=safe_skills,
            )
            header_result = await header_coroutine[1]
            if header_result:
                header_data = self._parse_json_output(
                    header_result,
                    {
                        "poste_vise": "Non identifié",
                        "niveau_seniorite": "non précisé",
                        "confiance": 0,
                    },
                )
            logger.info(f"Header analyzer result: poste_vise='{header_data.get('poste_vise')}', confiance={header_data.get('confiance')}")
        except Exception as e:
            logger.error(f"Header analyzer failed: {e}", exc_info=True)
        poste_vise = header_data.get("poste_vise", "Non identifié")
        niveau_seniorite = header_data.get("niveau_seniorite", "non précisé")
        # --- Programmatic fallback when the LLM could not find the job title ---
        if poste_vise == "Non identifié":
            logger.warning("Header analyzer returned 'Non identifié', trying fallback extraction...")
            fallback = self._fallback_extract_poste_vise(
                cv_full_text, header_section
            )
            if fallback:
                poste_vise = fallback
                header_data["poste_vise"] = fallback
                header_data["source_detection"] = "fallback_programmatique"
                header_data["confiance"] = 70
                logger.info(f"Fallback found poste_vise: '{fallback}'")
        # Job-referential detail used to contextualize the project analysis.
        metier_reference_detail = self._get_metier_reference_for_poste(poste_vise)
        # --- Step 3b: 3 agents in parallel ---
        parallel_tasks = [
            (
                "cv_quality_task",
                self.cv_quality_checker,
                {
                    "cv_full_text": cv_full_text[:8000],
                    "cv_raw_start": safe_cv_raw,
                    "skills_with_context": json.dumps(
                        skills_with_context, ensure_ascii=False
                    )[:2000],
                    "experiences_summary": experiences_summary,
                    "projects_summary": projects_summary[:2000],
                    "niveau_seniorite": niveau_seniorite,
                    "reconversion_data": reconversion_data,
                },
            ),
            (
                "metier_matching_task",
                self.metier_matcher,
                {
                    "poste_vise": poste_vise,
                    "hard_skills": json.dumps(hard_skills, ensure_ascii=False),
                    "soft_skills": json.dumps(soft_skills, ensure_ascii=False),
                    "skill_domains": json.dumps(skill_domains, ensure_ascii=False),
                    "methodologies": json.dumps(methodologies, ensure_ascii=False),
                    "experiences_summary": experiences_summary,
                    "projects_summary": projects_summary[:2000],
                    "reconversion_data": reconversion_data,
                    "metiers_reference": metiers_reference,
                },
            ),
            (
                "project_analysis_task",
                self.project_analyzer,
                {
                    "poste_vise": poste_vise,
                    "metier_reference_detail": metier_reference_detail,
                    "experiences_summary": experiences_summary,
                    "professional_projects": professional_projects,
                    "personal_projects": personal_projects,
                    "reconversion_data": reconversion_data,
                },
            ),
        ]
        task_coroutines = [
            create_task_async(key, agent, **kwargs) for key, agent, kwargs in parallel_tasks
        ]
        keys = [t[0] for t in task_coroutines]
        coroutines = [t[1] for t in task_coroutines]
        # One failed analysis task must not sink the other two.
        results_list = await asyncio.gather(*coroutines, return_exceptions=True)
        analysis_results = {}
        for key, result in zip(keys, results_list):
            if isinstance(result, Exception):
                logger.error(f"Analysis task '{key}' failed: {result}")
            else:
                analysis_results[key] = result
        return self._aggregate_recommendations(
            analysis_results,
            header_data,
            poste_vise,
        )
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Mapping compΓ©tences -> domaines | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _map_skills_to_domains(self, hard_skills: List[str]) -> Dict[str, List[str]]: | |
| """Mappe les compΓ©tences du candidat Γ leurs domaines mΓ©tier.""" | |
| result = {} | |
| for skill in hard_skills: | |
| skill_lower = skill.lower().strip() | |
| for domain, domain_skills in self.skill_domain_map.items(): | |
| if skill_lower in domain_skills: | |
| if domain not in result: | |
| result[domain] = [] | |
| result[domain].append(skill) | |
| break | |
| return result | |
| def _prepare_metiers_for_prompt(self) -> str: | |
| """PrΓ©pare le rΓ©fΓ©rentiel mΓ©tiers COMPLET (30 mΓ©tiers) pour le prompt.""" | |
| lines = [] | |
| for m in self.metiers_data: | |
| mid = m.get("id", "?") | |
| nom = m.get("nom", "?") | |
| cat = m.get("categorie", "?") | |
| comp = m.get("competences_techniques", []) | |
| outils = m.get("outils_technologies", []) | |
| soft = m.get("competences_soft", []) | |
| niveau = m.get("niveau_etude", "?") | |
| exp = m.get("experience_requise", "?") | |
| lines.append( | |
| f"[{mid}] {nom} ({cat})\n" | |
| f" CompΓ©tences techniques: {', '.join(comp)}\n" | |
| f" Outils: {', '.join(outils)}\n" | |
| f" Soft skills: {', '.join(soft[:3])}\n" | |
| f" Niveau: {niveau} | ExpΓ©rience: {exp}" | |
| ) | |
| return "\n\n".join(lines) | |
| def _get_metier_reference_for_poste(self, poste_vise: str) -> str: | |
| """Trouve les mΓ©tiers les plus proches du poste visΓ© pour contextualiser l'analyse de projets.""" | |
| if not poste_vise or poste_vise == "Non identifiΓ©": | |
| return "Aucun métier de référence spécifique. Analyser les projets selon leur qualité intrinsèque." | |
| poste_lower = poste_vise.lower() | |
| scored = [] | |
| for m in self.metiers_data: | |
| nom_lower = m.get("nom", "").lower() | |
| id_lower = m.get("id", "").lower() | |
| desc_lower = m.get("description", "").lower() | |
| score = 0 | |
| keywords = [w for w in poste_lower.replace("/", " ").replace("-", " ").split() if len(w) > 2] | |
| for kw in keywords: | |
| if kw in nom_lower: | |
| score += 3 | |
| if kw in id_lower: | |
| score += 2 | |
| if kw in desc_lower: | |
| score += 1 | |
| nom_keywords = [w for w in nom_lower.replace("/", " ").replace("-", " ").split() if len(w) > 2] | |
| for kw in nom_keywords: | |
| if kw in poste_lower: | |
| score += 3 | |
| if score > 0: | |
| scored.append((score, m)) | |
| scored.sort(key=lambda x: -x[0]) | |
| if not scored: | |
| return "Poste visé non trouvé dans le référentiel. Analyser les projets selon leur qualité intrinsèque." | |
| lines = ["MΓ©tier(s) de rΓ©fΓ©rence les plus proches du poste visΓ© :"] | |
| for _, m in scored[:3]: | |
| mid = m.get("id") | |
| nom = m.get("nom") | |
| comp = m.get("competences_techniques", []) | |
| outils = m.get("outils_technologies", []) | |
| missions = m.get("missions_principales", []) | |
| lines.append( | |
| f"\n[{mid}] {nom}\n" | |
| f" CompΓ©tences attendues: {', '.join(comp)}\n" | |
| f" Outils attendus: {', '.join(outils)}\n" | |
| f" Missions principales: {'; '.join(missions[:3])}" | |
| ) | |
| return "\n".join(lines) | |
| def _extract_methodologies(self, hard_skills: List[str], skill_domains: Dict[str, List[str]]) -> List[str]: | |
| """Extrait les mΓ©thodologies de travail du candidat.""" | |
| methodology_keywords = { | |
| "agile", "scrum", "kanban", "devops", "ci/cd", "cicd", "tdd", "bdd", | |
| "design thinking", "lean", "safe", "xp", "pair programming", | |
| "code review", "sprint", "product owner", "scrum master", | |
| "rgpd", "rgaa", | |
| } | |
| methodologies = [] | |
| for skill in hard_skills: | |
| if skill.lower().strip() in methodology_keywords: | |
| methodologies.append(skill) | |
| if "gestion_projet" in skill_domains: | |
| for skill in skill_domains["gestion_projet"]: | |
| if skill not in methodologies: | |
| methodologies.append(skill) | |
| if "devops" in skill_domains: | |
| for skill in skill_domains["devops"]: | |
| s = skill.lower() | |
| if any(kw in s for kw in ["ci", "cd", "github actions", "gitlab ci"]): | |
| if skill not in methodologies: | |
| methodologies.append(skill) | |
| return methodologies | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # AgrΓ©gation des rΓ©sultats d'extraction (Phase 2) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _aggregate_extraction_results(self, results_map: Dict[str, Any]) -> Dict[str, Any]: | |
| """Agrège les résultats d'extraction (identique au module existant).""" | |
| def get_parsed(key, default=None): | |
| if key not in results_map: | |
| return default | |
| return self._parse_json_output(results_map[key], default) | |
| competences = get_parsed("skills_task", {"hard_skills": [], "soft_skills": []}) | |
| experiences = get_parsed("experience_task", []) | |
| projets = get_parsed("project_task", {"professional": [], "personal": []}) | |
| formations = get_parsed("education_task", []) | |
| reconversion = get_parsed("reconversion_task", {}).get( | |
| "reconversion_analysis", {} | |
| ) | |
| etudiant_data = get_parsed("etudiant_task", {}).get("etudiant_analysis", {}) | |
| latest_end_date = etudiant_data.get("latest_education_end_date") | |
| if latest_end_date: | |
| is_student_by_date = self._is_still_student(latest_end_date) | |
| etudiant_data["is_etudiant"] = is_student_by_date | |
| langues_raw = get_parsed("language_task", {}) | |
| if isinstance(competences, dict): | |
| raw_skills = competences.get("hard_skills", []) | |
| seen = set() | |
| unique_skills = [] | |
| for skill in raw_skills: | |
| key = ( | |
| str(skill).lower() | |
| if not isinstance(skill, str) | |
| else skill.lower() | |
| ) | |
| if key not in seen: | |
| seen.add(key) | |
| unique_skills.append(skill) | |
| competences["hard_skills"] = unique_skills | |
| identity = get_parsed("identity_task", {}) | |
| return { | |
| "candidat": { | |
| "first_name": ( | |
| identity.get("first_name") | |
| if isinstance(identity, dict) | |
| else None | |
| ), | |
| "compΓ©tences": competences, | |
| "expΓ©riences": experiences, | |
| "reconversion": reconversion, | |
| "projets": projets, | |
| "formations": formations, | |
| "etudiant": etudiant_data, | |
| "langues": ( | |
| langues_raw.get("langues", []) | |
| if isinstance(langues_raw, dict) | |
| else [] | |
| ), | |
| } | |
| } | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # AgrΓ©gation des recommandations (Phase 3) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _aggregate_recommendations( | |
| self, | |
| analysis_results: Dict[str, Any], | |
| header_data: Dict, | |
| poste_vise: str, | |
| ) -> Dict[str, Any]: | |
| """Agrège les résultats d'analyse avec des recommandations orientées projets.""" | |
| def get_parsed(key, default=None): | |
| if key not in analysis_results: | |
| return default | |
| return self._parse_json_output(analysis_results[key], default) | |
| metier_data = get_parsed("metier_matching_task", {"postes_recommandes": []}) | |
| quality_data = get_parsed( | |
| "cv_quality_task", | |
| {"score_global": 0, "red_flags": [], "conseils_prioritaires": []}, | |
| ) | |
| project_data = get_parsed("project_analysis_task", {"analyse_projets": []}) | |
| # ββ Conseils d'amΓ©lioration ββββββββββββββββββββββββββββββββββββββββββββ | |
| conseils = [] | |
| # 1. Conseils qualitΓ© CV | |
| if isinstance(quality_data, dict): | |
| conseils.extend(quality_data.get("conseils_prioritaires", [])) | |
| # 2. Projets Γ mettre en avant | |
| if isinstance(project_data, dict): | |
| for item in (project_data.get("ordre_mise_en_avant", []) or [])[:3]: | |
| if isinstance(item, dict) and item.get("raison"): | |
| conseils.append( | |
| f"Projet prioritaire #{item.get('rang', '?')} Γ mettre en avant" | |
| f" - '{item.get('titre', '?')}' : {item['raison']}" | |
| ) | |
| return { | |
| "header_analysis": header_data, | |
| "postes_recommandes": ( | |
| metier_data.get("postes_recommandes", []) | |
| if isinstance(metier_data, dict) | |
| else [] | |
| ), | |
| "analyse_poste_vise": ( | |
| metier_data.get("analyse_poste_vise", "") | |
| if isinstance(metier_data, dict) | |
| else "" | |
| ), | |
| "qualite_cv": quality_data, | |
| "analyse_projets": ( | |
| project_data.get("analyse_projets", []) | |
| if isinstance(project_data, dict) | |
| else [] | |
| ), | |
| "ordre_mise_en_avant_projets": ( | |
| project_data.get("ordre_mise_en_avant", []) | |
| if isinstance(project_data, dict) | |
| else [] | |
| ), | |
| "coherence_globale_projets": ( | |
| project_data.get("coherence_globale", {}) | |
| if isinstance(project_data, dict) | |
| else {} | |
| ), | |
| "conseils_amelioration": conseils, | |
| } | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Utilitaires | |
| # ββββββββββββββββββββββββββββββββββββββββββββββ | |
    def _fallback_extract_poste_vise(
        self, cv_full_text: str, header_section: str
    ) -> str:
        """Programmatic fallback extraction of the targeted job title.

        Scans the CV header, then the start of the full text, line by line,
        filtering out lines that are clearly NOT a job title (emails, phone
        numbers, links, section headings, technical skills) and returning the
        first line that contains a job-title keyword. Returns "" when nothing
        plausible is found.
        """
        import re
        # Patterns that are NOT a job title.
        skip_patterns = [
            r"^#{1,6}\s",  # markdown headings
            r"@",  # email
            r"^\+?\d[\d\s\-\.]{7,}",  # phone number
            r"^http|^www\.|linkedin|github",  # URLs / links
            r"^\*{1,3}[A-Z]",  # bold section headers
            r"^(CONTACT|LIENS|STACK|LANGUES|CENTRES|EXPERIENCE|FORMATION|PROJET|COMPÉTENCES|EDUCATION)",  # section headings
            r"^(Python|SQL|JavaScript|React|FastAPI|Docker|AWS|Git|CI)",  # skills
            r"^(Ile-de-France|Paris|Lyon|Marseille|France)",  # locations
            r"^\d{2}\s?\d{2}\s?\d{2}",  # phone numbers
            r"^(Français|Anglais|Portugais|Espagnol)",  # languages
            r"^(Langages|Frameworks|Analytics|DevOps|Méthodologies|IA &|BI :)",  # skill categories
            r"^(Blockchain|Jeux de rôle|Randonnée)",  # interests
            r"^\s*$",  # empty lines
            r"^[\*\-\|]",  # list items and table separators
        ]
        # Keywords that DO indicate a job title.
        title_indicators = [
            "développeur", "developer", "ingénieur", "engineer", "chef de projet",
            "data analyst", "data scientist", "data engineer", "consultant",
            "architecte", "manager", "lead", "senior", "junior", "fullstack",
            "full-stack", "full stack", "backend", "frontend", "devops",
            "product", "project", "spécialiste", "expert", "analyste",
            "mlops", "ai", "ia", "machine learning", "nlp", "deep learning",
        ]
        def _has_title_indicator(text_lower: str) -> bool:
            # Short indicators ("ai", "ia", "xp"-sized) must match as whole
            # words to avoid false positives inside longer words.
            for indicator in title_indicators:
                if len(indicator) <= 3:
                    if re.search(r"\b" + re.escape(indicator) + r"\b", text_lower):
                        return True
                else:
                    if indicator in text_lower:
                        return True
            return False
        def _is_likely_title(line: str) -> bool:
            # Strip markdown emphasis/heading characters before matching.
            stripped = line.strip().strip("#*_ ")
            # Job titles are short; more than 10 words reads as prose.
            if len(line.split()) > 10:
                return False
            for pattern in skip_patterns:
                if re.match(pattern, stripped, re.IGNORECASE):
                    return False
            return _has_title_indicator(stripped.lower())
        # Search every source, in priority order (header is most reliable).
        sources = [
            ("header", header_section),
            ("cv_text", cv_full_text[:3000]),
        ]
        for source_name, text in sources:
            if not text:
                continue
            lines = text.split("\n")
            for line in lines:
                if _is_likely_title(line):
                    clean = line.strip().strip("#*_ ")
                    logger.info(f"Fallback: found title in {source_name}: '{clean}'")
                    return clean
        return ""
| def _is_still_student(self, date_str: str) -> bool: | |
| """DΓ©termine si le candidat est encore Γ©tudiant Γ partir de la date de fin d'Γ©tudes.""" | |
| if not date_str: | |
| return False | |
| date_str = str(date_str).lower().strip() | |
| ongoing_keywords = [ | |
| "present", "prΓ©sent", "current", "cours", "aujourd'hui", "now" | |
| ] | |
| if any(keyword in date_str for keyword in ongoing_keywords): | |
| return True | |
| try: | |
| now = datetime.now() | |
| end_date = None | |
| if len(date_str) == 10 and date_str[4] == "-" and date_str[7] == "-": | |
| end_date = datetime.strptime(date_str, "%Y-%m-%d") | |
| elif len(date_str) == 7 and date_str[4] == "-": | |
| end_date = datetime.strptime(date_str, "%Y-%m") | |
| elif "/" in date_str: | |
| parts = date_str.split("/") | |
| if len(parts) == 2: | |
| _, y = parts | |
| if len(y) == 4: | |
| end_date = datetime.strptime(date_str, "%m/%Y") | |
| elif len(y) == 2: | |
| end_date = datetime.strptime(date_str, "%m/%y") | |
| elif len(date_str) == 4 and date_str.isdigit(): | |
| end_date = datetime.strptime(date_str, "%Y") | |
| end_date = end_date.replace(month=12, day=31) | |
| if end_date: | |
| return end_date >= now | |
| return False | |
| except (ValueError, IndexError): | |
| logger.warning(f"Date parsing failed for: {date_str}") | |
| return False | |
| def _parse_json_output(self, crew_output, default_structure=None) -> Any: | |
| """Parse la sortie JSON d'un agent CrewAI avec nettoyage robuste.""" | |
| if crew_output is None: | |
| return default_structure if default_structure is not None else {} | |
| raw = crew_output.raw if hasattr(crew_output, "raw") else str(crew_output) | |
| # Extraire le bloc JSON si encapsulΓ© dans des backticks | |
| if "```json" in raw: | |
| raw = raw.split("```json")[1].split("```")[0].strip() | |
| elif "```" in raw: | |
| parts = raw.split("```") | |
| if len(parts) >= 3: | |
| raw = parts[1].strip() | |
| raw = raw.strip().lstrip("\ufeff") | |
| def _try_parse(text: str): | |
| """Tente un parse direct puis un parse avec extraction du premier bloc JSON.""" | |
| try: | |
| return json.loads(text) | |
| except json.JSONDecodeError: | |
| pass | |
| for start_char, end_char in [("{", "}"), ("[", "]")]: | |
| start_idx = text.find(start_char) | |
| end_idx = text.rfind(end_char) | |
| if start_idx != -1 and end_idx > start_idx: | |
| try: | |
| return json.loads(text[start_idx : end_idx + 1]) | |
| except json.JSONDecodeError: | |
| pass | |
| return None | |
| # Tentative 1 : parse du texte tel quel (gère "JSON : {...}" et JSON propre) | |
| result = _try_parse(raw) | |
| if result is not None: | |
| return result | |
| # Tentative 2 : le LLM a copiΓ© les {{ }} du expected_output YAML. | |
| # β οΈ On ne remplace QUE si {{ est dΓ©tectΓ© β Γ©vite de casser un JSON | |
| # compact valide du type {"inner": {"key": "val"}} β {"inner": {"key": "val"} | |
| if "{{" in raw: | |
| cleaned = raw.replace("{{", "{").replace("}}", "}") | |
| result = _try_parse(cleaned) | |
| if result is not None: | |
| return result | |
| logger.error(f"JSON Parse Error (after cleanup): {raw[:200]}") | |
| return default_structure if default_structure is not None else {} | |