Update src/cv_parsing_agents.py

src/cv_parsing_agents.py  CHANGED  +115 -63
@@ -7,23 +7,8 @@ import logging
 
 logger = logging.getLogger(__name__)
 
-try:
-    from src.crew.crew_pool import analyse_cv
-    CREW_POOL_AVAILABLE = True
-    logger.info("✅ crew_pool importé avec succès")
-except ImportError as e:
-    logger.error(f"❌ Erreur import crew_pool: {e}")
-    CREW_POOL_AVAILABLE = False
-    analyse_cv = None
-
-try:
-    from src.config import load_pdf
-    CONFIG_AVAILABLE = True
-    logger.info("✅ config importé avec succès")
-except ImportError as e:
-    logger.error(f"❌ Erreur import config: {e}")
-    CONFIG_AVAILABLE = False
-    load_pdf = None
+from src.crew.crew_pool import analyse_cv
+from src.config import load_pdf
 
 def clean_dict_keys(data):
     """
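Review note: dropping the try/except wrappers makes the two dependencies hard requirements, so a missing crew_pool or config now fails at import time instead of silently switching to the old degraded mode. A minimal caller-side sketch for a consumer that still wants soft failure, assuming the repo root is on sys.path (the PARSER_AVAILABLE flag is illustrative, not part of the module):

    import logging

    logger = logging.getLogger(__name__)

    try:
        from src.cv_parsing_agents import OptimizedCvParserAgent
        PARSER_AVAILABLE = True
    except ImportError as e:
        logger.error(f"cv_parsing_agents unavailable: {e}")
        OptimizedCvParserAgent = None
        PARSER_AVAILABLE = False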
@@ -51,64 +36,76 @@ class OptimizedCvParserAgent:
     """
 
     def __init__(self, pdf_path: str):
+        """
+        Initialise l'agent de parsing de CV optimisé.
+
+        Args:
+            pdf_path (str): Chemin vers le fichier PDF à traiter
+
+        Raises:
+            ValueError: Si le chemin du fichier est invalide
+        """
         if not pdf_path or not isinstance(pdf_path, str):
             raise ValueError("Le chemin du fichier PDF doit être une chaîne non vide")
 
         self.pdf_path = pdf_path
-
-        if not CREW_POOL_AVAILABLE:
-            logger.warning("CrewAI crew_pool non disponible - mode dégradé")
-        if not CONFIG_AVAILABLE:
-            logger.warning("Module config non disponible - mode dégradé")
 
     def process(self) -> dict:
+        """
+        Traite le fichier PDF pour en extraire le contenu sous forme de JSON optimisé.
+
+        Returns:
+            dict: Dictionnaire contenant les données extraites du CV
+
+        Raises:
+            FileNotFoundError: Si le fichier PDF n'existe pas
+            ValueError: Si le PDF est vide ou illisible
+            json.JSONDecodeError: Si le résultat n'est pas un JSON valide
+            Exception: Pour toute autre erreur de traitement
+        """
         logger.info(f"Début du traitement optimisé du CV : {self.pdf_path}")
 
         if not os.path.exists(self.pdf_path):
-            …
-            …
-            …
-        if not …
-            …
-            …
-            …
-            …
-            …
-            if not cv_text_content or not cv_text_content.strip():
-                logger.error("Le PDF semble vide ou illisible")
-                return self._create_fallback_data()
-
-            logger.info(f"PDF chargé, {len(cv_text_content)} caractères extraits")
-
-            crew_output = analyse_cv(cv_text_content)
-
-            if not crew_output or not hasattr(crew_output, 'raw') or not crew_output.raw.strip():
-                logger.error("L'analyse par le crew n'a pas retourné de résultat.")
-                return self._create_fallback_data()
-            raw_string = crew_output.raw
-            logger.info(f"Résultat brut du crew optimisé: {raw_string[:200]}...")
-            json_string_cleaned = self._clean_json_string(raw_string)
-            profile_data = json.loads(json_string_cleaned)
-            logger.info("Parsing JSON optimisé réussi")
-
-            optimized_data = self._validate_and_enhance_data(profile_data)
-
-            return clean_dict_keys(optimized_data)
+            raise FileNotFoundError(f"Fichier PDF non trouvé: {self.pdf_path}")
+
+        cv_text_content = load_pdf(self.pdf_path)
+        if not cv_text_content or not cv_text_content.strip():
+            raise ValueError("Le PDF semble vide ou illisible")
+
+        logger.info(f"PDF chargé, {len(cv_text_content)} caractères extraits")
+
+        crew_output = analyse_cv(cv_text_content)
 
-        …
-        …
-        if 'crew_output' in locals():
-            logger.error(f"Données brutes reçues : {crew_output.raw}")
-        return self._create_fallback_data()
+        if not crew_output or not hasattr(crew_output, 'raw') or not crew_output.raw.strip():
+            raise Exception("L'analyse par le crew n'a pas retourné de résultat.")
 
-        …
-        …
-        …
+        raw_string = crew_output.raw
+        logger.info(f"Résultat brut du crew optimisé: {raw_string[:200]}...")
+
+        json_string_cleaned = self._clean_json_string(raw_string)
+
+        profile_data = json.loads(json_string_cleaned)
+        logger.info("Parsing JSON optimisé réussi")
+
+        optimized_data = self._validate_and_enhance_data(profile_data)
+
+        return clean_dict_keys(optimized_data)
 
     def _validate_and_enhance_data(self, profile_data: dict) -> dict:
+        """
+        Valide et enrichit les données extraites du CV.
+
+        Args:
+            profile_data (dict): Données brutes extraites
+
+        Returns:
+            dict: Données validées et enrichies
+
+        Raises:
+            ValueError: Si la structure de données est invalide
+        """
         if not isinstance(profile_data, dict) or "candidat" not in profile_data:
-            …
-            return self._create_fallback_data()
+            raise ValueError("Structure de données invalide - clé 'candidat' manquante")
 
         candidat = profile_data["candidat"]
 
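Review note: process() now raises instead of returning _create_fallback_data(), so error handling moves to the caller. A sketch of the new contract, covering exactly the exceptions the docstring lists (parse_cv and its log messages are illustrative). Note that json.JSONDecodeError subclasses ValueError, so it has to be caught first:

    import json
    import logging

    from src.cv_parsing_agents import OptimizedCvParserAgent

    logger = logging.getLogger(__name__)

    def parse_cv(pdf_path: str) -> dict | None:
        """Return the parsed profile, or None on any documented failure."""
        try:
            return OptimizedCvParserAgent(pdf_path).process()
        except FileNotFoundError as e:
            logger.error(f"PDF not found: {e}")
        except json.JSONDecodeError as e:
            logger.error(f"Crew returned invalid JSON: {e}")
        except ValueError as e:
            logger.error(f"Invalid input or structure: {e}")
        except Exception as e:
            logger.error(f"Unexpected processing error: {e}")
        return None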
@@ -116,12 +113,15 @@ class OptimizedCvParserAgent:
             "informations_personnelles", "compétences", "expériences",
             "projets", "formations", "reconversion"
         ]
+
         for section in required_sections:
             if section not in candidat or not candidat[section]:
                 logger.warning(f"Section manquante ou vide: {section}")
                 candidat[section] = self._get_default_section_data(section)
+
         self._normalize_competences(candidat.get("compétences", {}))
         self._normalize_experiences(candidat.get("expériences", []))
+
         logger.info("Validation et enrichissement des données terminés")
         return profile_data
 
@@ -129,10 +129,12 @@ class OptimizedCvParserAgent:
         """Normalise la section compétences"""
         if not isinstance(competences, dict):
             return
+
         if "hard_skills" not in competences:
             competences["hard_skills"] = []
         if "soft_skills" not in competences:
             competences["soft_skills"] = []
+
         competences["hard_skills"] = [skill.strip() for skill in competences["hard_skills"] if skill and skill.strip()]
         competences["soft_skills"] = [skill.strip() for skill in competences["soft_skills"] if skill and skill.strip()]
 
@@ -140,10 +142,13 @@ class OptimizedCvParserAgent:
         """Normalise la section expériences"""
         if not isinstance(experiences, list):
             return
+
         required_fields = ["Poste", "Entreprise", "start_date", "end_date", "responsabilités"]
+
         for exp in experiences:
             if not isinstance(exp, dict):
                 continue
+
             for field in required_fields:
                 if field not in exp or exp[field] in [None, "", []]:
                     exp[field] = "Non spécifié" if field != "responsabilités" else []
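Review note: these two hunks only add blank lines; both normalizers mutate their argument in place, which makes their behaviour easy to show in isolation. A short sketch of what the code above implies (import path assumed as before; the constructor only validates that the path is a non-empty string, so no file is needed):

    from src.cv_parsing_agents import OptimizedCvParserAgent

    agent = OptimizedCvParserAgent("/tmp/test.pdf")

    competences = {"hard_skills": [" Python ", "", "SQL "]}
    agent._normalize_competences(competences)
    print(competences)
    # {'hard_skills': ['Python', 'SQL'], 'soft_skills': []}

    experiences = [{"Poste": "Data Analyst", "responsabilités": None}]
    agent._normalize_experiences(experiences)
    print(experiences[0])
    # {'Poste': 'Data Analyst', 'responsabilités': [],
    #  'Entreprise': 'Non spécifié', 'start_date': 'Non spécifié',
    #  'end_date': 'Non spécifié'}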
@@ -175,6 +180,41 @@ class OptimizedCvParserAgent:
         return defaults.get(section, {})
 
     def _clean_json_string(self, raw_string: str) -> str:
+        """
+        Nettoie une chaîne JSON brute en supprimant les blocs de code markdown.
+
+        Args:
+            raw_string (str): Chaîne brute à nettoyer
+
+        Returns:
+            str: Chaîne JSON nettoyée
+        """
+        json_string_cleaned = raw_string.strip()
+
+        if '```' in raw_string:
+            try:
+                if '```json' in raw_string:
+                    json_part = raw_string.split('```json')[1].split('```')[0]
+                    json_string_cleaned = json_part.strip()
+                else:
+                    parts = raw_string.split('```')
+                    if len(parts) >= 3:
+                        json_string_cleaned = parts[1].strip()
+            except IndexError:
+                logger.warning("Format de code block détecté mais mal formé")
+
+        return json_string_cleaned
+
+    def _clean_json_string(self, raw_string: str) -> str:
+        """
+        Nettoie une chaîne JSON brute en supprimant les blocs de code markdown.
+
+        Args:
+            raw_string (str): Chaîne brute à nettoyer
+
+        Returns:
+            str: Chaîne JSON nettoyée
+        """
         json_string_cleaned = raw_string.strip()
 
         if '```' in raw_string:
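Review note: this hunk inserts a complete _clean_json_string implementation and then a second, identically documented def for the same method just above the pre-existing body. Python keeps the later definition, so runtime behaviour is unchanged, but the first copy is dead code that a follow-up commit should drop. The fence-stripping behaviour itself, in a short sketch (same assumed import path):

    from src.cv_parsing_agents import OptimizedCvParserAgent

    agent = OptimizedCvParserAgent("/tmp/test.pdf")

    fenced = '```json\n{"candidat": {}}\n```'
    print(agent._clean_json_string(fenced))
    # {"candidat": {}}

    bare = '   {"candidat": {}}   '
    print(agent._clean_json_string(bare))
    # {"candidat": {}}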
@@ -192,6 +232,12 @@ class OptimizedCvParserAgent:
         return json_string_cleaned
 
     def get_processing_stats(self) -> dict:
+        """
+        Retourne des statistiques sur l'optimisation du traitement.
+
+        Returns:
+            dict: Statistiques d'optimisation
+        """
         return {
             "optimization_enabled": True,
             "section_based_processing": True,
@@ -208,5 +254,11 @@ class CvParserAgent(OptimizedCvParserAgent):
 
 if __name__ == "__main__":
     logger.info("Test du module cv_parsing_agents optimisé")
-    …
-    …
+
+    try:
+        agent = OptimizedCvParserAgent("/tmp/test.pdf")
+        stats = agent.get_processing_stats()
+        logger.info("✅ OptimizedCvParserAgent créé avec succès")
+        logger.info(f"✅ Statistiques d'optimisation: {stats}")
+    except Exception as e:
+        logger.error(f"❌ Erreur création OptimizedCvParserAgent: {e}")
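Review note: the new __main__ block is a constructor-and-stats smoke test; process() is never called, so the hard-coded /tmp/test.pdf does not need to exist. The same checks translate naturally into a hypothetical pytest module (file name is illustrative; the assertions use only the two stats keys visible in this diff):

    # test_cv_parsing_agents.py
    import pytest

    from src.cv_parsing_agents import OptimizedCvParserAgent

    def test_stats_flags():
        agent = OptimizedCvParserAgent("/tmp/test.pdf")  # existence is checked only in process()
        stats = agent.get_processing_stats()
        assert stats["optimization_enabled"] is True
        assert stats["section_based_processing"] is True

    def test_rejects_empty_path():
        with pytest.raises(ValueError):
            OptimizedCvParserAgent("")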