Spaces:

QuentinL52
/

interview_agents_api

Sleeping

App Files Files Community

QuentinL52 committed on Jul 16, 2025

Commit

2f3b36f

verified ·

1 Parent(s): 8d38cd3

Delete src/cv_parsing_agents.py

Browse files

Files changed (1) hide show

src/cv_parsing_agents.py +0 -264

src/cv_parsing_agents.py DELETED Viewed

@@ -1,264 +0,0 @@
-"""
-Module pour le parsing de CV optimisé avec CrewAI
-"""
-import os
-import json
-import logging
-logger = logging.getLogger(__name__)
-from src.crew.crew_pool import analyse_cv
-from src.config import load_pdf
-def clean_dict_keys(data):
-    """
-    Recursively convert every dictionary key of a nested structure to ``str``.
-    Args:
-        data: Value to clean (a dict, a list, or anything else).
-    Returns:
-        A rebuilt copy for dicts and lists (descending into their items);
-        any other value is returned unchanged.
-    """
-    # Lists: rebuild element by element.
-    if isinstance(data, list):
-        return [clean_dict_keys(item) for item in data]
-    # Anything that is neither a list nor a dict is a leaf.
-    if not isinstance(data, dict):
-        return data
-    cleaned = {}
-    for raw_key, raw_value in data.items():
-        cleaned[str(raw_key)] = clean_dict_keys(raw_value)
-    return cleaned
-class OptimizedCvParserAgent:
-    """
-    CV parser that turns a PDF resume into a validated, normalised dictionary.
-    The PDF text is read with ``load_pdf``, handed to the CrewAI pipeline
-    ``analyse_cv``, and the JSON text it returns is cleaned, parsed,
-    validated and normalised by this class.
-    NOTE(review): the section-splitting optimisation the class name refers to
-    is presumably implemented inside ``analyse_cv``; nothing in this class
-    splits the text - confirm.
-    """
-    def __init__(self, pdf_path: str):
-        """
-        Store the path of the PDF to parse.
-        Args:
-            pdf_path (str): Path to the PDF file to process.
-        Raises:
-            ValueError: If ``pdf_path`` is empty or is not a string.
-        """
-        if not pdf_path or not isinstance(pdf_path, str):
-            raise ValueError("Le chemin du fichier PDF doit être une chaîne non vide")
-        self.pdf_path = pdf_path
-    def process(self) -> dict:
-        """
-        Extract the content of the PDF as a cleaned, validated dictionary.
-        Returns:
-            dict: CV data whose dictionary keys are all strings.
-        Raises:
-            FileNotFoundError: If the PDF file does not exist.
-            ValueError: If the PDF is empty/unreadable, or the parsed data
-                does not have the expected structure.
-            json.JSONDecodeError: If the crew output is not valid JSON.
-            RuntimeError: If the crew returns no usable output.
-        """
-        logger.info("Début du traitement optimisé du CV : %s", self.pdf_path)
-        if not os.path.exists(self.pdf_path):
-            raise FileNotFoundError(f"Fichier PDF non trouvé: {self.pdf_path}")
-        cv_text_content = load_pdf(self.pdf_path)
-        if not cv_text_content or not cv_text_content.strip():
-            raise ValueError("Le PDF semble vide ou illisible")
-        logger.info("PDF chargé, %d caractères extraits", len(cv_text_content))
-        crew_output = analyse_cv(cv_text_content)
-        # ``raw`` may be missing or None when the crew fails; report both as
-        # "no result" instead of crashing with AttributeError on ``.strip()``.
-        raw_string = getattr(crew_output, "raw", None)
-        if not crew_output or not isinstance(raw_string, str) or not raw_string.strip():
-            raise RuntimeError("L'analyse par le crew n'a pas retourné de résultat.")
-        logger.info("Résultat brut du crew optimisé: %s...", raw_string[:200])
-        json_string_cleaned = self._clean_json_string(raw_string)
-        profile_data = json.loads(json_string_cleaned)
-        logger.info("Parsing JSON optimisé réussi")
-        optimized_data = self._validate_and_enhance_data(profile_data)
-        return clean_dict_keys(optimized_data)
-    def _validate_and_enhance_data(self, profile_data: dict) -> dict:
-        """
-        Validate the parsed CV data and fill in missing sections.
-        The input dictionary is modified in place and also returned.
-        Args:
-            profile_data (dict): Raw data parsed from the crew output.
-        Returns:
-            dict: The same dictionary, with every required section present
-            and the skills / experiences sections normalised.
-        Raises:
-            ValueError: If ``profile_data`` is not a dict, has no
-                ``candidat`` key, or ``candidat`` is not a dict.
-        """
-        if not isinstance(profile_data, dict) or "candidat" not in profile_data:
-            raise ValueError("Structure de données invalide - clé 'candidat' manquante")
-        candidat = profile_data["candidat"]
-        # A non-dict value (e.g. a list or null) would otherwise fail below
-        # with an unrelated TypeError instead of the documented ValueError.
-        if not isinstance(candidat, dict):
-            raise ValueError("Structure de données invalide - 'candidat' doit être un objet")
-        required_sections = (
-            "informations_personnelles", "compétences", "expériences",
-            "projets", "formations", "reconversion",
-        )
-        for section in required_sections:
-            # Missing and empty sections are both replaced by the defaults.
-            if not candidat.get(section):
-                logger.warning("Section manquante ou vide: %s", section)
-                candidat[section] = self._get_default_section_data(section)
-        self._normalize_competences(candidat["compétences"])
-        self._normalize_experiences(candidat["expériences"])
-        logger.info("Validation et enrichissement des données terminés")
-        return profile_data
-    @staticmethod
-    def _clean_skill_list(skills) -> list:
-        """Return the non-blank string entries of ``skills``, stripped."""
-        # A bare string is one skill, not a sequence of characters.
-        if isinstance(skills, str):
-            skills = [skills]
-        elif not isinstance(skills, (list, tuple)):
-            return []
-        # Non-string entries (None, numbers, nested objects) are skipped:
-        # they cannot be stripped and are not valid skill labels.
-        return [item.strip() for item in skills if isinstance(item, str) and item.strip()]
-    def _normalize_competences(self, competences: dict):
-        """
-        Normalise the skills section in place.
-        Guarantees that ``hard_skills`` and ``soft_skills`` exist and are
-        lists of stripped, non-empty strings. Does nothing when
-        ``competences`` is not a dict.
-        """
-        if not isinstance(competences, dict):
-            return
-        for key in ("hard_skills", "soft_skills"):
-            competences[key] = self._clean_skill_list(competences.get(key))
-    def _normalize_experiences(self, experiences: list):
-        """
-        Normalise the experiences section in place.
-        Every dict entry gets all required fields; a missing or empty field
-        becomes "Non spécifié", except ``responsabilités`` which becomes an
-        empty list. Non-dict entries are left untouched.
-        """
-        if not isinstance(experiences, list):
-            return
-        required_fields = ("Poste", "Entreprise", "start_date", "end_date", "responsabilités")
-        for exp in experiences:
-            if not isinstance(exp, dict):
-                continue
-            for field in required_fields:
-                if exp.get(field) in (None, "", []):
-                    exp[field] = [] if field == "responsabilités" else "Non spécifié"
-    def _get_default_section_data(self, section: str):
-        """Return freshly built default data for a missing section ({} if unknown)."""
-        defaults = {
-            "informations_personnelles": {
-                "nom": "Non spécifié",
-                "email": "Non spécifié",
-                "numero_de_telephone": "Non spécifié",
-                "localisation": "Non spécifiée",
-            },
-            "compétences": {
-                "hard_skills": [],
-                "soft_skills": [],
-            },
-            "expériences": [],
-            "projets": {
-                "professional": [],
-                "personal": [],
-            },
-            "formations": [],
-            "reconversion": {
-                "is_reconversion": False,
-                "analysis": "Analyse non disponible",
-            },
-        }
-        return defaults.get(section, {})
-    def _clean_json_string(self, raw_string: str) -> str:
-        """
-        Extract the JSON payload from raw model output.
-        Removes markdown code fences and, when the remaining text still does
-        not start like a JSON document, keeps only the span from the first
-        ``{`` to the last ``}`` so that surrounding prose or a non-``json``
-        fence tag does not break parsing.
-        Args:
-            raw_string (str): Raw text returned by the crew.
-        Returns:
-            str: Best-effort JSON text (not guaranteed to be valid JSON).
-        """
-        cleaned = raw_string.strip()
-        if "```json" in cleaned:
-            # Text between the first ```json marker and the next fence
-            # (or the end of the text when the block is never closed).
-            cleaned = cleaned.split("```json", 1)[1].split("```", 1)[0].strip()
-        elif "```" in cleaned:
-            parts = cleaned.split("```")
-            if len(parts) >= 3:
-                cleaned = parts[1].strip()
-            else:
-                # A single, unclosed fence: drop the stray marker.
-                logger.warning("Format de code block détecté mais mal formé")
-                cleaned = cleaned.replace("```", "").strip()
-        if cleaned[:1] not in ("{", "["):
-            # Leading fence tag ("JSON", "javascript", ...) or prose around
-            # the payload: fall back to the outermost brace-delimited span.
-            start, end = cleaned.find("{"), cleaned.rfind("}")
-            if 0 <= start < end:
-                cleaned = cleaned[start:end + 1]
-        return cleaned
-    def get_processing_stats(self) -> dict:
-        """
-        Return descriptive statistics about the processing approach.
-        The values are fixed constants; they are not measured at runtime.
-        Returns:
-            dict: Static optimisation description.
-        """
-        return {
-            "optimization_enabled": True,
-            "section_based_processing": True,
-            "estimated_token_reduction": "85%",
-            "processing_approach": "Optimized Agent-based with Section Splitting",
-        }
-class CvParserAgent(OptimizedCvParserAgent):
-    """
-    Backward-compatible alias kept under the former class name.
-    Adds nothing of its own: all behaviour is inherited from
-    ``OptimizedCvParserAgent``.
-    """
-if __name__ == "__main__":
-    # Smoke test: build an agent and print its static statistics.
-    # No handler is configured at import time, so without this call the
-    # INFO records below would be dropped (only WARNING and above reach
-    # logging's last-resort handler) and a successful run would print nothing.
-    logging.basicConfig(level=logging.INFO)
-    logger.info("Test du module cv_parsing_agents optimisé")
-    try:
-        # The constructor only validates the path string; the file itself
-        # is not opened, so it does not need to exist.
-        agent = OptimizedCvParserAgent("/tmp/test.pdf")
-        stats = agent.get_processing_stats()
-    except Exception as e:
-        logger.error(f"❌ Erreur création OptimizedCvParserAgent: {e}")
-    else:
-        logger.info("✅ OptimizedCvParserAgent créé avec succès")
-        logger.info(f"✅ Statistiques d'optimisation: {stats}")