# πŸ“„ src/core/intent_parser/ml_classifier.py
"""ML intent classifier built on a fine-tuned DistilBERT model.

The module is import-safe: when torch/transformers are not installed or the
model files are missing, the classifier degrades to a stub prediction instead
of raising at import time.
"""

import json
import logging
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

print("Π˜Π½ΠΈΡ†ΠΈΠ°Π»ΠΈΠ·Π°Ρ†ΠΈΡ ML классификатора...")

# Optional ML dependencies. Everything torch-related lives inside this guard
# (including quantize_dynamic, which previously was imported unconditionally
# and crashed the module when torch was absent).
try:
    import torch
    import torch.nn.functional as F
    from torch.quantization import quantize_dynamic
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    ML_AVAILABLE = True
except ImportError as e:
    print(f"⚠️ ML Π±ΠΈΠ±Π»ΠΈΠΎΡ‚Π΅ΠΊΠΈ Π½Π΅ установлСны: {e}")
    ML_AVAILABLE = False
    torch = None
    F = None
    quantize_dynamic = None
    AutoTokenizer = None
    AutoModelForSequenceClassification = None


@dataclass
class MLClassificationResult:
    """Result of a single classification by the ML model."""
    intent: str  # best intent label, or "unknown"
    confidence: float  # confidence of the best intent
    all_predictions: List[tuple]  # every (intent, confidence) pair, sorted desc
    multi_label_predictions: Optional[List[tuple]] = None  # pairs above the threshold


class MLIntentClassifier:
    """
    ML intent classifier based on DistilBERT.
    Supports multi-label classification, matching how the model was trained.
    """

    def __init__(self, model_path: Optional[str] = None):
        """Load the model from *model_path* (auto-detected from the project
        layout when None). Never raises: on any failure the instance stays
        uninitialized and predict() falls back to a stub result."""
        self.logger = logging.getLogger(__name__)
        self.model = None
        self.tokenizer = None
        self.device = None
        self.is_initialized = False

        # Intent vocabulary (label <-> class index).
        self.intent_to_idx: Dict[str, int] = {}
        self.idx_to_intent: Dict[int, str] = {}

        # Settings.
        self.confidence_threshold = 0.3
        self.max_length = 128

        if model_path is None:
            # src/core/intent_parser/ml_classifier.py -> project root (4 levels up).
            # NOTE: a hard-coded "C:/PycharmProjects/Ariel" override was removed
            # here; the path is now always derived from the file location.
            base_dir = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
            )
            model_path = os.path.join(base_dir, "Data", "models", "intent_classifier")

        self.model_path = model_path
        self._initialize_model()

    def _initialize_model(self):
        """Model initialization with full error handling (never raises)."""
        if not ML_AVAILABLE:
            self.logger.warning("ML Π±ΠΈΠ±Π»ΠΈΠΎΡ‚Π΅ΠΊΠΈ Π½Π΅ установлСны. ИспользованиС Π·Π°Π³Π»ΡƒΡˆΠΊΠΈ.")
            return

        try:
            # The model directory must exist.
            if not os.path.exists(self.model_path):
                self.logger.error(f"НС Π½Π°ΠΉΠ΄Π΅Π½ Ρ„Π°ΠΉΠ»: {self.model_path}")
                self.logger.info("ΠŸΡ€ΠΎΠ²Π΅Ρ€ΡŒΡ‚Π΅, Ρ‡Ρ‚ΠΎ Π²Ρ‹ распаковали Π°Ρ€Ρ…ΠΈΠ² Π² ΠΏΡ€Π°Π²ΠΈΠ»ΡŒΠ½ΡƒΡŽ ΠΏΠ°ΠΏΠΊΡƒ")
                return

            # Check mandatory files; weights may come in either format.
            required_files = ['config.json']
            weight_files = ['model.safetensors', 'pytorch_model.bin']

            for file in required_files:
                if not os.path.exists(os.path.join(self.model_path, file)):
                    self.logger.error(f"НС Π½Π°ΠΉΠ΄Π΅Π½ Ρ„Π°ΠΉΠ»: {os.path.join(self.model_path, file)}")
                    return

            has_weights = any(
                os.path.exists(os.path.join(self.model_path, wf)) for wf in weight_files
            )
            if not has_weights:
                self.logger.error(f"НС Π½Π°ΠΉΠ΄Π΅Π½ Ρ„Π°ΠΉΠ» вСсов ΠΌΠΎΠ΄Π΅Π»ΠΈ. ΠžΠΆΠΈΠ΄Π°Π΅Ρ‚ΡΡ ΠΎΠ΄ΠΈΠ½ ΠΈΠ·: {weight_files}")
                self.logger.info(f"Π€Π°ΠΉΠ»Ρ‹ Π² Π΄ΠΈΡ€Π΅ΠΊΡ‚ΠΎΡ€ΠΈΠΈ: {os.listdir(self.model_path)}")
                return

            # Load the intent vocabulary (label -> class index).
            vocab_path = os.path.join(self.model_path, "intent_vocab.json")
            if os.path.exists(vocab_path):
                with open(vocab_path, 'r', encoding='utf-8') as f:
                    self.intent_to_idx = json.load(f)
                # Indices may be serialized as strings; normalize to int.
                self.intent_to_idx = {k: int(v) for k, v in self.intent_to_idx.items()}
                self.idx_to_intent = {v: k for k, v in self.intent_to_idx.items()}
                self.logger.info(f"Π—Π°Π³Ρ€ΡƒΠΆΠ΅Π½ ΡΠ»ΠΎΠ²Π°Ρ€ΡŒ ΠΈΠ½Ρ‚Π΅Π½Ρ‚ΠΎΠ²: {len(self.intent_to_idx)} классов")
            else:
                self.logger.warning("Π€Π°ΠΉΠ» intent_vocab.json Π½Π΅ Π½Π°ΠΉΠ΄Π΅Π½. ΠŸΡ‹Ρ‚Π°ΡŽΡΡŒ ΠΎΠΏΡ€Π΅Π΄Π΅Π»ΠΈΡ‚ΡŒ ΠΈΠ· config.json")
                # TODO(review): fall back to id2label from the model config.

            # Load tokenizer and model (local files only — no network).
            self.logger.info(f"Π—Π°Π³Ρ€ΡƒΠ·ΠΊΠ° ΠΌΠΎΠ΄Π΅Π»ΠΈ ΠΈΠ· {self.model_path}...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, local_files_only=True)
            self.model = AutoModelForSequenceClassification.from_pretrained(
                self.model_path,
                local_files_only=True,
                problem_type="multi_label_classification"
            )

            # Dynamic int8 quantization of the Linear layers for faster CPU inference.
            self.model = quantize_dynamic(self.model, {torch.nn.Linear}, dtype=torch.qint8)

            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.model.to(self.device)
            self.model.eval()

            self.is_initialized = True
            self.logger.info(f"βœ… МодСль Π·Π°Π³Ρ€ΡƒΠΆΠ΅Π½Π° ΡƒΡΠΏΠ΅ΡˆΠ½ΠΎ!")
            self.logger.info(f" Устройство: {self.device}")
            self.logger.info(f" Классов: {len(self.intent_to_idx) if self.intent_to_idx else 'нСизвСстно'}")

        except Exception as e:
            # Boundary handler: any load failure downgrades to the stub path.
            self.logger.error(f"❌ Ошибка Π·Π°Π³Ρ€ΡƒΠ·ΠΊΠΈ ΠΌΠΎΠ΄Π΅Π»ΠΈ: {e}")
            self.is_initialized = False

    def predict(self, text: str, threshold: Optional[float] = None) -> MLClassificationResult:
        """Predict intents for *text* (multi-label).

        Args:
            text: input utterance.
            threshold: per-call confidence cutoff; defaults to
                self.confidence_threshold.

        Returns:
            MLClassificationResult; a stub result if the model is unavailable
            or inference fails.
        """
        if not self.is_initialized:
            self.logger.warning("МодСль Π½Π΅ ΠΈΠ½ΠΈΡ†ΠΈΠ°Π»ΠΈΠ·ΠΈΡ€ΠΎΠ²Π°Π½Π°, Π²ΠΎΠ·Π²Ρ€Π°Ρ‰Π°Π΅ΠΌ fallback")
            return self._fallback_prediction(text)

        try:
            current_threshold = threshold if threshold is not None else self.confidence_threshold

            inputs = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors="pt"
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)
                # Multi-label: independent sigmoid per class, not softmax.
                probabilities = torch.sigmoid(outputs.logits)

            probs = probabilities.cpu().numpy()[0]

            all_predictions = []
            multi_label_predictions = []
            for idx, prob in enumerate(probs):
                # Skip class indices with no vocabulary entry.
                if idx in self.idx_to_intent:
                    intent_name = self.idx_to_intent[idx]
                    confidence = float(prob)
                    all_predictions.append((intent_name, confidence))
                    if confidence >= current_threshold:
                        multi_label_predictions.append((intent_name, confidence))

            all_predictions.sort(key=lambda x: x[1], reverse=True)
            multi_label_predictions.sort(key=lambda x: x[1], reverse=True)

            # Main intent: best above-threshold label, else best overall.
            main_intent = "unknown"
            main_confidence = 0.0
            if multi_label_predictions:
                main_intent, main_confidence = multi_label_predictions[0]
            elif all_predictions:
                main_intent, main_confidence = all_predictions[0]

            return MLClassificationResult(
                intent=main_intent,
                confidence=main_confidence,
                all_predictions=all_predictions,
                multi_label_predictions=multi_label_predictions
            )

        except Exception as e:
            self.logger.error(f"Ошибка прСдсказания: {e}")
            return self._fallback_prediction(text)

    def _fallback_prediction(self, text: str) -> MLClassificationResult:
        """Stub result used when the model is unavailable or inference failed."""
        return MLClassificationResult(
            intent="unknown",
            confidence=0.5,
            all_predictions=[("unknown", 1.0)],
            multi_label_predictions=[]
        )

    def get_model_info(self) -> Dict[str, Any]:
        """Diagnostic snapshot of the classifier state."""
        return {
            "is_initialized": self.is_initialized,
            "model_path": self.model_path,
            "num_intents": len(self.intent_to_idx),
            "intents": list(self.intent_to_idx.keys()) if self.intent_to_idx else [],
            "confidence_threshold": self.confidence_threshold,
            "device": str(self.device) if self.device else None
        }


def create_ml_classifier(model_path: Optional[str] = None) -> MLIntentClassifier:
    """Factory function for creating a classifier."""
    return MLIntentClassifier(model_path)


# Interactive demo. Previously this ran unconditionally at import time,
# including an infinite input() loop that hung every importer; it is now
# guarded so it only runs when the module is executed as a script.
if __name__ == "__main__":
    start = time.time()
    print("Π—Π°Π³Ρ€ΡƒΠ·ΠΊΠ° ΠΌΠΎΠ΄Π΅Π»ΠΈ, ΠΎΠΆΠΈΠ΄Π°ΠΉΡ‚Π΅...")
    classifier = create_ml_classifier("/Data/Models/intent_classifier")
    print("βœ… МодСль Π·Π°Π³Ρ€ΡƒΠΆΠ΅Π½Π°! ВСстируйтС:")

    while True:
        text = input("\nΠ’Π²Π΅Π΄ΠΈΡ‚Π΅ тСкст: ")
        if text.lower() == 'Π²Ρ‹Ρ…ΠΎΠ΄':
            break
        result = classifier.predict(text)
        print(f"Π Π΅Π·ΡƒΠ»ΡŒΡ‚Π°Ρ‚: {result.intent} ({result.confidence:.1%})")
        for intent, conf in result.all_predictions[:3]:
            print(f" - {intent}: {conf:.1%}")