Spaces:
Running
Running
Refactor FineTuner class: remove the redundant comment above the model directory creation (the os.makedirs call itself is kept), delete the misplaced finetune_from_annotations function, and tidy docstring/comment whitespace for readability.
Browse files - src/training/fine_tuner.py +3 -73
src/training/fine_tuner.py
CHANGED
|
@@ -47,7 +47,6 @@ class FineTuner:
|
|
| 47 |
self.model = None
|
| 48 |
self.chat_analyzer = ChatAnalyzer()
|
| 49 |
|
| 50 |
-
# Создаём директорию для сохранения моделей в датасете
|
| 51 |
os.makedirs(self.output_dir, exist_ok=True)
|
| 52 |
|
| 53 |
def prepare_training_data(self, output_file: Optional[str] = None) -> str:
|
|
@@ -204,76 +203,7 @@ class FineTuner:
|
|
| 204 |
)
|
| 205 |
|
| 206 |
return tokenized_dataset
|
| 207 |
-
|
| 208 |
-
# Добавить этот метод в класс fine_tuner.py или в функции модуля:
|
| 209 |
-
|
| 210 |
-
def finetune_from_annotations(epochs=3, batch_size=4, learning_rate=2e-4, min_rating=4):
|
| 211 |
-
"""
|
| 212 |
-
Fine-tune model using annotated QA pairs
|
| 213 |
-
|
| 214 |
-
Args:
|
| 215 |
-
epochs: Number of training epochs
|
| 216 |
-
batch_size: Batch size for training
|
| 217 |
-
learning_rate: Learning rate
|
| 218 |
-
min_rating: Minimum average rating for including examples
|
| 219 |
-
|
| 220 |
-
Returns:
|
| 221 |
-
(success, message)
|
| 222 |
-
"""
|
| 223 |
-
try:
|
| 224 |
-
import tempfile
|
| 225 |
-
import os
|
| 226 |
-
from src.analytics.chat_evaluator import ChatEvaluator
|
| 227 |
-
from config.settings import HF_TOKEN, DATASET_ID, CHAT_HISTORY_PATH
|
| 228 |
-
|
| 229 |
-
# Create evaluator
|
| 230 |
-
evaluator = ChatEvaluator(
|
| 231 |
-
hf_token=HF_TOKEN,
|
| 232 |
-
dataset_id=DATASET_ID,
|
| 233 |
-
chat_history_path=CHAT_HISTORY_PATH
|
| 234 |
-
)
|
| 235 |
-
|
| 236 |
-
# Create temporary file for training data
|
| 237 |
-
with tempfile.NamedTemporaryFile(mode='w+', suffix='.jsonl', delete=False) as temp_file:
|
| 238 |
-
temp_path = temp_file.name
|
| 239 |
-
|
| 240 |
-
# Export high-quality examples
|
| 241 |
-
success, message = evaluator.export_training_data(temp_path, min_rating)
|
| 242 |
-
|
| 243 |
-
if not success:
|
| 244 |
-
return False, f"Failed to export training data: {message}"
|
| 245 |
|
| 246 |
-
# Count examples
|
| 247 |
-
with open(temp_path, 'r') as f:
|
| 248 |
-
example_count = sum(1 for _ in f)
|
| 249 |
-
|
| 250 |
-
if example_count == 0:
|
| 251 |
-
return False, "No high-quality examples found for fine-tuning"
|
| 252 |
-
|
| 253 |
-
# Run actual fine-tuning using the export file
|
| 254 |
-
from src.training.fine_tuner import finetune_from_file
|
| 255 |
-
|
| 256 |
-
success, message = finetune_from_file(
|
| 257 |
-
training_file=temp_path,
|
| 258 |
-
epochs=epochs,
|
| 259 |
-
batch_size=batch_size,
|
| 260 |
-
learning_rate=learning_rate
|
| 261 |
-
)
|
| 262 |
-
|
| 263 |
-
# Clean up temporary file
|
| 264 |
-
try:
|
| 265 |
-
os.unlink(temp_path)
|
| 266 |
-
except:
|
| 267 |
-
pass
|
| 268 |
-
|
| 269 |
-
if success:
|
| 270 |
-
return True, f"Successfully fine-tuned model with {example_count} annotated examples: {message}"
|
| 271 |
-
else:
|
| 272 |
-
return False, f"Fine-tuning failed: {message}"
|
| 273 |
-
|
| 274 |
-
except Exception as e:
|
| 275 |
-
return False, f"Error during fine-tuning from annotations: {str(e)}"
|
| 276 |
-
|
| 277 |
def train(
|
| 278 |
self,
|
| 279 |
training_data_path: str,
|
|
@@ -286,7 +216,7 @@ def finetune_from_annotations(epochs=3, batch_size=4, learning_rate=2e-4, min_ra
|
|
| 286 |
) -> Tuple[bool, str]:
|
| 287 |
"""
|
| 288 |
Train the model using provided data
|
| 289 |
-
|
| 290 |
Args:
|
| 291 |
training_data_path: Path to training data file
|
| 292 |
num_train_epochs: Number of training epochs
|
|
@@ -295,7 +225,7 @@ def finetune_from_annotations(epochs=3, batch_size=4, learning_rate=2e-4, min_ra
|
|
| 295 |
learning_rate: Learning rate
|
| 296 |
logging_steps: Number of steps between logging
|
| 297 |
save_strategy: When to save checkpoints
|
| 298 |
-
|
| 299 |
Returns:
|
| 300 |
(success, message)
|
| 301 |
"""
|
|
@@ -404,7 +334,7 @@ def finetune_from_chat_history(epochs: int = 3,
|
|
| 404 |
(success, message)
|
| 405 |
"""
|
| 406 |
try:
|
| 407 |
-
# Create evaluator instance
|
| 408 |
evaluator = ChatEvaluator(
|
| 409 |
hf_token=HF_TOKEN,
|
| 410 |
dataset_id=DATASET_ID
|
|
|
|
| 47 |
self.model = None
|
| 48 |
self.chat_analyzer = ChatAnalyzer()
|
| 49 |
|
|
|
|
| 50 |
os.makedirs(self.output_dir, exist_ok=True)
|
| 51 |
|
| 52 |
def prepare_training_data(self, output_file: Optional[str] = None) -> str:
|
|
|
|
| 203 |
)
|
| 204 |
|
| 205 |
return tokenized_dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
def train(
|
| 208 |
self,
|
| 209 |
training_data_path: str,
|
|
|
|
| 216 |
) -> Tuple[bool, str]:
|
| 217 |
"""
|
| 218 |
Train the model using provided data
|
| 219 |
+
|
| 220 |
Args:
|
| 221 |
training_data_path: Path to training data file
|
| 222 |
num_train_epochs: Number of training epochs
|
|
|
|
| 225 |
learning_rate: Learning rate
|
| 226 |
logging_steps: Number of steps between logging
|
| 227 |
save_strategy: When to save checkpoints
|
| 228 |
+
|
| 229 |
Returns:
|
| 230 |
(success, message)
|
| 231 |
"""
|
|
|
|
| 334 |
(success, message)
|
| 335 |
"""
|
| 336 |
try:
|
| 337 |
+
# Create evaluator instance
|
| 338 |
evaluator = ChatEvaluator(
|
| 339 |
hf_token=HF_TOKEN,
|
| 340 |
dataset_id=DATASET_ID
|