Files changed:
- app.py +107 -79
- inference_chatgpt_simple.py +9 -2
app.py
CHANGED
@@ -13,7 +13,7 @@ import gradio as gr
 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
-from
 import warnings
 import subprocess
 import gc
@@ -126,7 +126,7 @@ def format_prompt(query, title, content):
     if len(content) > 1000:
         content = content[:1000] + "..."

-    return f"""
 Answer only yes / no.
 Document:
 ####DOCUMENT START
@@ -139,9 +139,7 @@ Query:
 {query}
 ####Query END

-ANSWER:
-####ANSWER START
-"""


 def load_model_and_tokenizer(checkpoint_path=None, model_id=None):
@@ -240,7 +238,7 @@ def get_trained_models_list():
         text += f"{i}. **{model['repo']}**\n"
         text += f" - Accuracy: {model['accuracy']:.2%}\n"
         text += f" - Predictions: Yes {model['yes_ratio']:.1%}, No {model['no_ratio']:.1%}\n"
-        text += f" -
         text += f" - Link: https://huggingface.co/{model['repo']}\n\n"

     return text
@@ -312,9 +310,9 @@ def collate_fn(batch):
     }


-def
-    """Convert 4-category labels to
-

     # Map 4 categories to yes/no
     label_mapping = {
@@ -347,28 +345,21 @@ def prepare_dpo_dataset(df):
         original_label = row['label']
         mapped_label = label_mapping.get(original_label, original_label)

-
-
-
-
-
-
-
-
-
-
-
-            'prompt': prompt,
-            'chosen': 'no',
-            'rejected': 'yes',
-            'original_label': original_label # Keep original for analysis
-        })
-
-    return pd.DataFrame(dpo_data)


-def train_model(train_df, val_df, epochs=
-    """
     global current_model, current_tokenizer

     # Clear GPU memory before training
@@ -385,14 +376,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         train_df = train_df.sample(n=max_samples, random_state=42)
         val_df = val_df.head(min(len(val_df), max_samples // 5)) # Proportional validation set

-    # Convert to
-    logger.info("
-
-

     # Create datasets
-    train_dataset = Dataset.from_pandas(
-    val_dataset = Dataset.from_pandas(

     # Prepare model for training
     if hasattr(current_model, 'is_loaded_in_4bit') and current_model.is_loaded_in_4bit:
@@ -428,20 +419,19 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         target_modules=target_modules
     )

-    logger.info(f"Starting
     logger.info(f"Learning rate: {lr}, Effective batch size: {batch_size}, Epochs: {epochs}")

     # Create output directory
     os.makedirs(OUTPUT_DIR, exist_ok=True)

-    #
-
-    target_batch_size = 32 # Target effective batch size

     if current_model_id == "openai/gpt-oss-20b":
         # For GPT-OSS-20B: use smaller per-device batch with gradient accumulation
         actual_batch_size = 2 # Per-device batch size
-        seq_length =
         grad_accum = target_batch_size // actual_batch_size # 16 gradient accumulation steps
     else:
         # For smaller models like Phi-3 - can use larger per-device batch
@@ -449,40 +439,76 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         grad_accum = max(1, target_batch_size // actual_batch_size) # Accumulate if needed
         seq_length = 512

-
         output_dir=OUTPUT_DIR,
         num_train_epochs=epochs,
         per_device_train_batch_size=actual_batch_size,
         per_device_eval_batch_size=actual_batch_size,
         gradient_accumulation_steps=grad_accum,
-        gradient_checkpointing=True,
         learning_rate=lr,
         lr_scheduler_type="cosine",
-        warmup_steps=
         logging_steps=10,
         save_strategy="epoch",
-
-        bf16=True,
         fp16=False,
-
-
         report_to=[],
-
-
-
-
-
     )

-    #
-
         model=current_model,
-        ref_model=None, # Will use the model's initial state as reference
         args=training_args,
-        train_dataset=
-        eval_dataset=
-
-
     )

     # Custom logging callback
@@ -501,11 +527,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N

     def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
         """Compute accuracy metrics and confusion matrix on a subset of eval data"""
         # Sample subset for faster evaluation
-

         # Initialize confusion matrix counters
-        # Rows: true labels, Cols: predicted labels
         confusion_matrix = {
             'easy_positive': {'yes': 0, 'no': 0},
             'hard_positive': {'yes': 0, 'no': 0},
@@ -517,11 +546,10 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         predictions_no = 0
         correct = 0

-        for idx in
-
-
-
-            original_label = item.get('original_label', None) # Get original 4-category label

             # Tokenize and run inference
             inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -546,7 +574,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
             else:
                 predictions_no += 1

-            if prediction ==
                 correct += 1

             # Update confusion matrix if we have original label
@@ -628,23 +656,23 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
             training_status["progress"] = min(int((state.global_step / total_steps) * 100), 99)

     # Add callback with trainer and eval dataset
-    status_callback = StatusCallback(
-

     # Train
     try:
-        logger.info("Starting
-

         # Save final model
         save_path = os.path.join(OUTPUT_DIR, "final")
-
         current_tokenizer.save_pretrained(save_path)
         logger.info(f"Model saved to {save_path}")

         # Compute final metrics
         logger.info("Computing final accuracy metrics...")
-        final_metrics = compute_accuracy_metrics(
         logger.info(f"Final Accuracy: {final_metrics['accuracy']:.2%}")
         logger.info(f"Final Prediction Distribution - Yes: {final_metrics['yes_ratio']:.1%}, No: {final_metrics['no_ratio']:.1%}")

@@ -666,7 +694,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         )

         # Update global model reference
-        current_model =
         current_model.eval()

         # Push to hub if token available
@@ -710,11 +738,11 @@ model-index:

 # {model_short_name} Document Relevance Classifier

-This model was trained using

 ## Training Configuration
 - Base Model: {current_model_id}
--
 - Learning Rate: {training_args.learning_rate}
 - Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}
 - Epochs: {training_args.num_train_epochs}
@@ -747,7 +775,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1
         current_model.push_to_hub(
             repo_name,
             use_auth_token=HF_TOKEN,
-            commit_message=f"
         )
         current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)

@@ -779,7 +807,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1
         "accuracy": final_metrics['accuracy'],
         "yes_ratio": final_metrics['yes_ratio'],
         "no_ratio": final_metrics['no_ratio'],
-        "
         "model_id": current_model_id
     })

@@ -892,7 +920,7 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
         max_samples = 2000 # Start conservative
     else:
         max_samples = None
-    train_model(train_df, test_df, epochs=

     with training_lock:
         training_status["status"] = "completed"
@@ -13,7 +13,7 @@ import gradio as gr
 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
+from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
 import warnings
 import subprocess
 import gc
@@ -126,7 +126,7 @@ def format_prompt(query, title, content):
     if len(content) > 1000:
         content = content[:1000] + "..."

+    return f"""You would get a query and document's title and content and return yes (if the document is relevant to the query) or no (if the document is not relevant to the query).
 Answer only yes / no.
 Document:
 ####DOCUMENT START
@@ -139,9 +139,7 @@ Query:
 {query}
 ####Query END

+ANSWER: """


 def load_model_and_tokenizer(checkpoint_path=None, model_id=None):
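The reworked template adds an explicit task instruction at the top and ends on an open "ANSWER: " prefix (the old ####ANSWER START marker is gone), so the model is expected to complete the string directly with yes or no. A rough sketch of the assembled prompt, keeping only the pieces visible in these hunks; the document/query middle of the template is elided here, and the helper name is illustrative:

def sketch_prompt(query: str, title: str, content: str) -> str:
    # Mirrors only what the diff shows; the real format_prompt in app.py
    # also inserts the title/content and the query section markers.
    if len(content) > 1000:
        content = content[:1000] + "..."
    return (
        "You would get a query and document's title and content and return yes "
        "(if the document is relevant to the query) or no (if the document is not relevant to the query).\n"
        "Answer only yes / no.\n"
        "Document:\n"
        "####DOCUMENT START\n"
        # ... document title/content and the query header are elided in this diff ...
        f"{query}\n"
        "####Query END\n"
        "\n"
        "ANSWER: "
    )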
@@ -240,7 +238,7 @@ def get_trained_models_list():
         text += f"{i}. **{model['repo']}**\n"
         text += f" - Accuracy: {model['accuracy']:.2%}\n"
         text += f" - Predictions: Yes {model['yes_ratio']:.1%}, No {model['no_ratio']:.1%}\n"
+        text += f" - LR: {model.get('lr', 'N/A')}, Model: {model['model_id'].split('/')[-1]}\n"
         text += f" - Link: https://huggingface.co/{model['repo']}\n\n"

     return text
@@ -312,9 +310,9 @@ def collate_fn(batch):
     }


+def prepare_finetuning_dataset(df):
+    """Convert 4-category labels to standard fine-tuning format"""
+    ft_data = []

     # Map 4 categories to yes/no
     label_mapping = {
@@ -347,28 +345,21 @@ def prepare_dpo_dataset(df):
         original_label = row['label']
         mapped_label = label_mapping.get(original_label, original_label)

+        # Create the full text with prompt and answer
+        text = prompt + mapped_label
+
+        ft_data.append({
+            'text': text,
+            'prompt': prompt,
+            'label': mapped_label,
+            'original_label': original_label # Keep original for analysis
+        })
+
+    return pd.DataFrame(ft_data)


+def train_model(train_df, val_df, epochs=5, batch_size=32, lr=5e-6, max_samples=None):
+    """Standard fine-tuning for document relevance classification"""
     global current_model, current_tokenizer

     # Clear GPU memory before training
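prepare_finetuning_dataset replaces the old DPO-style chosen/rejected pairs with plain supervised examples: each row becomes the prompt with the mapped binary answer appended. One output row is expected to look roughly like the dict below; the values are illustrative, the exact prompt text comes from format_prompt, and the negative category name is an assumption (only easy_positive and hard_positive are visible in this diff):

example_row = {
    "text": "<formatted prompt>ANSWER: no",   # prompt + mapped label, what the LM is trained on
    "prompt": "<formatted prompt>ANSWER: ",   # kept separately for generation at evaluation time
    "label": "no",                            # binary label after the 4-to-2 mapping
    "original_label": "easy_negative",        # original 4-way category, kept for analysis
}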
@@ -385,14 +376,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         train_df = train_df.sample(n=max_samples, random_state=42)
         val_df = val_df.head(min(len(val_df), max_samples // 5)) # Proportional validation set

+    # Convert to fine-tuning format
+    logger.info("Preparing fine-tuning dataset...")
+    ft_train_df = prepare_finetuning_dataset(train_df)
+    ft_val_df = prepare_finetuning_dataset(val_df)

     # Create datasets
+    train_dataset = Dataset.from_pandas(ft_train_df)
+    val_dataset = Dataset.from_pandas(ft_val_df)

     # Prepare model for training
     if hasattr(current_model, 'is_loaded_in_4bit') and current_model.is_loaded_in_4bit:
@@ -428,20 +419,19 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         target_modules=target_modules
     )

+    logger.info(f"Starting fine-tuning with {len(train_df)} train samples, {len(val_df)} val samples")
     logger.info(f"Learning rate: {lr}, Effective batch size: {batch_size}, Epochs: {epochs}")

     # Create output directory
     os.makedirs(OUTPUT_DIR, exist_ok=True)

+    # Training configuration optimized for standard fine-tuning
+    target_batch_size = batch_size # Target effective batch size

     if current_model_id == "openai/gpt-oss-20b":
         # For GPT-OSS-20B: use smaller per-device batch with gradient accumulation
         actual_batch_size = 2 # Per-device batch size
+        seq_length = 512 # Standard sequence length
         grad_accum = target_batch_size // actual_batch_size # 16 gradient accumulation steps
     else:
         # For smaller models like Phi-3 - can use larger per-device batch
@@ -449,40 +439,76 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         grad_accum = max(1, target_batch_size // actual_batch_size) # Accumulate if needed
         seq_length = 512

+    # Tokenize the datasets
+    def tokenize_function(examples):
+        # Tokenize the full texts (prompt + answer)
+        model_inputs = current_tokenizer(
+            examples['text'],
+            truncation=True,
+            padding="max_length",
+            max_length=seq_length,
+            return_tensors=None
+        )
+
+        # For causal LM, labels are the same as input_ids
+        model_inputs["labels"] = model_inputs["input_ids"].copy()
+
+        # Store metadata for evaluation
+        model_inputs["original_labels"] = examples['original_label']
+        model_inputs["mapped_labels"] = examples['label']
+
+        return model_inputs
+
+    # Tokenize datasets
+    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
+    tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)
+
+    # Standard training arguments
+    training_args = TrainingArguments(
         output_dir=OUTPUT_DIR,
         num_train_epochs=epochs,
         per_device_train_batch_size=actual_batch_size,
         per_device_eval_batch_size=actual_batch_size,
         gradient_accumulation_steps=grad_accum,
+        gradient_checkpointing=True,
         learning_rate=lr,
         lr_scheduler_type="cosine",
+        warmup_steps=500, # More warmup for standard fine-tuning
         logging_steps=10,
         save_strategy="epoch",
+        evaluation_strategy="epoch",
+        bf16=True,
         fp16=False,
+        weight_decay=0.01,
+        optim="adamw_torch",
+        save_total_limit=3,
+        load_best_model_at_end=True,
+        metric_for_best_model="eval_loss",
+        greater_is_better=False,
         report_to=[],
+        run_name="standard-ft-relevance",
+        dataloader_num_workers=2,
+    )
+
+    # Create data collator
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=current_tokenizer,
+        mlm=False, # Causal LM, not masked LM
+        pad_to_multiple_of=8
     )

+    # Apply LoRA to the model
+    current_model = get_peft_model(current_model, peft_config)
+    current_model.print_trainable_parameters()
+
+    # Create standard trainer
+    trainer = Trainer(
         model=current_model,
         args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_val,
+        data_collator=data_collator,
+        tokenizer=current_tokenizer,
     )

     # Custom logging callback
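Because labels is a straight copy of input_ids, the loss here covers every token of the prompt as well as the final yes/no answer, and DataCollatorForLanguageModeling(mlm=False) essentially just re-pads the batch. If one wanted to supervise only the answer, a common variant (not what this commit does) is to mask the prompt positions with -100, roughly as in this sketch, which reuses the prompt/text columns built above:

def tokenize_answer_only(example, tokenizer, seq_length=512):
    # Variant sketch: ignore the prompt tokens in the loss, keep only the answer.
    prompt_len = len(tokenizer(example["prompt"], truncation=True,
                               max_length=seq_length)["input_ids"])
    enc = tokenizer(example["text"], truncation=True,
                    padding="max_length", max_length=seq_length)
    enc["labels"] = [
        -100 if (i < prompt_len or tok == tokenizer.pad_token_id) else tok
        for i, tok in enumerate(enc["input_ids"])
    ]
    return enc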
@@ -501,11 +527,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N

     def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
         """Compute accuracy metrics and confusion matrix on a subset of eval data"""
+        # Get the original dataframe for easier access to prompts and labels
+        eval_df = ft_val_df
+
         # Sample subset for faster evaluation
+        sample_size = min(num_samples, len(eval_df))
+        sample_df = eval_df.sample(n=sample_size, random_state=42)

         # Initialize confusion matrix counters
         confusion_matrix = {
             'easy_positive': {'yes': 0, 'no': 0},
             'hard_positive': {'yes': 0, 'no': 0},
@@ -517,11 +546,10 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         predictions_no = 0
         correct = 0

+        for idx, row in sample_df.iterrows():
+            prompt = row['prompt']
+            true_label = row['label'] # This is the mapped label (yes/no)
+            original_label = row['original_label'] # Get original 4-category label

             # Tokenize and run inference
             inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -546,7 +574,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
             else:
                 predictions_no += 1

+            if prediction == true_label:
                 correct += 1

             # Update confusion matrix if we have original label
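These hunks show the per-row tokenization and the yes/no bookkeeping, but the generation step in between is outside this diff. It presumably looks something like the following; the generation settings and the parsing rule are assumptions, not taken from app.py:

import torch

with torch.no_grad():
    outputs = current_model.generate(
        **{k: v.to(current_model.device) for k, v in inputs.items()},
        max_new_tokens=3,
        do_sample=False,
        pad_token_id=current_tokenizer.eos_token_id,
    )
# Decode only the newly generated tokens and normalise to a yes/no prediction.
completion = current_tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
).strip().lower()
prediction = "yes" if completion.startswith("yes") else "no"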
@@ -628,23 +656,23 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
             training_status["progress"] = min(int((state.global_step / total_steps) * 100), 99)

     # Add callback with trainer and eval dataset
+    status_callback = StatusCallback(trainer, val_dataset)
+    trainer.add_callback(status_callback)

     # Train
     try:
+        logger.info("Starting fine-tuning...")
+        trainer.train()

         # Save final model
         save_path = os.path.join(OUTPUT_DIR, "final")
+        trainer.save_model(save_path)
         current_tokenizer.save_pretrained(save_path)
         logger.info(f"Model saved to {save_path}")

         # Compute final metrics
         logger.info("Computing final accuracy metrics...")
+        final_metrics = compute_accuracy_metrics(trainer, val_dataset, num_samples=200)
         logger.info(f"Final Accuracy: {final_metrics['accuracy']:.2%}")
         logger.info(f"Final Prediction Distribution - Yes: {final_metrics['yes_ratio']:.1%}, No: {final_metrics['no_ratio']:.1%}")

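StatusCallback itself is defined earlier in app.py and only a fragment of it appears in these hunks; its constructor now receives the trainer and the eval dataset. A minimal sketch consistent with that fragment, where the chosen hook and the step-count handling are assumptions and training_status/training_lock are the module-level globals used elsewhere in the file:

from transformers import TrainerCallback

class StatusCallbackSketch(TrainerCallback):
    def __init__(self, trainer, eval_dataset):
        self.trainer = trainer
        self.eval_dataset = eval_dataset

    def on_step_end(self, args, state, control, **kwargs):
        # Report rough progress; the real callback may also run periodic evaluation.
        total_steps = state.max_steps if state.max_steps > 0 else 1
        with training_lock:
            training_status["progress"] = min(int((state.global_step / total_steps) * 100), 99)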
@@ -666,7 +694,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         )

         # Update global model reference
+        current_model = trainer.model
         current_model.eval()

         # Push to hub if token available
@@ -710,11 +738,11 @@ model-index:

 # {model_short_name} Document Relevance Classifier

+This model was trained using standard fine-tuning for document relevance classification.

 ## Training Configuration
 - Base Model: {current_model_id}
+- Training Type: Standard Fine-tuning
 - Learning Rate: {training_args.learning_rate}
 - Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}
 - Epochs: {training_args.num_train_epochs}
@@ -747,7 +775,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1
         current_model.push_to_hub(
             repo_name,
             use_auth_token=HF_TOKEN,
+            commit_message=f"Standard fine-tuning with lr={training_args.learning_rate}, accuracy={final_metrics['accuracy']:.2%}"
         )
         current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)

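The generated model card (partially visible in the hunk header above) documents how to load the pushed adapter on top of its base model. A rough usage sketch with placeholder identifiers; the base model id and repo name are filled in by the app at push time:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "openai/gpt-oss-20b"        # placeholder: whichever base model was fine-tuned
adapter_id = "HF_USERNAME/repo_name"  # placeholder: the repo this app pushes to

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")
model = PeftModel.from_pretrained(model, adapter_id)
model.eval()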
@@ -779,7 +807,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1
         "accuracy": final_metrics['accuracy'],
         "yes_ratio": final_metrics['yes_ratio'],
         "no_ratio": final_metrics['no_ratio'],
+        "lr": training_args.learning_rate,
         "model_id": current_model_id
     })

@@ -892,7 +920,7 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
         max_samples = 2000 # Start conservative
     else:
         max_samples = None
+    train_model(train_df, test_df, epochs=5, batch_size=32, lr=5e-6, max_samples=max_samples)

     with training_lock:
         training_status["status"] = "completed"
inference_chatgpt_simple.py
CHANGED
@@ -49,7 +49,7 @@ def main():
     df = pd.read_csv(csv_path)
     # Process each row
     prds = [(str(row['query_text']),str(row['title']),str(row['text'])) for idx, row in df.iterrows()]
-    predictions = ThreadPool(

     df['prediction'] = predictions
     conf_matrix = pd.crosstab(
@@ -69,5 +69,12 @@ def main():
     print("\nResults:")
     print(df['prediction'].value_counts())

 if __name__ == "__main__":
-
@@ -49,7 +49,7 @@ def main():
     df = pd.read_csv(csv_path)
     # Process each row
     prds = [(str(row['query_text']),str(row['title']),str(row['text'])) for idx, row in df.iterrows()]
+    predictions = ThreadPool(100).starmap(get_prediction,prds)

     df['prediction'] = predictions
     conf_matrix = pd.crosstab(
@@ -69,5 +69,12 @@ def main():
     print("\nResults:")
     print(df['prediction'].value_counts())

+def make_sample_db():
+    df = pd.read_csv(rf"train_datasets_creation/full_train_dataset.csv")
+    dfs = [df[df['label']==d].sample(100) for d in df['label'].unique()]
+    df = pd.concat(dfs).reset_index()
+    df.to_csv(f"sample_db_{datetime.now().isoformat()}.csv")
+
+
 if __name__ == "__main__":
+    make_sample_db()
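Two small changes here: the predictions now fan out over a ThreadPool of 100 workers, with starmap unpacking each (query_text, title, text) tuple into get_prediction's positional arguments, and the entry point now runs make_sample_db(), which draws 100 rows per label to build a balanced sample CSV. The threading pattern in isolation, with a stand-in predictor (get_prediction itself is not part of this diff):

from multiprocessing.pool import ThreadPool

def fake_prediction(query, title, text):
    # Stand-in for the real API-backed get_prediction; returns "yes"/"no".
    haystack = (title + " " + text).lower()
    return "yes" if any(word in haystack for word in query.lower().split()) else "no"

rows = [
    ("solar panels", "PV basics", "How panels convert sunlight"),
    ("solar panels", "Pasta recipes", "Boil water, add salt"),
]
predictions = ThreadPool(10).starmap(fake_prediction, rows)
print(predictions)  # ['yes', 'no']

A pool of 100 threads keeps up to 100 get_prediction calls in flight at once, which is only worthwhile when each call is I/O-bound, such as a remote API request.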