import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
from typing import Dict, List, Tuple
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import evaluate
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
class CounselorBenchmark:
def __init__(self, base_model_path: str, finetuned_model_path: str):
"""
Initialize benchmark suite for counselor models
"""
self.base_model_path = base_model_path
self.finetuned_model_path = finetuned_model_path
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load evaluation metrics
self.bleu = evaluate.load("sacrebleu")
self.rouge = evaluate.load("rouge")
self.bertscore = evaluate.load("bertscore")
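        # Reference-based metric loaders (sacrebleu / rouge / bertscore). The
        # benchmark below relies on heuristic quality scores, but these are kept
        # so reference-based comparisons can be bolted on later.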
def load_models(self):
"""Load both base and fine-tuned models for comparison"""
# Load base model
print("Loading base model...")
self.base_tokenizer = AutoTokenizer.from_pretrained(self.base_model_path)
self.base_model = AutoModelForCausalLM.from_pretrained(
self.base_model_path,
torch_dtype=torch.bfloat16,
device_map="auto"
)
# Load fine-tuned model
print("Loading fine-tuned model...")
self.ft_tokenizer = AutoTokenizer.from_pretrained(self.finetuned_model_path)
self.ft_model = AutoModelForCausalLM.from_pretrained(
self.finetuned_model_path,
torch_dtype=torch.bfloat16,
device_map="auto"
)
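        # Assumption: the LFM2 tokenizers may not define a pad token. Fall back to
        # EOS (as the earlier, commented-out version of this script does) so that
        # padding and generation do not emit warnings.
        for tok in (self.base_tokenizer, self.ft_tokenizer):
            if tok.pad_token is None:
                tok.pad_token = tok.eos_token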
def generate_response(self, model, tokenizer, prompt: str, max_length: int = 256):
"""Generate response from model"""
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_length,
temperature=0.7,
do_sample=True,
top_p=0.9,
repetition_penalty=1.1
)
        # Decode only the newly generated tokens; slicing the decoded string by
        # len(prompt) is unreliable because decoding can normalize whitespace.
        generated = outputs[0][inputs['input_ids'].shape[1]:]
        response = tokenizer.decode(generated, skip_special_tokens=True).strip()
        return response
def evaluate_empathy_score(self, response: str) -> float:
"""
Evaluate empathy in counselor response
Custom metric based on Japanese counseling keywords
"""
empathy_keywords = [
            'わかります', '理解', '共感', '気持ち', '思い',
            'つらい', '大変', 'お察し', '心配', '支援'
]
score = sum(1 for keyword in empathy_keywords if keyword in response)
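        # e.g. a response containing 'わかります', '気持ち' and '大変' scores 3 / 10 = 0.3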
return min(score / len(empathy_keywords), 1.0)
def evaluate_response_quality(self, response: str) -> Dict[str, float]:
"""
Comprehensive response quality evaluation
"""
metrics = {}
# Length appropriateness (not too short, not too long)
response_length = len(response)
if 50 <= response_length <= 300:
metrics['length_score'] = 1.0
elif response_length < 50:
metrics['length_score'] = response_length / 50
else:
metrics['length_score'] = max(0, 1 - (response_length - 300) / 500)
# Question engagement (does counselor ask clarifying questions?)
        metrics['question_score'] = 1.0 if '？' in response or 'か？' in response else 0.0
# Supportive language
        support_phrases = ['大丈夫', '一緒に', '支援', 'サポート', '助け']
metrics['support_score'] = sum(1 for phrase in support_phrases if phrase in response) / len(support_phrases)
# Empathy score
metrics['empathy_score'] = self.evaluate_empathy_score(response)
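        # All four scores are bounded to [0, 1], so the unweighted means reported
        # later are directly comparable across metrics.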
return metrics
def benchmark_on_test_set(self, test_data_path: str, num_samples: int = 100):
"""
Run comprehensive benchmark on test set
"""
# Load test data
test_dataset = load_dataset('json', data_files=test_data_path, split='train')
test_samples = test_dataset.select(range(min(num_samples, len(test_dataset))))
results = {
'base_model': {'responses': [], 'metrics': []},
'finetuned_model': {'responses': [], 'metrics': []}
}
print(f"Evaluating on {len(test_samples)} test samples...")
for sample in tqdm(test_samples):
prompt = sample['text'].split('### Response:')[0] + '### Response:'
reference = sample['text'].split('### Response:')[1].strip() if '### Response:' in sample['text'] else ""
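            # The reference response is extracted so that reference-based metrics
            # (the loaders created in __init__) could be wired in; the heuristic
            # scores computed below do not use it.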
# Generate responses
base_response = self.generate_response(self.base_model, self.base_tokenizer, prompt)
ft_response = self.generate_response(self.ft_model, self.ft_tokenizer, prompt)
# Store responses
results['base_model']['responses'].append(base_response)
results['finetuned_model']['responses'].append(ft_response)
# Evaluate quality
base_metrics = self.evaluate_response_quality(base_response)
ft_metrics = self.evaluate_response_quality(ft_response)
results['base_model']['metrics'].append(base_metrics)
results['finetuned_model']['metrics'].append(ft_metrics)
return results
def calculate_aggregate_metrics(self, results: Dict) -> Dict:
"""Calculate aggregate metrics for comparison"""
aggregate = {}
for model_name in ['base_model', 'finetuned_model']:
model_metrics = results[model_name]['metrics']
aggregate[model_name] = {}
# Calculate average for each metric
metric_names = model_metrics[0].keys() if model_metrics else []
for metric in metric_names:
values = [m[metric] for m in model_metrics]
aggregate[model_name][metric] = {
'mean': np.mean(values),
'std': np.std(values),
'min': np.min(values),
'max': np.max(values)
}
return aggregate
def generate_comparison_report(self, results: Dict, aggregate: Dict):
"""Generate detailed comparison report"""
report = []
report.append("=" * 80)
report.append("COUNSELOR MODEL BENCHMARK REPORT")
report.append("=" * 80)
report.append("")
# Overall performance comparison
report.append("PERFORMANCE COMPARISON:")
report.append("-" * 40)
for metric in aggregate['base_model'].keys():
base_score = aggregate['base_model'][metric]['mean']
ft_score = aggregate['finetuned_model'][metric]['mean']
improvement = ((ft_score - base_score) / base_score * 100) if base_score > 0 else 0
report.append(f"\n{metric.upper()}:")
report.append(f" Base Model: {base_score:.3f} (ยฑ{aggregate['base_model'][metric]['std']:.3f})")
report.append(f" Fine-tuned Model: {ft_score:.3f} (ยฑ{aggregate['finetuned_model'][metric]['std']:.3f})")
report.append(f" Improvement: {improvement:+.1f}%")
# Calculate overall score
base_overall = np.mean([aggregate['base_model'][m]['mean'] for m in aggregate['base_model']])
ft_overall = np.mean([aggregate['finetuned_model'][m]['mean'] for m in aggregate['finetuned_model']])
overall_improvement = ((ft_overall - base_overall) / base_overall * 100) if base_overall > 0 else 0
report.append("\n" + "=" * 40)
report.append("OVERALL PERFORMANCE:")
report.append(f" Base Model: {base_overall:.3f}")
report.append(f" Fine-tuned Model: {ft_overall:.3f}")
report.append(f" Overall Improvement: {overall_improvement:+.1f}%")
report.append("=" * 40)
return "\n".join(report)
def visualize_results(self, aggregate: Dict):
"""Create visualization of benchmark results"""
# Prepare data for plotting
metrics = list(aggregate['base_model'].keys())
base_scores = [aggregate['base_model'][m]['mean'] for m in metrics]
ft_scores = [aggregate['finetuned_model'][m]['mean'] for m in metrics]
# Create comparison plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Bar plot comparison
x = np.arange(len(metrics))
width = 0.35
ax1.bar(x - width/2, base_scores, width, label='Base Model', color='lightblue')
ax1.bar(x + width/2, ft_scores, width, label='Fine-tuned Model', color='darkblue')
ax1.set_xlabel('Metrics')
ax1.set_ylabel('Score')
ax1.set_title('Model Performance Comparison')
ax1.set_xticks(x)
ax1.set_xticklabels(metrics, rotation=45, ha='right')
ax1.legend()
ax1.grid(True, alpha=0.3)
# Improvement percentage plot
improvements = [((ft - base) / base * 100) if base > 0 else 0
for base, ft in zip(base_scores, ft_scores)]
colors = ['green' if imp > 0 else 'red' for imp in improvements]
ax2.bar(metrics, improvements, color=colors, alpha=0.7)
ax2.set_xlabel('Metrics')
ax2.set_ylabel('Improvement (%)')
ax2.set_title('Fine-tuning Improvement over Base Model')
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        ax2.set_xticks(np.arange(len(metrics)))
        ax2.set_xticklabels(metrics, rotation=45, ha='right')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('benchmark_results.png', dpi=300, bbox_inches='tight')
plt.show()
print("Visualization saved as 'benchmark_results.png'")
# Run benchmarking
if __name__ == "__main__":
# Initialize benchmark
benchmark = CounselorBenchmark(
base_model_path="./models/LFM2-2.6B",
finetuned_model_path="./merged_counselor_mode_2b"
)
# Load models
benchmark.load_models()
# Run benchmark
print("Running benchmark evaluation...")
results = benchmark.benchmark_on_test_set("./processed_data_score80/test.jsonl", num_samples=100)
# Calculate aggregate metrics
aggregate = benchmark.calculate_aggregate_metrics(results)
# Generate report
report = benchmark.generate_comparison_report(results, aggregate)
print(report)
# Save report
with open("benchmark_report_2b.txt", "w") as f:
f.write(report)
# Visualize results
benchmark.visualize_results(aggregate)
print("\nBenchmarking completed! Check 'benchmark_report.txt' for detailed results.")
####################
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel, PeftConfig
# import numpy as np
# from typing import List, Dict, Tuple, Optional
# import json
# from tqdm import tqdm
# import os
# import gc
# import warnings
# from datetime import datetime
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
# from rouge_score import rouge_scorer
# import nltk
# from collections import defaultdict
# # Download required NLTK data
# try:
# nltk.download('punkt', quiet=True)
# except:
# pass
# warnings.filterwarnings('ignore')
# class AdvancedCounselorBenchmark:
# def __init__(self,
# base_model_name: str = "LiquidAI/LFM2-1.2B",
# finetuned_model_path: str = "./counselor_model/best_model",
# merged_model_path: str = "./merged_counselor_model",
# test_data_path: str = "./processed_data_score70/test.jsonl",
# device: str = None):
# """
# Initialize advanced benchmark suite with BLEU and ROUGE metrics
# Args:
# base_model_name: Name/path of base model
# finetuned_model_path: Path to fine-tuned LoRA adapter
# merged_model_path: Path to save/load merged model
# test_data_path: Path to test dataset with reference responses
# device: Device to run on (cuda/cpu)
# """
# self.base_model_name = base_model_name
# self.finetuned_model_path = finetuned_model_path
# self.merged_model_path = merged_model_path
# self.test_data_path = test_data_path
# self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
# print(f"๐ง Initializing Advanced Benchmark Suite")
# print(f" Device: {self.device}")
# if self.device == "cuda":
# print(f" GPU: {torch.cuda.get_device_name(0)}")
# print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
# # Initialize ROUGE scorer
# self.rouge_scorer = rouge_scorer.RougeScorer(
# ['rouge1', 'rouge2', 'rougeL'],
# use_stemmer=False # Stemming applies to English only; RougeScorer has no language argument
# )
# # Smoothing function for BLEU scores
# self.smoothing = SmoothingFunction().method1
# self.results = {}
# def load_test_data(self) -> List[Dict]:
# """Load test dataset with reference responses"""
# print(f"\n๐ Loading test data from {self.test_data_path}")
# test_data = []
# if os.path.exists(self.test_data_path):
# with open(self.test_data_path, 'r', encoding='utf-8') as f:
# for line in f:
# data = json.loads(line)
# test_data.append(data)
# print(f" Loaded {len(test_data)} test examples")
# else:
# print(f"โ ๏ธ Test data not found. Creating synthetic test data...")
# test_data = self.create_synthetic_test_data()
# return test_data
# def create_synthetic_test_data(self) -> List[Dict]:
# """Create synthetic test data if real data is not available"""
# synthetic_data = [
# {
# "text": "### Input:\nๆ่ฟในใใฌในใๆใใฆใใพใใ\n\n### Response:\nในใใฌในใๆใใฆใใใฎใงใใญใใใใฏๅคงๅคใคใใใใจใ ใจๆใใพใใใฉใฎใใใช็ถๆณใงในใใฌในใๆใใใใจใๅคใใงใใ๏ผใ่ฉฑใ่ใใใฆใใใ ใใใฐใไธ็ทใซๅฏพๅฆๆณใ่ใใใใจใใงใใพใใ",
# "input": "ๆ่ฟในใใฌในใๆใใฆใใพใใ",
# "reference": "ในใใฌในใๆใใฆใใใฎใงใใญใใใใฏๅคงๅคใคใใใใจใ ใจๆใใพใใใฉใฎใใใช็ถๆณใงในใใฌในใๆใใใใจใๅคใใงใใ๏ผใ่ฉฑใ่ใใใฆใใใ ใใใฐใไธ็ทใซๅฏพๅฆๆณใ่ใใใใจใใงใใพใใ"
# },
# {
# "text": "### Input:\nไปไบใใใพใใใใชใใฆๆฉใใงใใพใใ\n\n### Response:\nไปไบใงใๆฉใฟใชใฎใงใใญใใใพใใใใชใใจๆใใใจใๆฌๅฝใซ่พใใงใใใญใๅ
ทไฝ็ใซใฉใฎใใใช็นใงๅฐ้ฃใๆใใฆใใใฃใใใใพใใ๏ผไธ็ทใซๆด็ใใฆใฟใพใใใใ",
# "input": "ไปไบใใใพใใใใชใใฆๆฉใใงใใพใใ",
# "reference": "ไปไบใงใๆฉใฟใชใฎใงใใญใใใพใใใใชใใจๆใใใจใๆฌๅฝใซ่พใใงใใใญใๅ
ทไฝ็ใซใฉใฎใใใช็นใงๅฐ้ฃใๆใใฆใใใฃใใใใพใใ๏ผไธ็ทใซๆด็ใใฆใฟใพใใใใ"
# },
# {
# "text": "### Input:\nไบบ้้ขไฟใงๅฐใฃใฆใใพใใ\n\n### Response:\nไบบ้้ขไฟใฎๆฉใฟใฏๆฌๅฝใซๅฟใ็ฒใใพใใใญใใๆฐๆใกใๅฏใใใพใใใฉใฎใใใช้ขไฟๆงใงใๅฐใใงใใใใ๏ผ่ทๅ ดใๅฎถๆใๅไบบ้ขไฟใชใฉใใใๅฐใ่ฉณใใใ่ใใใใใ ใใพใใ๏ผ",
# "input": "ไบบ้้ขไฟใงๅฐใฃใฆใใพใใ",
# "reference": "ไบบ้้ขไฟใฎๆฉใฟใฏๆฌๅฝใซๅฟใ็ฒใใพใใใญใใๆฐๆใกใๅฏใใใพใใใฉใฎใใใช้ขไฟๆงใงใๅฐใใงใใใใ๏ผ่ทๅ ดใๅฎถๆใๅไบบ้ขไฟใชใฉใใใๅฐใ่ฉณใใใ่ใใใใใ ใใพใใ๏ผ"
# },
# {
# "text": "### Input:\nๅฐๆฅใไธๅฎใงใใ\n\n### Response:\nๅฐๆฅใธใฎไธๅฎใๆฑใใฆใใใฃใใใใฎใงใใญใๅ
ใ่ฆใใชใไธๅฎใฏใใจใฆใ้ใๆใใใใใใจใจๆใใพใใ็นใซใฉใฎใใใช็นใซใคใใฆไธๅฎใๆใใฆใใใฃใใใใพใใ๏ผ",
# "input": "ๅฐๆฅใไธๅฎใงใใ",
# "reference": "ๅฐๆฅใธใฎไธๅฎใๆฑใใฆใใใฃใใใใฎใงใใญใๅ
ใ่ฆใใชใไธๅฎใฏใใจใฆใ้ใๆใใใใใใจใจๆใใพใใ็นใซใฉใฎใใใช็นใซใคใใฆไธๅฎใๆใใฆใใใฃใใใใพใใ๏ผ"
# },
# {
# "text": "### Input:\n่ชไฟกใๆใฆใพใใใ\n\n### Response:\n่ชไฟกใๆใฆใชใใจใใใๆฐๆใกใใใใใใใพใใๅคใใฎๆนใๅใใใใชๆฉใฟใๆฑใใฆใใพใใใฉใฎใใใชๅ ด้ขใง็นใซ่ชไฟกใๆใฆใชใใจๆใใพใใ๏ผใใชใใฎๅผทใฟใไธ็ทใซ่ฆใคใใฆใใใพใใใใ",
# "input": "่ชไฟกใๆใฆใพใใใ",
# "reference": "่ชไฟกใๆใฆใชใใจใใใๆฐๆใกใใใใใใใพใใๅคใใฎๆนใๅใใใใชๆฉใฟใๆฑใใฆใใพใใใฉใฎใใใชๅ ด้ขใง็นใซ่ชไฟกใๆใฆใชใใจๆใใพใใ๏ผใใชใใฎๅผทใฟใไธ็ทใซ่ฆใคใใฆใใใพใใใใ"
# }
# ]
# return synthetic_data
# def merge_and_save_model(self, force_merge: bool = False):
# """Merge LoRA weights with base model and save"""
# if os.path.exists(self.merged_model_path) and not force_merge:
# print(f"โ
Merged model already exists at {self.merged_model_path}")
# return
# print("\n๐ Merging LoRA adapter with base model...")
# try:
# # Load base model
# print(" Loading base model...")
# base_model = AutoModelForCausalLM.from_pretrained(
# self.base_model_name,
# torch_dtype=torch.float16,
# device_map="auto" if self.device == "cuda" else None,
# trust_remote_code=True,
# low_cpu_mem_usage=True
# )
# # Check if adapter exists
# adapter_config_path = os.path.join(self.finetuned_model_path, "adapter_config.json")
# if not os.path.exists(adapter_config_path):
# print(f"โ ๏ธ No LoRA adapter found at {self.finetuned_model_path}")
# model = base_model
# else:
# # Load LoRA adapter
# print(" Loading LoRA adapter...")
# model = PeftModel.from_pretrained(
# base_model,
# self.finetuned_model_path,
# torch_dtype=torch.float16
# )
# # Merge weights
# print(" Merging weights...")
# model = model.merge_and_unload()
# # Save merged model
# print(f" Saving merged model to {self.merged_model_path}...")
# model.save_pretrained(self.merged_model_path)
# # Save tokenizer
# tokenizer = AutoTokenizer.from_pretrained(
# self.finetuned_model_path
# if os.path.exists(os.path.join(self.finetuned_model_path, "tokenizer_config.json"))
# else self.base_model_name
# )
# tokenizer.save_pretrained(self.merged_model_path)
# print("โ
Model merged and saved successfully!")
# # Clean up memory
# del base_model, model
# gc.collect()
# torch.cuda.empty_cache()
# except Exception as e:
# print(f"โ Error during merging: {e}")
# raise
# def load_models(self):
# """Load base and fine-tuned models for comparison"""
# print("\n๐ Loading models for benchmarking...")
# # Load tokenizer
# self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
# if self.tokenizer.pad_token is None:
# self.tokenizer.pad_token = self.tokenizer.eos_token
# # Load base model
# print(" Loading base model...")
# self.base_model = AutoModelForCausalLM.from_pretrained(
# self.base_model_name,
# torch_dtype=torch.float16,
# device_map="auto" if self.device == "cuda" else None,
# trust_remote_code=True,
# low_cpu_mem_usage=True
# )
# self.base_model.eval()
# # Load merged fine-tuned model
# if os.path.exists(self.merged_model_path):
# print(" Loading merged fine-tuned model...")
# self.finetuned_model = AutoModelForCausalLM.from_pretrained(
# self.merged_model_path,
# torch_dtype=torch.float16,
# device_map="auto" if self.device == "cuda" else None,
# trust_remote_code=True,
# low_cpu_mem_usage=True
# )
# else:
# print(" Loading fine-tuned model (attempting PEFT)...")
# try:
# base_for_peft = AutoModelForCausalLM.from_pretrained(
# self.base_model_name,
# torch_dtype=torch.float16,
# device_map="auto" if self.device == "cuda" else None,
# trust_remote_code=True,
# low_cpu_mem_usage=True
# )
# self.finetuned_model = PeftModel.from_pretrained(
# base_for_peft,
# self.finetuned_model_path,
# torch_dtype=torch.float16
# )
# except:
# self.finetuned_model = AutoModelForCausalLM.from_pretrained(
# self.finetuned_model_path,
# torch_dtype=torch.float16,
# device_map="auto" if self.device == "cuda" else None,
# trust_remote_code=True,
# low_cpu_mem_usage=True
# )
# self.finetuned_model.eval()
# print("โ
Models loaded successfully!")
# def generate_response(self, model, prompt: str, max_length: int = 150) -> str:
# """Generate response from model"""
# inputs = self.tokenizer(
# prompt,
# return_tensors="pt",
# truncation=True,
# max_length=512
# )
# if self.device == "cuda":
# inputs = {k: v.cuda() for k, v in inputs.items()}
# with torch.no_grad():
# outputs = model.generate(
# **inputs,
# max_new_tokens=max_length,
# temperature=0.7,
# do_sample=True,
# top_p=0.9,
# pad_token_id=self.tokenizer.pad_token_id,
# eos_token_id=self.tokenizer.eos_token_id
# )
# response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# # Extract only the generated response
# if "### Response:" in response:
# response = response.split("### Response:")[-1].strip()
# elif "Response:" in response:
# response = response.split("Response:")[-1].strip()
# else:
# # Remove the input prompt from response
# response = response[len(prompt):].strip()
# return response
# def tokenize_japanese(self, text: str) -> List[str]:
# """Tokenize Japanese text for BLEU calculation"""
# # Simple character-based tokenization for Japanese
# # In production, use MeCab or similar for better tokenization
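# # Hypothetical sketch of a morphological tokenizer, assuming the `fugashi`
# # (MeCab) wrapper is installed:
# #     from fugashi import Tagger
# #     tagger = Tagger()
# #     tokens = [word.surface for word in tagger(text)]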
# import re
# # Remove special characters and split
# text = re.sub(r'[。、！？\n]', ' ', text)
# tokens = text.strip().split()
# # Character-level tokenization as fallback
# if not tokens:
# tokens = list(text.strip())
# return tokens
# def calculate_bleu_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
# """Calculate BLEU-1, BLEU-2, BLEU-3, BLEU-4 scores"""
# # Tokenize texts
# ref_tokens = self.tokenize_japanese(reference)
# hyp_tokens = self.tokenize_japanese(hypothesis)
# # Calculate BLEU scores with different n-grams
# scores = {}
# # BLEU-1 (unigram)
# scores['BLEU-1'] = sentence_bleu(
# [ref_tokens], hyp_tokens,
# weights=(1.0, 0, 0, 0),
# smoothing_function=self.smoothing
# )
# # BLEU-2 (bigram)
# scores['BLEU-2'] = sentence_bleu(
# [ref_tokens], hyp_tokens,
# weights=(0.5, 0.5, 0, 0),
# smoothing_function=self.smoothing
# )
# # BLEU-3 (trigram)
# scores['BLEU-3'] = sentence_bleu(
# [ref_tokens], hyp_tokens,
# weights=(0.33, 0.33, 0.34, 0),
# smoothing_function=self.smoothing
# )
# # BLEU-4 (4-gram)
# scores['BLEU-4'] = sentence_bleu(
# [ref_tokens], hyp_tokens,
# weights=(0.25, 0.25, 0.25, 0.25),
# smoothing_function=self.smoothing
# )
# return scores
# def calculate_rouge_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
# """Calculate ROUGE-1, ROUGE-2, ROUGE-L scores"""
# scores = self.rouge_scorer.score(reference, hypothesis)
# return {
# 'ROUGE-1': scores['rouge1'].fmeasure,
# 'ROUGE-2': scores['rouge2'].fmeasure,
# 'ROUGE-L': scores['rougeL'].fmeasure
# }
# def run_bleu_rouge_benchmark(self, num_samples: int = None):
# """Run comprehensive BLEU and ROUGE benchmark"""
# print("\n" + "="*70)
# print("๐ RUNNING BLEU & ROUGE BENCHMARK")
# print("="*70)
# # Load test data
# test_data = self.load_test_data()
# if num_samples:
# test_data = test_data[:num_samples]
# print(f" Using {num_samples} samples for benchmarking")
# # Initialize score collectors
# base_scores = defaultdict(list)
# finetuned_scores = defaultdict(list)
# # Metrics to calculate
# metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4',
# 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
# print(f"\n๐ Evaluating {len(test_data)} test examples...")
# print("-" * 70)
# detailed_results = []
# for i, example in enumerate(tqdm(test_data, desc="Evaluating")):
# # Extract input and reference
# if 'input' in example:
# input_text = example['input']
# else:
# # Try to extract from text field
# if "### Input:" in example['text']:
# input_text = example['text'].split("### Input:")[1].split("### Response:")[0].strip()
# else:
# input_text = example['text'].split("\n")[0].strip()
# if 'reference' in example:
# reference = example['reference']
# else:
# # Try to extract from text field
# if "### Response:" in example['text']:
# reference = example['text'].split("### Response:")[1].strip()
# else:
# parts = example['text'].split("\n")
# reference = parts[1] if len(parts) > 1 else parts[0]
# # Format input for models
# formatted_input = f"### Instruction:\nใใชใใฏๆใใใใฎใใๅฟ็ใซใฆใณใปใฉใผใงใใ\n\n### Input:\n{input_text}\n\n### Response:\n"
# # Generate responses
# base_response = self.generate_response(self.base_model, formatted_input)
# finetuned_response = self.generate_response(self.finetuned_model, formatted_input)
# # Calculate BLEU scores
# base_bleu = self.calculate_bleu_scores(reference, base_response)
# finetuned_bleu = self.calculate_bleu_scores(reference, finetuned_response)
# # Calculate ROUGE scores
# base_rouge = self.calculate_rouge_scores(reference, base_response)
# finetuned_rouge = self.calculate_rouge_scores(reference, finetuned_response)
# # Combine scores
# base_all_scores = {**base_bleu, **base_rouge}
# finetuned_all_scores = {**finetuned_bleu, **finetuned_rouge}
# # Collect scores
# for metric in metrics:
# base_scores[metric].append(base_all_scores[metric])
# finetuned_scores[metric].append(finetuned_all_scores[metric])
# # Store detailed results
# detailed_results.append({
# 'input': input_text,
# 'reference': reference,
# 'base_response': base_response,
# 'finetuned_response': finetuned_response,
# 'base_scores': base_all_scores,
# 'finetuned_scores': finetuned_all_scores
# })
# # Print sample results
# if i < 3: # Show first 3 examples
# print(f"\n๐ Example {i+1}:")
# print(f" Input: {input_text[:50]}...")
# print(f" Reference: {reference[:50]}...")
# print(f" Base response: {base_response[:50]}...")
# print(f" Fine-tuned response: {finetuned_response[:50]}...")
# print(f" Base BLEU-4: {base_bleu['BLEU-4']:.3f}")
# print(f" Fine-tuned BLEU-4: {finetuned_bleu['BLEU-4']:.3f}")
# # Calculate aggregate statistics
# print("\n" + "="*70)
# print("๐ BENCHMARK RESULTS")
# print("="*70)
# self.results = {
# 'detailed_results': detailed_results,
# 'aggregate_scores': {},
# 'improvements': {}
# }
# # Print and store results
# print("\n" + "-"*70)
# print(f"{'Metric':<12} {'Base Model':<20} {'Fine-tuned Model':<20} {'Improvement':<15}")
# print("-"*70)
# for metric in metrics:
# base_mean = np.mean(base_scores[metric])
# base_std = np.std(base_scores[metric])
# finetuned_mean = np.mean(finetuned_scores[metric])
# finetuned_std = np.std(finetuned_scores[metric])
# # Calculate improvement
# if base_mean > 0:
# improvement = ((finetuned_mean - base_mean) / base_mean) * 100
# else:
# improvement = 0
# # Store results
# self.results['aggregate_scores'][metric] = {
# 'base_mean': base_mean,
# 'base_std': base_std,
# 'finetuned_mean': finetuned_mean,
# 'finetuned_std': finetuned_std
# }
# self.results['improvements'][metric] = improvement
# # Print results
# base_str = f"{base_mean:.3f} (ยฑ{base_std:.3f})"
# finetuned_str = f"{finetuned_mean:.3f} (ยฑ{finetuned_std:.3f})"
# imp_str = f"{improvement:+.1f}%"
# # Color code improvement
# if improvement > 0:
# imp_str = f"โ
{imp_str}"
# elif improvement < 0:
# imp_str = f"โ ๏ธ {imp_str}"
# else:
# imp_str = f"โ {imp_str}"
# print(f"{metric:<12} {base_str:<20} {finetuned_str:<20} {imp_str:<15}")
# # Calculate overall scores
# print("\n" + "="*70)
# print("๐ฏ OVERALL PERFORMANCE")
# print("="*70)
# # Average BLEU score
# bleu_metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
# base_bleu_avg = np.mean([np.mean(base_scores[m]) for m in bleu_metrics])
# finetuned_bleu_avg = np.mean([np.mean(finetuned_scores[m]) for m in bleu_metrics])
# bleu_improvement = ((finetuned_bleu_avg - base_bleu_avg) / base_bleu_avg) * 100 if base_bleu_avg > 0 else 0
# # Average ROUGE score
# rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
# base_rouge_avg = np.mean([np.mean(base_scores[m]) for m in rouge_metrics])
# finetuned_rouge_avg = np.mean([np.mean(finetuned_scores[m]) for m in rouge_metrics])
# rouge_improvement = ((finetuned_rouge_avg - base_rouge_avg) / base_rouge_avg) * 100 if base_rouge_avg > 0 else 0
# # Overall average
# base_overall = np.mean([np.mean(base_scores[m]) for m in metrics])
# finetuned_overall = np.mean([np.mean(finetuned_scores[m]) for m in metrics])
# overall_improvement = ((finetuned_overall - base_overall) / base_overall) * 100 if base_overall > 0 else 0
# self.results['summary'] = {
# 'bleu_average': {
# 'base': base_bleu_avg,
# 'finetuned': finetuned_bleu_avg,
# 'improvement': bleu_improvement
# },
# 'rouge_average': {
# 'base': base_rouge_avg,
# 'finetuned': finetuned_rouge_avg,
# 'improvement': rouge_improvement
# },
# 'overall': {
# 'base': base_overall,
# 'finetuned': finetuned_overall,
# 'improvement': overall_improvement
# }
# }
# print(f"\n๐ Average BLEU Score:")
# print(f" Base Model: {base_bleu_avg:.3f}")
# print(f" Fine-tuned Model: {finetuned_bleu_avg:.3f}")
# print(f" Improvement: {bleu_improvement:+.1f}%")
# print(f"\n๐ Average ROUGE Score:")
# print(f" Base Model: {base_rouge_avg:.3f}")
# print(f" Fine-tuned Model: {finetuned_rouge_avg:.3f}")
# print(f" Improvement: {rouge_improvement:+.1f}%")
# print(f"\n๐ฏ Overall Average:")
# print(f" Base Model: {base_overall:.3f}")
# print(f" Fine-tuned Model: {finetuned_overall:.3f}")
# print(f" Improvement: {overall_improvement:+.1f}%")
# print("="*70)
# return self.results
# def visualize_results(self, save_path: str = "bleu_rouge_benchmark.png"):
# """Create comprehensive visualization of BLEU and ROUGE results"""
# if 'aggregate_scores' not in self.results:
# print("โ No results to visualize. Run benchmark first.")
# return
# print("\n๐ Creating visualizations...")
# fig, axes = plt.subplots(2, 3, figsize=(18, 12))
# # Color scheme
# base_color = '#3498db'
# finetuned_color = '#e74c3c'
# improvement_positive = '#27ae60'
# improvement_negative = '#c0392b'
# # 1. BLEU Scores Comparison
# bleu_metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
# bleu_base = [self.results['aggregate_scores'][m]['base_mean'] for m in bleu_metrics]
# bleu_finetuned = [self.results['aggregate_scores'][m]['finetuned_mean'] for m in bleu_metrics]
# x = np.arange(len(bleu_metrics))
# width = 0.35
# axes[0, 0].bar(x - width/2, bleu_base, width, label='Base Model',
# color=base_color, alpha=0.8)
# axes[0, 0].bar(x + width/2, bleu_finetuned, width, label='Fine-tuned Model',
# color=finetuned_color, alpha=0.8)
# axes[0, 0].set_xlabel('BLEU Metrics')
# axes[0, 0].set_ylabel('Score')
# axes[0, 0].set_title('BLEU Score Comparison')
# axes[0, 0].set_xticks(x)
# axes[0, 0].set_xticklabels(bleu_metrics)
# axes[0, 0].legend()
# axes[0, 0].grid(True, alpha=0.3)
# axes[0, 0].set_ylim([0, max(max(bleu_base), max(bleu_finetuned)) * 1.2])
# # 2. ROUGE Scores Comparison
# rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
# rouge_base = [self.results['aggregate_scores'][m]['base_mean'] for m in rouge_metrics]
# rouge_finetuned = [self.results['aggregate_scores'][m]['finetuned_mean'] for m in rouge_metrics]
# x = np.arange(len(rouge_metrics))
# axes[0, 1].bar(x - width/2, rouge_base, width, label='Base Model',
# color=base_color, alpha=0.8)
# axes[0, 1].bar(x + width/2, rouge_finetuned, width, label='Fine-tuned Model',
# color=finetuned_color, alpha=0.8)
# axes[0, 1].set_xlabel('ROUGE Metrics')
# axes[0, 1].set_ylabel('Score')
# axes[0, 1].set_title('ROUGE Score Comparison')
# axes[0, 1].set_xticks(x)
# axes[0, 1].set_xticklabels(rouge_metrics)
# axes[0, 1].legend()
# axes[0, 1].grid(True, alpha=0.3)
# axes[0, 1].set_ylim([0, max(max(rouge_base), max(rouge_finetuned)) * 1.2])
# # 3. Improvement Percentages
# all_metrics = bleu_metrics + rouge_metrics
# improvements = [self.results['improvements'][m] for m in all_metrics]
# colors = [improvement_positive if imp > 0 else improvement_negative for imp in improvements]
# axes[0, 2].barh(range(len(all_metrics)), improvements, color=colors, alpha=0.7)
# axes[0, 2].set_yticks(range(len(all_metrics)))
# axes[0, 2].set_yticklabels(all_metrics)
# axes[0, 2].set_xlabel('Improvement (%)')
# axes[0, 2].set_title('Performance Improvement by Metric')
# axes[0, 2].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
# axes[0, 2].grid(True, alpha=0.3, axis='x')
# # 4. Line plot showing progression
# axes[1, 0].plot(bleu_metrics, bleu_base, 'o-', label='Base Model',
# color=base_color, linewidth=2, markersize=8)
# axes[1, 0].plot(bleu_metrics, bleu_finetuned, 's-', label='Fine-tuned Model',
# color=finetuned_color, linewidth=2, markersize=8)
# axes[1, 0].set_xlabel('BLEU N-gram')
# axes[1, 0].set_ylabel('Score')
# axes[1, 0].set_title('BLEU Score Progression')
# axes[1, 0].legend()
# axes[1, 0].grid(True, alpha=0.3)
# # 5. Summary Statistics
# ax5 = axes[1, 1]
# ax5.axis('off')
# summary_text = f"""
# BENCHMARK SUMMARY
# {'='*30}
# BLEU Average:
# Base: {self.results['summary']['bleu_average']['base']:.3f}
# Fine-tuned: {self.results['summary']['bleu_average']['finetuned']:.3f}
# Improvement: {self.results['summary']['bleu_average']['improvement']:+.1f}%
# ROUGE Average:
# Base: {self.results['summary']['rouge_average']['base']:.3f}
# Fine-tuned: {self.results['summary']['rouge_average']['finetuned']:.3f}
# Improvement: {self.results['summary']['rouge_average']['improvement']:+.1f}%
# Overall Performance:
# Base: {self.results['summary']['overall']['base']:.3f}
# Fine-tuned: {self.results['summary']['overall']['finetuned']:.3f}
# Improvement: {self.results['summary']['overall']['improvement']:+.1f}%
# Best Improvements:
# """
# # Find best improvements
# sorted_metrics = sorted(all_metrics,
# key=lambda m: self.results['improvements'][m],
# reverse=True)
# for m in sorted_metrics[:2]:
# summary_text += f" โข {m}: {self.results['improvements'][m]:+.1f}%\n"
# if any(self.results['improvements'][m] < 0 for m in all_metrics):
# summary_text += f"\nNeeds Attention:\n"
# for m in sorted_metrics[-2:]:
# if self.results['improvements'][m] < 0:
# summary_text += f" โข {m}: {self.results['improvements'][m]:+.1f}%\n"
# ax5.text(0.1, 0.9, summary_text, transform=ax5.transAxes,
# fontsize=10, verticalalignment='top', fontfamily='monospace')
# # 6. Heatmap of all scores
# metrics_for_heatmap = all_metrics
# models = ['Base', 'Fine-tuned']
# heatmap_data = []
# for metric in metrics_for_heatmap:
# heatmap_data.append([
# self.results['aggregate_scores'][metric]['base_mean'],
# self.results['aggregate_scores'][metric]['finetuned_mean']
# ])
# im = axes[1, 2].imshow(heatmap_data, cmap='YlOrRd', aspect='auto')
# axes[1, 2].set_xticks(np.arange(len(models)))
# axes[1, 2].set_yticks(np.arange(len(metrics_for_heatmap)))
# axes[1, 2].set_xticklabels(models)
# axes[1, 2].set_yticklabels(metrics_for_heatmap)
# axes[1, 2].set_title('Score Heatmap')
# # Add text annotations
# for i in range(len(metrics_for_heatmap)):
# for j in range(len(models)):
# text = axes[1, 2].text(j, i, f'{heatmap_data[i][j]:.3f}',
# ha="center", va="center", color="black", fontsize=8)
# plt.colorbar(im, ax=axes[1, 2])
# plt.suptitle('BLEU & ROUGE Benchmark Results', fontsize=16, fontweight='bold')
# plt.tight_layout()
# plt.savefig(save_path, dpi=300, bbox_inches='tight')
# print(f"โ
Visualization saved to {save_path}")
# plt.show()
# def save_results(self, output_path: str = "bleu_rouge_results.json"):
# """Save benchmark results to JSON"""
# # Convert numpy types to Python native types for JSON serialization
# def convert_to_native(obj):
# if isinstance(obj, np.floating):
# return float(obj)
# elif isinstance(obj, np.integer):
# return int(obj)
# elif isinstance(obj, np.ndarray):
# return obj.tolist()
# elif isinstance(obj, dict):
# return {k: convert_to_native(v) for k, v in obj.items()}
# elif isinstance(obj, list):
# return [convert_to_native(item) for item in obj]
# return obj
# results_native = convert_to_native(self.results)
# with open(output_path, 'w', encoding='utf-8') as f:
# json.dump(results_native, f, ensure_ascii=False, indent=2)
# print(f"โ
Results saved to {output_path}")
# def generate_detailed_report(self, output_path: str = "bleu_rouge_report.md"):
# """Generate detailed markdown report"""
# if not self.results:
# print("โ No results to report. Run benchmark first.")
# return
# report = f"""# BLEU & ROUGE Benchmark Report
# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
# ## Executive Summary
# Comprehensive evaluation of the fine-tuned counseling model using BLEU and ROUGE metrics.
# ### Overall Performance
# - **Base Model Score**: {self.results['summary']['overall']['base']:.3f}
# - **Fine-tuned Model Score**: {self.results['summary']['overall']['finetuned']:.3f}
# - **Overall Improvement**: {self.results['summary']['overall']['improvement']:+.1f}%
# ## Detailed Metrics
# ### BLEU Scores
# | Metric | Base Model | Fine-tuned Model | Improvement |
# |--------|------------|------------------|-------------|
# """
# for metric in ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']:
# scores = self.results['aggregate_scores'][metric]
# report += f"| {metric} | {scores['base_mean']:.3f} (ยฑ{scores['base_std']:.3f}) | "
# report += f"{scores['finetuned_mean']:.3f} (ยฑ{scores['finetuned_std']:.3f}) | "
# report += f"{self.results['improvements'][metric]:+.1f}% |\n"
# report += f"""
# **BLEU Average**: {self.results['summary']['bleu_average']['improvement']:+.1f}% improvement
# ### ROUGE Scores
# | Metric | Base Model | Fine-tuned Model | Improvement |
# |--------|------------|------------------|-------------|
# """
# for metric in ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']:
# scores = self.results['aggregate_scores'][metric]
# report += f"| {metric} | {scores['base_mean']:.3f} (ยฑ{scores['base_std']:.3f}) | "
# report += f"{scores['finetuned_mean']:.3f} (ยฑ{scores['finetuned_std']:.3f}) | "
# report += f"{self.results['improvements'][metric]:+.1f}% |\n"
# report += f"""
# **ROUGE Average**: {self.results['summary']['rouge_average']['improvement']:+.1f}% improvement
# ## Sample Outputs
# """
# # Add sample outputs
# for i, result in enumerate(self.results['detailed_results'][:3]):
# report += f"""### Example {i+1}
# **Input**: {result['input']}
# **Reference**: {result['reference'][:200]}...
# **Base Model Response**: {result['base_response'][:200]}...
# **Fine-tuned Model Response**: {result['finetuned_response'][:200]}...
# **Scores**:
# - Base BLEU-4: {result['base_scores']['BLEU-4']:.3f}, ROUGE-L: {result['base_scores']['ROUGE-L']:.3f}
# - Fine-tuned BLEU-4: {result['finetuned_scores']['BLEU-4']:.3f}, ROUGE-L: {result['finetuned_scores']['ROUGE-L']:.3f}
# ---
# """
# report += """## Analysis & Recommendations
# """
# overall_imp = self.results['summary']['overall']['improvement']
# if overall_imp < -10:
# report += """### โ ๏ธ Significant Performance Degradation
# The fine-tuned model shows significant degradation in BLEU/ROUGE scores. This indicates:
# 1. **Catastrophic Forgetting**: The model has lost its language generation capabilities
# 2. **Overfitting**: The model memorized training data instead of learning patterns
# 3. **Format Mismatch**: Training and inference formats may differ
# **Immediate Actions Required**:
# - ✅ Ensure proper model merging (LoRA weights with base model)
# - ✅ Reduce learning rate (try 1e-5 or 2e-5)
# - ✅ Use smaller LoRA rank (r=4 or r=8)
# - ✅ Mix general conversation data with counseling data (80/20 ratio)
# - ✅ Implement regularization (weight decay=0.1, dropout=0.1)
# - ✅ Use early stopping with patience=3
# """
# elif overall_imp < 0:
# report += """### โ ๏ธ Minor Performance Degradation
# The model shows slight degradation. Common causes:
# 1. **Aggressive Fine-tuning**: Parameters changed too much
# 2. **Limited Training Data**: Not enough diverse examples
# 3. **Domain Shift**: Counseling domain too different from base training
# **Recommended Actions**:
# - ✅ Fine-tune for fewer epochs (1-2 instead of 3)
# - ✅ Use gradient accumulation for larger effective batch size
# - ✅ Implement knowledge distillation from base model
# - ✅ Add more diverse training examples
# """
# elif overall_imp < 10:
# report += """### ๐ Modest Improvement
# The model shows small but positive improvements.
# **To Further Improve**:
# - ✅ Increase training data quality and quantity
# - ✅ Experiment with different generation parameters
# - ✅ Fine-tune on domain-specific pre-training
# - ✅ Use ensemble methods with base model
# """
# else:
# report += """### โ
Significant Improvement
# Excellent results! The fine-tuned model shows substantial improvements.
# **Next Steps**:
# - โ
Deploy for A/B testing with users
# - โ
Monitor performance on edge cases
# - โ
Consider model compression for deployment
# - โ
Collect user feedback for iterative improvement
# """
# with open(output_path, 'w', encoding='utf-8') as f:
# f.write(report)
# print(f"โ
Detailed report saved to {output_path}")
# # Main execution
# if __name__ == "__main__":
# import argparse
# parser = argparse.ArgumentParser(description='Advanced BLEU & ROUGE Benchmark')
# parser.add_argument('--base_model', type=str, default='LiquidAI/LFM2-2.6B',
# help='Base model name')
# parser.add_argument('--finetuned_path', type=str, default='./counselor_model/best_model',
# help='Path to fine-tuned model')
# parser.add_argument('--merged_path', type=str, default='./merged_counselor_mode_2b',
# help='Path to save/load merged model')
# parser.add_argument('--test_data', type=str, default='./processed_data_score80/test.jsonl',
# help='Path to test data')
# parser.add_argument('--num_samples', type=int, default=None,
# help='Number of samples to evaluate (None for all)')
# parser.add_argument('--force_merge', action='store_true',
# help='Force re-merge even if merged model exists')
# parser.add_argument('--skip_merge', action='store_true',
# help='Skip merging step')
# parser.add_argument('--output_dir', type=str, default='./benchmark_results',
# help='Directory to save results')
# args = parser.parse_args()
# # Create output directory
# os.makedirs(args.output_dir, exist_ok=True)
# try:
# # Initialize benchmark
# print("๐ Initializing Advanced BLEU & ROUGE Benchmark")
# benchmark = AdvancedCounselorBenchmark(
# base_model_name=args.base_model,
# finetuned_model_path=args.finetuned_path,
# merged_model_path=args.merged_path,
# test_data_path=args.test_data
# )
# # Merge models if needed
# if not args.skip_merge:
# benchmark.merge_and_save_model(force_merge=args.force_merge)
# # Load models
# benchmark.load_models()
# # Run BLEU & ROUGE benchmark
# results = benchmark.run_bleu_rouge_benchmark(num_samples=args.num_samples)
# # Save results
# benchmark.save_results(os.path.join(args.output_dir, "bleu_rouge_results_2b.json"))
# # Generate visualizations
# benchmark.visualize_results(os.path.join(args.output_dir, "bleu_rouge_visualization_2b.png"))
# # Generate detailed report
# benchmark.generate_detailed_report(os.path.join(args.output_dir, "bleu_rouge_report_2b.md"))
# print("\nโ
BLEU & ROUGE Benchmarking completed successfully!")
# print(f"๐ Results saved to {args.output_dir}/")
# except Exception as e:
# print(f"\nโ Error during benchmarking: {e}")
# import traceback
# traceback.print_exc()
|