tlemagueresse committed on
Commit d75519d · 1 Parent(s): e646162
Files changed (5)
  1. __init__.py +0 -0
  2. demo.ipynb +138 -0
  3. model.py +91 -0
  4. packages.txt +1 -0
  5. requirements.txt +10 -0
__init__.py ADDED
File without changes
demo.ipynb ADDED
@@ -0,0 +1,138 @@
+ {
+ "cells": [
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-15T20:40:20.092409Z",
+ "start_time": "2025-02-15T20:40:14.408247Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "from datasets import load_dataset\n",
+ "from bert_score import score\n",
+ "from model import MetaModel\n",
+ "import time"
+ ],
+ "id": "5d14705fffbcfb64",
+ "outputs": [],
+ "execution_count": 1
+ },
+ {
+ "metadata": {
+ "jupyter": {
+ "is_executing": true
+ },
+ "ExecuteTime": {
+ "start_time": "2025-02-15T20:40:20.098977Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "dataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\")\n",
+ "\n",
+ "train_texts = dataset[\"article\"][:100]\n",
+ "train_summaries = dataset[\"highlights\"][:100]\n",
+ "test_texts = dataset[\"article\"][100:200]\n",
+ "test_summaries = dataset[\"highlights\"][100:200]\n",
+ "\n",
+ "model_names = [\"google-t5/t5-small\", \"google-t5/t5-base\", \"google-t5/t5-large\"]\n",
+ "meta_model = MetaModel(model_names)\n",
+ "start_time = time.time()\n",
+ "meta_model.fit(train_texts, train_summaries)\n",
+ "print(f\"MetaModel fitting time [sec]: {time.time() - start_time:.1f}\")\n"
+ ],
+ "id": "6d68f234e372396d",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "# Evaluation on test set\n",
+ "meta_model_scores = []\n",
+ "meta_model_times = []\n",
+ "model_scores = {name: [] for name in model_names}\n",
+ "model_times = {name: [] for name in model_names}\n",
+ "correct_predictions = 0\n",
+ "tolerance = 0.05\n",
+ "\n",
+ "for i, text in enumerate(test_texts):\n",
+ "    predicted_summary, meta_time = meta_model.summarize(text)\n",
+ "    # Recover which base model the meta-model's classifier selected for this text\n",
+ "    predicted_model = list(meta_model.models.keys())[\n",
+ "        meta_model.classifier.predict(np.array([meta_model.extract_features(text)]))[0]\n",
+ "    ]\n",
+ "    P, R, F1 = score([predicted_summary], [test_summaries[i]], lang=\"en\", verbose=False)\n",
+ "    meta_model_scores.append(F1.item())\n",
+ "    meta_model_times.append(meta_time)\n",
+ "\n",
+ "    model_results = []\n",
+ "    for model_name in model_names:\n",
+ "        model = meta_model.models[model_name]\n",
+ "        summary, elapsed_time = model.summarize(text)\n",
+ "        P, R, F1 = score([summary], [test_summaries[i]], lang=\"en\", verbose=False)\n",
+ "        f1_score = F1.item()\n",
+ "\n",
+ "        model_scores[model_name].append(f1_score)\n",
+ "        model_times[model_name].append(elapsed_time)\n",
+ "        model_results.append((model_name, f1_score, elapsed_time))\n",
+ "\n",
+ "    # Sort by BERTScore (desc), then by time (asc)\n",
+ "    model_results.sort(key=lambda x: (-x[1], x[2]))\n",
+ "    best_model, best_score, best_time = model_results[0]\n",
+ "\n",
+ "    # A faster model wins if its score is within the tolerance of the best one\n",
+ "    for model_name, f1_score, elapsed_time in model_results[1:]:\n",
+ "        if best_score - f1_score <= tolerance and elapsed_time < best_time:\n",
+ "            best_model, best_score, best_time = model_name, f1_score, elapsed_time\n",
+ "\n",
+ "    if best_model == predicted_model:\n",
+ "        correct_predictions += 1\n",
+ "\n",
+ "def compute_avg(values):\n",
+ "    return np.mean(values)\n",
+ "\n",
+ "print(\"\\n===== Model Evaluation =====\")\n",
+ "for model_name in model_names:\n",
+ "    avg_score = compute_avg(model_scores[model_name])\n",
+ "    avg_time = compute_avg(model_times[model_name])\n",
+ "    print(f\"{model_name}: BERTScore={avg_score:.4f}, Time={avg_time:.4f}s\")\n",
+ "\n",
+ "print(\n",
+ "    f\"\\nMeta-Model: Accuracy={correct_predictions / len(test_texts):.2%}, \"\n",
+ "    f\"BERTScore={compute_avg(meta_model_scores):.4f}, \"\n",
+ "    f\"Time={compute_avg(meta_model_times):.4f}s\"\n",
+ ")\n"
+ ],
+ "id": "6fd91b97e4b6e588"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": "",
+ "id": "204e55cee1ee63e4"
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
model.py ADDED
@@ -0,0 +1,91 @@
+ import time
+ from collections import Counter
+
+ import numpy as np
+ import spacy
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from bert_score import score
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import train_test_split
+ from scipy.stats import entropy
+
+ # Load the spaCy pipeline once at import time instead of on every call
+ nlp = spacy.load("en_core_web_sm")
+
+
+ def compute_entropy(text):
+     # Shannon entropy of the word-frequency distribution of the text
+     words = text.split()
+     word_freq = Counter(words)
+     probs = np.array(list(word_freq.values())) / sum(word_freq.values())
+     return entropy(probs)
+
+
+ def compute_syntactic_complexity(text):
+     # Mean dependency distance between each token and its syntactic head
+     doc = nlp(text)
+     depths = [abs(token.head.i - token.i) for token in doc if token.head != token]
+     return np.mean(depths) if depths else 0
+
+ class T5Model:
+     def __init__(self, model_name):
+         self.model_name = model_name
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+     def summarize(self, text):
+         inputs = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
+         start_time = time.time()
+         outputs = self.model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
+         end_time = time.time()
+         summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+         return summary, end_time - start_time
+
+ class MetaModel:
+     def __init__(self, model_names):
+         self.models = {name: T5Model(name) for name in model_names}
+         self.classifier = RandomForestClassifier(n_estimators=100, random_state=42)
+
+     def extract_features(self, text):
+         words = text.split()
+         num_words = len(words)
+         avg_word_length = np.mean([len(w) for w in words]) if words else 0
+         complexity = compute_syntactic_complexity(text)
+         text_entropy = compute_entropy(text)
+         return [num_words, avg_word_length, complexity, text_entropy]
+
+     def fit(self, texts, summaries):
+         X = np.array([self.extract_features(text) for text in texts])
+
+         best_model_labels = []
+         tolerance = 0.05  # BERTScore tolerance
+
+         for i, text in enumerate(texts):
+             model_results = []
+             for model_name, model in self.models.items():
+                 summary, elapsed_time = model.summarize(text)
+                 P, R, F1 = score([summary], [summaries[i]], lang="en", verbose=False)
+                 f1_score = F1.item()
+                 model_results.append((model_name, f1_score, elapsed_time))
+
+             # Sort models by BERTScore (desc) and then by time (asc)
+             model_results.sort(key=lambda x: (-x[1], x[2]))
+
+             # Select best model based on tolerance rule: a faster model wins
+             # if its BERTScore is within `tolerance` of the best one
+             best_model, best_score, best_time = model_results[0]
+             for model_name, f1_score, elapsed_time in model_results[1:]:
+                 if best_score - f1_score <= tolerance and elapsed_time < best_time:
+                     best_model, best_score, best_time = model_name, f1_score, elapsed_time
+
+             best_model_labels.append(best_model)
+
+         y = np.array([list(self.models.keys()).index(m) for m in best_model_labels])
+         # 20% of the labelled texts are held out (unused here, available for validation)
+         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+         self.classifier.fit(X_train, y_train)
+
+     def summarize(self, text):
+         features = np.array([self.extract_features(text)])
+         predicted_model_index = self.classifier.predict(features)[0]
+         predicted_model_name = list(self.models.keys())[predicted_model_index]
+         return self.models[predicted_model_name].summarize(text)
+
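Not part of the commit: a minimal, hypothetical usage sketch of the MetaModel class defined in model.py above. The two articles, reference summaries, and the reduced model list are invented for illustration; demo.ipynb is the actual end-to-end example on cnn_dailymail.

from model import MetaModel

# Toy data, made up for this sketch; real training pairs come from cnn_dailymail in demo.ipynb
texts = [
    "The city council approved the new transit budget after months of debate over service cuts.",
    "Researchers released an annotated corpus of news articles intended for summarization benchmarks.",
]
summaries = [
    "Council approves transit budget after long debate.",
    "New annotated news corpus released for summarization research.",
]

# Smaller model list than in demo.ipynb to keep the sketch light
meta = MetaModel(["google-t5/t5-small", "google-t5/t5-base"])
meta.fit(texts, summaries)  # labels each text with its best/fastest T5 model, then trains the classifier
summary, seconds = meta.summarize("Text of a new article to summarize goes here.")
print(summary, seconds)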
packages.txt ADDED
@@ -0,0 +1 @@
+ python -m spacy download en_core_web_sm
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch
+ transformers
+ datasets
+ spacy
+ numpy
+ scipy
+ rouge_score
+ bert_score
+ ipywidgets
+ scikit-learn