Mr-FineTuner committed on
Commit
15db98f
·
verified ·
1 Parent(s): cdc262f

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. README.md +41 -0
  2. example_notebook.ipynb +122 -0
  3. inference.py +33 -0
  4. model-card.md +65 -0
  5. model1.joblib +3 -0
  6. model2.joblib +3 -0
  7. requirements.txt +6 -0
README.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sentiment Analysis Models
2
+
3
+ This repository contains two logistic regression models trained to predict sentiment scores.
4
+
5
+ ## Model Details
6
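+ - Base embedding model: BAAI/bge-large-en-v1.5 (the embedder loaded in `inference.py`; embeddings from a different model will not match what the classifiers expect)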
7
+ - Architecture: LogisticRegression (scikit-learn)
8
+ - Training data: Custom sentiment dataset with dual expert annotations
9
+ - Data split: 70% training, 15% development, 15% test
10
+
11
+ ## Performance Metrics
12
+
13
+ ### Development Set
14
+ #### Against Expert 1:
15
+ - Exact match: 49.27%
16
+ - Within 1 level: 96.05%
17
+
18
+ #### Against Expert 2:
19
+ - Exact match: 41.00%
20
+ - Within 1 level: 93.05%
21
+
22
+ ### Test Set
23
+ #### Against Expert 1:
24
+ - Exact match: 49.32%
25
+ - Within 1 level: 94.93%
26
+
27
+ #### Against Expert 2:
28
+ - Exact match: 41.44%
29
+ - Within 1 level: 91.51%
30
+
31
+ ## Usage
32
+
33
+ See `inference.py` for an example of how to use these models to predict sentiment for new text.
34
+
35
+ ## Model Files
36
+ - `model1.joblib`: Model trained on Expert 1 annotations
37
+ - `model2.joblib`: Model trained on Expert 2 annotations
38
+
39
+ ## Data Files
40
+ - `dev_results.csv`: Complete predictions on development set
41
+ - `test_results.csv`: Complete predictions on test set
example_notebook.ipynb ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Sentiment Analysis Model Demo\n",
8
+ "This notebook demonstrates how to use the sentiment analysis models to predict sentiment for new text."
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import joblib\n",
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "from sentence_transformers import SentenceTransformer"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "# Load the models\n",
30
+ "model1 = joblib.load('model1.joblib')\n",
31
+ "model2 = joblib.load('model2.joblib')\n",
32
+ "\n",
33
+ "# Load the embedder\n",
34
+ "embedder = SentenceTransformer('BAAI/bge-large-en-v1.5')"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "def predict_sentiment(text):\n",
44
+ " # Generate embedding\n",
45
+ " embedding = embedder.encode([text])\n",
46
+ " \n",
47
+ " # Make predictions\n",
48
+ " pred1 = model1.predict(embedding)[0]\n",
49
+ " pred2 = model2.predict(embedding)[0]\n",
50
+ " \n",
51
+ " # Average and round\n",
52
+ " final_prediction = np.round((pred1 + pred2) / 2).astype(int)\n",
53
+ " \n",
54
+ " return final_prediction, pred1, pred2"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "# Try with a sample text\n",
64
+ "sample_text = \"I absolutely loved this movie! The actors were amazing and the plot was fantastic.\"\n",
65
+ "final_score, score1, score2 = predict_sentiment(sample_text)\n",
66
+ "\n",
67
+ "print(f\"Text: {sample_text}\")\n",
68
+ "print(f\"Final sentiment score: {final_score}\")\n",
69
+ "print(f\"Model 1 score: {score1}\")\n",
70
+ "print(f\"Model 2 score: {score2}\")"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": null,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "# Try with multiple texts\n",
80
+ "texts = [\n",
81
+ " \"This product is terrible. Complete waste of money.\",\n",
82
+ " \"The service was okay, nothing special.\",\n",
83
+ " \"Absolutely fantastic experience! Would highly recommend.\",\n",
84
+ " \"Not what I expected, but it wasn't bad either.\"\n",
85
+ "]\n",
86
+ "\n",
87
+ "results = []\n",
88
+ "for text in texts:\n",
89
+ " final_score, score1, score2 = predict_sentiment(text)\n",
90
+ " results.append({\n",
91
+ " 'Text': text,\n",
92
+ " 'Final Score': final_score,\n",
93
+ " 'Expert 1 Score': score1,\n",
94
+ " 'Expert 2 Score': score2\n",
95
+ " })\n",
96
+ "\n",
97
+ "pd.DataFrame(results)"
98
+ ]
99
+ }
100
+ ],
101
+ "metadata": {
102
+ "kernelspec": {
103
+ "display_name": "Python 3",
104
+ "language": "python",
105
+ "name": "python3"
106
+ },
107
+ "language_info": {
108
+ "codemirror_mode": {
109
+ "name": "ipython",
110
+ "version": 3
111
+ },
112
+ "file_extension": ".py",
113
+ "mimetype": "text/x-python",
114
+ "name": "python",
115
+ "nbconvert_exporter": "python",
116
+ "pygments_lexer": "ipython3",
117
+ "version": "3.8.10"
118
+ }
119
+ },
120
+ "nbformat": 4,
121
+ "nbformat_minor": 4
122
+ }
inference.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import joblib
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+ # Load the models
7
+ model1 = joblib.load('model1.joblib')
8
+ model2 = joblib.load('model2.joblib')
9
+
10
+ # Load the embedder
11
+ embedder = SentenceTransformer('BAAI/bge-large-en-v1.5')
12
+
13
+ def predict_sentiment(text):
14
+ # Generate embedding
15
+ embedding = embedder.encode([text])
16
+
17
+ # Make predictions
18
+ pred1 = model1.predict(embedding)[0]
19
+ pred2 = model2.predict(embedding)[0]
20
+
21
+ # Average and round
22
+ final_prediction = np.round((pred1 + pred2) / 2).astype(int)
23
+
24
+ return final_prediction, pred1, pred2
25
+
26
+ # Example usage
27
+ if __name__ == "__main__":
28
+ test_text = "I really enjoyed this product!"
29
+ final_score, score1, score2 = predict_sentiment(test_text)
30
+ print(f"Text: {test_text}")
31
+ print(f"Final sentiment score: {final_score}")
32
+ print(f"Model 1 score: {score1}")
33
+ print(f"Model 2 score: {score2}")
model-card.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: mit
4
+ library_name: scikit-learn
5
+ tags:
6
+ - sentiment-analysis
7
+ - text-classification
8
+ - scikit-learn
9
+ - sentence-transformers
10
+ datasets:
11
+ - custom_sentiment_dataset
12
+ metrics:
13
+ - accuracy
14
+ ---
15
+
16
+ # Sentiment Analysis Model
17
+
18
+ This model predicts sentiment scores based on text input. It uses sentence embeddings from BAAI/bge-large-en-v1.5 and logistic regression classifiers.
19
+
20
+ ## Model Description
21
+
22
+ This repository contains two logistic regression models trained to predict sentiment scores based on text embeddings. The models were trained on a custom dataset with annotations from two different experts.
23
+
24
+ ### Model Architecture
25
+
26
+ - Base embedding model: BAAI/bge-large-en-v1.5
27
+ - Classifier: LogisticRegression (scikit-learn)
28
+ - Final prediction: Average of both model predictions, rounded to nearest integer
29
+
30
+ ## Intended Use and Limitations
31
+
32
+ The models are designed for sentiment analysis tasks and work best with English text similar to the training data.
33
+
34
+ ## Training and Evaluation Data
35
+
36
+ The model was trained on a custom dataset with:
37
+ - 70% training data
38
+ - 15% development data
39
+ - 15% test data
40
+
41
+ Each sample has annotations from two human experts.
42
+
43
+ ## Evaluation Results
44
+
45
+ See README.md for detailed performance metrics on both development and test sets.
46
+
47
+ ## Using the Models
48
+
49
+ ```python
50
+ import joblib
51
+ import numpy as np
52
+ from sentence_transformers import SentenceTransformer
53
+
54
+ # Load the models
55
+ model1 = joblib.load('model1.joblib')
56
+ model2 = joblib.load('model2.joblib')
57
+ embedder = SentenceTransformer('BAAI/bge-large-en-v1.5')
58
+
59
+ def predict_sentiment(text):
60
+ embedding = embedder.encode([text])
61
+ pred1 = model1.predict(embedding)[0]
62
+ pred2 = model2.predict(embedding)[0]
63
+ final_prediction = np.round((pred1 + pred2) / 2).astype(int)
64
+ return final_prediction
65
+ ```
model1.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8e7d13df3bef2dfdc42a8fbcaa19a9f8cbf74b06399cf2b57f17a6198a4a693
3
+ size 50087
model2.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5bcef3af140f0f8d33b0b4e4cefa681bb0504bebbc5ce9f40b4f87822b655a0
3
+ size 50087
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ sentence-transformers>=2.2.2
2
+ scikit-learn>=1.0.0
3
+ numpy>=1.20.0
4
+ joblib>=1.1.0
5
+ pandas>=1.3.0
6
+ tabulate>=0.8.9