multilingual upgrade upload of sentiment-multilingual
Browse files- .gitattributes +1 -0
- README.md +210 -105
- config.json +12 -8
- model.safetensors +2 -2
- tokenizer.json +2 -14
- tokenizer_config.json +43 -3
.gitattributes
CHANGED
|
@@ -25,6 +25,7 @@
|
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,144 +1,249 @@
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
tags:
|
| 4 |
-
- sentiment-analysis
|
| 5 |
-
- text-classification
|
| 6 |
-
- zero-shot-distillation
|
| 7 |
-
- distillation
|
| 8 |
-
- zero-shot-classification
|
| 9 |
-
- debarta-v3
|
| 10 |
-
model-index:
|
| 11 |
-
- name: distilbert-base-multilingual-cased-sentiments-student
|
| 12 |
-
results: []
|
| 13 |
-
datasets:
|
| 14 |
-
- tyqiangz/multilingual-sentiments
|
| 15 |
language:
|
| 16 |
- en
|
|
|
|
|
|
|
|
|
|
| 17 |
- ar
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
- de
|
| 19 |
-
-
|
|
|
|
|
|
|
|
|
|
| 20 |
- fr
|
| 21 |
-
-
|
| 22 |
-
- zh
|
| 23 |
-
- id
|
| 24 |
-
- hi
|
| 25 |
- it
|
| 26 |
-
-
|
| 27 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
---
|
| 29 |
|
| 30 |
-
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
| 31 |
-
should probably proofread and complete it, then remove this comment. -->
|
| 32 |
|
| 33 |
-
#
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
-
In reality the multilingual-sentiment dataset is annotated of course,
|
| 39 |
-
but we'll pretend and ignore the annotations for the sake of example.
|
| 40 |
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
Teacher hypothesis template: "The sentiment of this text is {}."
|
| 44 |
-
Student model: distilbert-base-multilingual-cased
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
| 48 |
|
| 49 |
-
```python
|
| 50 |
-
from transformers import pipeline
|
| 51 |
|
| 52 |
-
|
| 53 |
-
model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
|
| 54 |
-
return_all_scores=True
|
| 55 |
-
)
|
| 56 |
|
| 57 |
-
|
| 58 |
-
distilled_student_sentiment_classifier ("I love this movie and i would watch it again and again!")
|
| 59 |
-
>> [[{'label': 'positive', 'score': 0.9731044769287109},
|
| 60 |
-
{'label': 'neutral', 'score': 0.016910076141357422},
|
| 61 |
-
{'label': 'negative', 'score': 0.009985478594899178}]]
|
| 62 |
|
| 63 |
-
|
| 64 |
-
distilled_student_sentiment_classifier("Saya suka filem ini dan saya akan menontonnya lagi dan lagi!")
|
| 65 |
-
[[{'label': 'positive', 'score': 0.9760093688964844},
|
| 66 |
-
{'label': 'neutral', 'score': 0.01804516464471817},
|
| 67 |
-
{'label': 'negative', 'score': 0.005945465061813593}]]
|
| 68 |
-
|
| 69 |
-
# japanese
|
| 70 |
-
distilled_student_sentiment_classifier("私はこの映画が大好きで、何度も見ます！")
|
| 71 |
-
>> [[{'label': 'positive', 'score': 0.9342429041862488},
|
| 72 |
-
{'label': 'neutral', 'score': 0.040193185210227966},
|
| 73 |
-
{'label': 'negative', 'score': 0.025563929229974747}]]
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
```
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
|
| 81 |
-
Notebook link: [here](https://github.com/LxYuan0420/nlp/blob/main/notebooks/Distilling_Zero_Shot_multilingual_distilbert_sentiments_student.ipynb)
|
| 82 |
|
| 83 |
-
|
| 84 |
|
| 85 |
-
|
| 86 |
|
| 87 |
-
|
| 88 |
-
python transformers/examples/research_projects/zero-shot-distillation/distill_classifier.py \
|
| 89 |
-
--data_file ./multilingual-sentiments/train_unlabeled.txt \
|
| 90 |
-
--class_names_file ./multilingual-sentiments/class_names.txt \
|
| 91 |
-
--hypothesis_template "The sentiment of this text is {}." \
|
| 92 |
-
--teacher_name_or_path MoritzLaurer/mDeBERTa-v3-base-mnli-xnli \
|
| 93 |
-
--teacher_batch_size 32 \
|
| 94 |
-
--student_name_or_path distilbert-base-multilingual-cased \
|
| 95 |
-
--output_dir ./distilbert-base-multilingual-cased-sentiments-student \
|
| 96 |
-
--per_device_train_batch_size 16 \
|
| 97 |
-
--fp16
|
| 98 |
-
```
|
| 99 |
|
| 100 |
-
|
| 101 |
-
```bash
|
| 102 |
-
###### modify L78 to disable fast tokenizer
|
| 103 |
-
default=False,
|
| 104 |
|
| 105 |
-
|
| 106 |
-
dataset = dataset.map(tokenizer, input_columns="text", fn_kwargs={"padding": "max_length", "truncation": True, "max_length": 512})
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
print(f"Manually deleted Teacher model, free some memory for student model.")
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
```
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
Training completed. Do not forget to share your model on huggingface.co/models =)
|
| 122 |
-
|
| 123 |
-
{'train_runtime': 2009.8864, 'train_samples_per_second': 73.0, 'train_steps_per_second': 4.563, 'train_loss': 0.6473459283913797, 'epoch': 1.0}
|
| 124 |
-
100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 9171/9171 [33:29<00:00, 4.56it/s]
|
| 125 |
-
[INFO|trainer.py:762] 2023-05-06 10:56:18,555 >> The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.
|
| 126 |
-
[INFO|trainer.py:3129] 2023-05-06 10:56:18,557 >> ***** Running Evaluation *****
|
| 127 |
-
[INFO|trainer.py:3131] 2023-05-06 10:56:18,557 >> Num examples = 146721
|
| 128 |
-
[INFO|trainer.py:3134] 2023-05-06 10:56:18,557 >> Batch size = 128
|
| 129 |
-
100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 1147/1147 [08:59<00:00, 2.13it/s]
|
| 130 |
-
05/06/2023 11:05:18 - INFO - __main__ - Agreement of student and teacher predictions: 88.29%
|
| 131 |
-
[INFO|trainer.py:2868] 2023-05-06 11:05:18,251 >> Saving model checkpoint to ./distilbert-base-multilingual-cased-sentiments-student
|
| 132 |
-
[INFO|configuration_utils.py:457] 2023-05-06 11:05:18,251 >> Configuration saved in ./distilbert-base-multilingual-cased-sentiments-student/config.json
|
| 133 |
-
[INFO|modeling_utils.py:1847] 2023-05-06 11:05:18,905 >> Model weights saved in ./distilbert-base-multilingual-cased-sentiments-student/pytorch_model.bin
|
| 134 |
-
[INFO|tokenization_utils_base.py:2171] 2023-05-06 11:05:18,905 >> tokenizer config file saved in ./distilbert-base-multilingual-cased-sentiments-student/tokenizer_config.json
|
| 135 |
-
[INFO|tokenization_utils_base.py:2178] 2023-05-06 11:05:18,905 >> Special tokens file saved in ./distilbert-base-multilingual-cased-sentiments-student/special_tokens_map.json
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
```
|
| 138 |
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
base_model: distilbert/distilbert-base-multilingual-cased
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
language:
|
| 4 |
- en
|
| 5 |
+
- zh
|
| 6 |
+
- es
|
| 7 |
+
- hi
|
| 8 |
- ar
|
| 9 |
+
- bn
|
| 10 |
+
- pt
|
| 11 |
+
- ru
|
| 12 |
+
- ja
|
| 13 |
- de
|
| 14 |
+
- ms
|
| 15 |
+
- te
|
| 16 |
+
- vi
|
| 17 |
+
- ko
|
| 18 |
- fr
|
| 19 |
+
- tr
|
|
|
|
|
|
|
|
|
|
| 20 |
- it
|
| 21 |
+
- pl
|
| 22 |
+
- uk
|
| 23 |
+
- tl
|
| 24 |
+
- nl
|
| 25 |
+
- gsw
|
| 26 |
+
- sw
|
| 27 |
+
library_name: transformers
|
| 28 |
+
license: cc-by-nc-4.0
|
| 29 |
+
pipeline_tag: text-classification
|
| 30 |
+
tags:
|
| 31 |
+
- text-classification
|
| 32 |
+
- sentiment-analysis
|
| 33 |
+
- sentiment
|
| 34 |
+
- synthetic data
|
| 35 |
+
- multi-class
|
| 36 |
+
- social-media-analysis
|
| 37 |
+
- customer-feedback
|
| 38 |
+
- product-reviews
|
| 39 |
+
- brand-monitoring
|
| 40 |
+
- multilingual
|
| 41 |
+
- 🇪🇺
|
| 42 |
+
- region:eu
|
| 43 |
+
datasets:
|
| 44 |
+
- tabularisai/swahili_sentiment_dataset
|
| 45 |
---
|
| 46 |
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
# ๐ Multilingual Sentiment Classification Model (23 Languages)
|
| 49 |
|
| 50 |
+
<!-- TRY IT HERE: `coming soon`
|
| 51 |
+
-->
|
| 52 |
+
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/Discord%20button.png" width="200"/>](https://discord.gg/sznxwdqBXj)
|
| 53 |
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
# NEWS!
|
| 56 |
+
- 2025/8: Major model update +1 new language: **Swahili**! Also, general improvements across all languages.
|
| 57 |
|
| 58 |
+
- 2025/8: Free API for our model! Please see below!
|
|
|
|
|
|
|
| 59 |
|
| 60 |
+
- 2025/7: We’ve just released ModernFinBERT, a model we’ve been working on for a while. It’s built on the ModernBERT architecture and trained on a mix of real and synthetic data, with LLM-based label correction applied to public datasets to fix human annotation errors.
|
| 61 |
+
It’s performing well across a range of benchmarks — in some cases improving accuracy by up to 48% over existing models like FinBERT.
|
| 62 |
+
You can check it out here on Hugging Face:
|
| 63 |
+
๐ https://huggingface.co/tabularisai/ModernFinBERT
|
| 64 |
|
| 65 |
+
- 2024/12: We are excited to introduce a multilingual sentiment model! Now you can analyze sentiment across multiple languages, enhancing your global reach.
|
| 66 |
|
|
|
|
|
|
|
| 67 |
|
| 68 |
+
## ๐ Hosted API
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
+
We provide a hosted inference API:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
**Example request body:**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
```bash
|
| 75 |
+
curl -X POST https://api.tabularis.ai/ \
|
| 76 |
+
-H "Content-Type: application/json" \
|
| 77 |
+
-d '{"text":"I love the design","return_all_scores":false}'
|
| 78 |
|
| 79 |
```
|
| 80 |
|
| 81 |
+
## Model Details
|
| 82 |
+
- `Model Name:` tabularisai/multilingual-sentiment-analysis
|
| 83 |
+
- `Base Model:` distilbert/distilbert-base-multilingual-cased
|
| 84 |
+
- `Task:` Text Classification (Sentiment Analysis)
|
| 85 |
+
- `Languages:` Supports English plus Chinese (中文), Spanish (Español), Hindi (हिन्दी), Arabic (العربية), Bengali (বাংলা), Portuguese (Português), Russian (Русский), Japanese (日本語), German (Deutsch), Malay (Bahasa Melayu), Telugu (తెలుగు), Vietnamese (Tiếng Việt), Korean (한국어), French (Français), Turkish (Türkçe), Italian (Italiano), Polish (Polski), Ukrainian (Українська), Tagalog, Dutch (Nederlands), Swiss German (Schweizerdeutsch), and Swahili.
|
| 86 |
+
- `Number of Classes:` 5 (*Very Negative, Negative, Neutral, Positive, Very Positive*)
|
| 87 |
+
- `Usage:`
|
| 88 |
+
- Social media analysis
|
| 89 |
+
- Customer feedback analysis
|
| 90 |
+
- Product reviews classification
|
| 91 |
+
- Brand monitoring
|
| 92 |
+
- Market research
|
| 93 |
+
- Customer service optimization
|
| 94 |
+
- Competitive intelligence
|
| 95 |
|
| 96 |
+
> If you wish to use this model for commercial purposes, please obtain a license by contacting: info@tabularis.ai
|
| 97 |
|
|
|
|
| 98 |
|
| 99 |
+
## Model Description
|
| 100 |
|
| 101 |
+
This model is a fine-tuned version of `distilbert/distilbert-base-multilingual-cased` for multilingual sentiment analysis. It leverages synthetic data from multiple sources to achieve robust performance across different languages and cultural contexts.
|
| 102 |
|
| 103 |
+
### Training Data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
Trained exclusively on synthetic multilingual data generated by advanced LLMs, ensuring wide coverage of sentiment expressions from various languages.
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
+
### Training Procedure
|
|
|
|
| 108 |
|
| 109 |
+
- Fine-tuned for 3.5 epochs.
|
| 110 |
+
- Achieved a train_acc_off_by_one of approximately 0.93 on the validation dataset.
|
|
|
|
| 111 |
|
| 112 |
+
## Intended Use
|
| 113 |
+
|
| 114 |
+
Ideal for:
|
| 115 |
+
- Multilingual social media monitoring
|
| 116 |
+
- International customer feedback analysis
|
| 117 |
+
- Global product review sentiment classification
|
| 118 |
+
- Worldwide brand sentiment tracking
|
| 119 |
+
|
| 120 |
+
## How to Use
|
| 121 |
+
|
| 122 |
+
Using pipelines, it takes only 4 lines:
|
| 123 |
+
|
| 124 |
+
```python
|
| 125 |
+
from transformers import pipeline
|
| 126 |
+
|
| 127 |
+
# Load the classification pipeline with the specified model
|
| 128 |
+
pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis")
|
| 129 |
+
|
| 130 |
+
# Classify a new sentence
|
| 131 |
+
sentence = "I love this product! It's amazing and works perfectly."
|
| 132 |
+
result = pipe(sentence)
|
| 133 |
+
|
| 134 |
+
# Print the result
|
| 135 |
+
print(result)
|
| 136 |
```
|
| 137 |
|
| 138 |
+
Below is a Python example on how to use the multilingual sentiment model without pipelines:
|
| 139 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
+
```python
|
| 142 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 143 |
+
import torch
|
| 144 |
+
|
| 145 |
+
model_name = "tabularisai/multilingual-sentiment-analysis"
|
| 146 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 147 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 148 |
+
|
| 149 |
+
def predict_sentiment(texts):
|
| 150 |
+
inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
|
| 151 |
+
with torch.no_grad():
|
| 152 |
+
outputs = model(**inputs)
|
| 153 |
+
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
|
| 154 |
+
sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}
|
| 155 |
+
return [sentiment_map[p] for p in torch.argmax(probabilities, dim=-1).tolist()]
|
| 156 |
+
|
| 157 |
+
texts = [
|
| 158 |
+
# English
|
| 159 |
+
"I absolutely love the new design of this app!", "The customer service was disappointing.", "The weather is fine, nothing special.",
|
| 160 |
+
# Chinese
|
| 161 |
+
"这家餐厅的菜味道非常棒！", "我对他的回答很失望。", "天气今天一般。",
|
| 162 |
+
# Spanish
|
| 163 |
+
"¡Me encanta cómo quedó la decoración!", "El servicio fue terrible y muy lento.", "El libro estuvo más o menos.",
|
| 164 |
+
# Arabic
|
| 165 |
+
"ุงูุฎุฏู
ุฉ ูู ูุฐุง ุงูููุฏู ุฑุงุฆุนุฉ ุฌุฏูุง!", "ูู
ูุนุฌุจูู ุงูุทุนุงู
ูู ูุฐุง ุงูู
ุทุนู
.", "ูุงูุช ุงูุฑุญูุฉ ุนุงุฏูุฉใ",
|
| 166 |
+
# Ukrainian
|
| 167 |
+
"ะะตะฝั ะดัะถะต ัะฟะพะดะพะฑะฐะปะฐัั ัั ะฒะธััะฐะฒะฐ!", "ะะฑัะปัะณะพะฒัะฒะฐะฝะฝั ะฑัะปะพ ะถะฐั
ะปะธะฒะธะผ.", "ะะฝะธะณะฐ ะฑัะปะฐ ะฟะพัะตัะตะดะฝัะพัใ",
|
| 168 |
+
# Hindi
|
| 169 |
+
"เคฏเคน เคเคเคน เคธเค เคฎเฅเค เค
เคฆเฅเคญเฅเคค เคนเฅ!", "เคฏเคน เค
เคจเฅเคญเคต เคฌเคนเฅเคค เคเคฐเคพเคฌ เคฅเคพเฅค", "เคซเคฟเคฒเฅเคฎ เค เฅเค-เค เคพเค เคฅเฅเฅค",
|
| 170 |
+
# Bengali
|
| 171 |
+
"เฆเฆเฆพเฆจเฆเฆพเฆฐ เฆชเฆฐเฆฟเฆฌเงเฆถ เฆ
เฆธเฆพเฆงเฆพเฆฐเฆฃ!", "เฆธเงเฆฌเฆพเฆฐ เฆฎเฆพเฆจ เฆเฆเงเฆฌเฆพเฆฐเงเฆ เฆเฆพเฆฐเฆพเฆชเฅค", "เฆเฆพเฆฌเฆพเฆฐเฆเฆพ เฆฎเงเฆเฆพเฆฎเงเฆเฆฟ เฆเฆฟเฆฒเฅค",
|
| 172 |
+
# Portuguese
|
| 173 |
+
"Este livro รฉ fantรกstico! Eu aprendi muitas coisas novas e inspiradoras.",
|
| 174 |
+
"Nรฃo gostei do produto, veio quebrado.", "O filme foi ok, nada de especial.",
|
| 175 |
+
# Japanese
|
| 176 |
+
"ใใฎใฌในใใฉใณใฎๆ็ใฏๆฌๅฝใซ็พๅณใใใงใ๏ผ", "ใใฎใใใซใฎใตใผใในใฏใใฃใใใใพใใใ", "ๅคฉๆฐใฏใพใใพใใงใใ",
|
| 177 |
+
# Russian
|
| 178 |
+
"ะฏ ะฒ ะฒะพััะพัะณะต ะพั ััะพะณะพ ะฝะพะฒะพะณะพ ะณะฐะดะถะตัะฐ!", "ะญัะพั ัะตัะฒะธั ะพััะฐะฒะธะป ั ะผะตะฝั ัะพะปัะบะพ ัะฐะทะพัะฐัะพะฒะฐะฝะธะต.", "ะัััะตัะฐ ะฑัะปะฐ ะพะฑััะฝะพะน, ะฝะธัะตะณะพ ะพัะพะฑะตะฝะฝะพะณะพ.",
|
| 179 |
+
# French
|
| 180 |
+
"J'adore ce restaurant, c'est excellent !", "L'attente รฉtait trop longue et frustrante.", "Le film รฉtait moyen, sans plus.",
|
| 181 |
+
# Turkish
|
| 182 |
+
"Bu otelin manzarasฤฑna bayฤฑldฤฑm!", "รrรผn tam bir hayal kฤฑrฤฑklฤฑฤฤฑydฤฑ.", "Konser fena deฤildi, ortalamaydฤฑ.",
|
| 183 |
+
# Italian
|
| 184 |
+
"Adoro questo posto, รจ fantastico!", "Il servizio clienti รจ stato pessimo.", "La cena era nella media.",
|
| 185 |
+
# Polish
|
| 186 |
+
"Uwielbiam tę restaurację, jedzenie jest świetne!", "Obsługa klienta była rozczarowująca.", "Pogoda jest w porządku, nic szczególnego.",
|
| 187 |
+
# Tagalog
|
| 188 |
+
"Ang ganda ng lugar na ito, sobrang aliwalas!", "Hindi maganda ang serbisyo nila dito.", "Maayos lang ang palabas, walang espesyal.",
|
| 189 |
+
# Dutch
|
| 190 |
+
"Ik ben echt blij met mijn nieuwe aankoop!", "De klantenservice was echt slecht.", "De presentatie was gewoon okรฉ, niet bijzonder.",
|
| 191 |
+
# Malay
|
| 192 |
+
"Saya suka makanan di sini, sangat sedap!", "Pengalaman ini sangat mengecewakan.", "Hari ini cuacanya biasa sahaja.",
|
| 193 |
+
# Korean
|
| 194 |
+
"์ด ๊ฐ๊ฒ์ ์ผ์ดํฌ๋ ์ ๋ง ๋ง์์ด์!", "์๋น์ค๊ฐ ๋๋ฌด ๋ณ๋ก์์ด์.", "๋ ์จ๊ฐ ๊ทธ์ ๊ทธ๋ ๋ค์.",
|
| 195 |
+
# Swiss German
|
| 196 |
+
"Ich find dä Service i de Beiz mega guet!", "Däs Esä het mir nöd gfalle.", "D Wätter hüt isch so naja."
|
| 197 |
+
]
|
| 198 |
+
|
| 199 |
+
for text, sentiment in zip(texts, predict_sentiment(texts)):
|
| 200 |
+
print(f"Text: {text}\nSentiment: {sentiment}\n")
|
| 201 |
```
|
| 202 |
|
| 203 |
+
## Ethical Considerations
|
| 204 |
+
|
| 205 |
+
Synthetic data reduces bias, but validation in real-world scenarios is advised.
|
| 206 |
+
|
| 207 |
+
## Citation
|
| 208 |
+
```bib
|
| 209 |
+
@misc{tabularisai_2025,
|
| 210 |
+
author = { tabularisai and Samuel Gyamfi and Vadim Borisov and Richard H. Schreiber },
|
| 211 |
+
title = { multilingual-sentiment-analysis (Revision 69afb83) },
|
| 212 |
+
year = 2025,
|
| 213 |
+
url = { https://huggingface.co/tabularisai/multilingual-sentiment-analysis },
|
| 214 |
+
doi = { 10.57967/hf/5968 },
|
| 215 |
+
publisher = { Hugging Face }
|
| 216 |
+
}
|
| 217 |
+
```
|
| 218 |
|
| 219 |
+
## Contact
|
| 220 |
+
|
| 221 |
+
For inquiries, data, private APIs, better models, contact info@tabularis.ai
|
| 222 |
+
|
| 223 |
+
tabularis.ai
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
<table align="center">
|
| 227 |
+
<tr>
|
| 228 |
+
<td align="center">
|
| 229 |
+
<a href="https://www.linkedin.com/company/tabularis-ai/">
|
| 230 |
+
<img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/linkedin.svg" alt="LinkedIn" width="30" height="30">
|
| 231 |
+
</a>
|
| 232 |
+
</td>
|
| 233 |
+
<td align="center">
|
| 234 |
+
<a href="https://x.com/tabularis_ai">
|
| 235 |
+
<img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/x.svg" alt="X" width="30" height="30">
|
| 236 |
+
</a>
|
| 237 |
+
</td>
|
| 238 |
+
<td align="center">
|
| 239 |
+
<a href="https://github.com/tabularis-ai">
|
| 240 |
+
<img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/github.svg" alt="GitHub" width="30" height="30">
|
| 241 |
+
</a>
|
| 242 |
+
</td>
|
| 243 |
+
<td align="center">
|
| 244 |
+
<a href="https://tabularis.ai">
|
| 245 |
+
<img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/internetarchive.svg" alt="Website" width="30" height="30">
|
| 246 |
+
</a>
|
| 247 |
+
</td>
|
| 248 |
+
</tr>
|
| 249 |
+
</table>
|
config.json
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "distilbert-base-multilingual-cased",
|
| 3 |
"activation": "gelu",
|
| 4 |
"architectures": [
|
| 5 |
"DistilBertForSequenceClassification"
|
|
@@ -9,15 +8,19 @@
|
|
| 9 |
"dropout": 0.1,
|
| 10 |
"hidden_dim": 3072,
|
| 11 |
"id2label": {
|
| 12 |
-
"0": "
|
| 13 |
-
"1": "
|
| 14 |
-
"2": "
|
|
|
|
|
|
|
| 15 |
},
|
| 16 |
"initializer_range": 0.02,
|
| 17 |
"label2id": {
|
| 18 |
-
"
|
| 19 |
-
"
|
| 20 |
-
"
|
|
|
|
|
|
|
| 21 |
},
|
| 22 |
"max_position_embeddings": 512,
|
| 23 |
"model_type": "distilbert",
|
|
@@ -25,11 +28,12 @@
|
|
| 25 |
"n_layers": 6,
|
| 26 |
"output_past": true,
|
| 27 |
"pad_token_id": 0,
|
|
|
|
| 28 |
"qa_dropout": 0.1,
|
| 29 |
"seq_classif_dropout": 0.2,
|
| 30 |
"sinusoidal_pos_embds": false,
|
| 31 |
"tie_weights_": true,
|
| 32 |
"torch_dtype": "float32",
|
| 33 |
-
"transformers_version": "4.
|
| 34 |
"vocab_size": 119547
|
| 35 |
}
|
|
|
|
| 1 |
{
|
|
|
|
| 2 |
"activation": "gelu",
|
| 3 |
"architectures": [
|
| 4 |
"DistilBertForSequenceClassification"
|
|
|
|
| 8 |
"dropout": 0.1,
|
| 9 |
"hidden_dim": 3072,
|
| 10 |
"id2label": {
|
| 11 |
+
"0": "Very Negative",
|
| 12 |
+
"1": "Negative",
|
| 13 |
+
"2": "Neutral",
|
| 14 |
+
"3": "Positive",
|
| 15 |
+
"4": "Very Positive"
|
| 16 |
},
|
| 17 |
"initializer_range": 0.02,
|
| 18 |
"label2id": {
|
| 19 |
+
"Negative": 1,
|
| 20 |
+
"Neutral": 2,
|
| 21 |
+
"Positive": 3,
|
| 22 |
+
"Very Negative": 0,
|
| 23 |
+
"Very Positive": 4
|
| 24 |
},
|
| 25 |
"max_position_embeddings": 512,
|
| 26 |
"model_type": "distilbert",
|
|
|
|
| 28 |
"n_layers": 6,
|
| 29 |
"output_past": true,
|
| 30 |
"pad_token_id": 0,
|
| 31 |
+
"problem_type": "single_label_classification",
|
| 32 |
"qa_dropout": 0.1,
|
| 33 |
"seq_classif_dropout": 0.2,
|
| 34 |
"sinusoidal_pos_embds": false,
|
| 35 |
"tie_weights_": true,
|
| 36 |
"torch_dtype": "float32",
|
| 37 |
+
"transformers_version": "4.55.0",
|
| 38 |
"vocab_size": 119547
|
| 39 |
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ab3cecb8605da0a240e5b4e18d969704d44e27c6ea48533ef6693d31dbb926a
|
| 3 |
+
size 541326604
|
tokenizer.json
CHANGED
|
@@ -1,19 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
-
"truncation":
|
| 4 |
-
|
| 5 |
-
"max_length": 512,
|
| 6 |
-
"strategy": "LongestFirst",
|
| 7 |
-
"stride": 0
|
| 8 |
-
},
|
| 9 |
-
"padding": {
|
| 10 |
-
"strategy": "BatchLongest",
|
| 11 |
-
"direction": "Right",
|
| 12 |
-
"pad_to_multiple_of": null,
|
| 13 |
-
"pad_id": 0,
|
| 14 |
-
"pad_type_id": 0,
|
| 15 |
-
"pad_token": "[PAD]"
|
| 16 |
-
},
|
| 17 |
"added_tokens": [
|
| 18 |
{
|
| 19 |
"id": 0,
|
|
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
+
"truncation": null,
|
| 4 |
+
"padding": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"added_tokens": [
|
| 6 |
{
|
| 7 |
"id": 0,
|
tokenizer_config.json
CHANGED
|
@@ -1,11 +1,51 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"cls_token": "[CLS]",
|
| 4 |
-
"do_basic_tokenize": true,
|
| 5 |
"do_lower_case": false,
|
| 6 |
"mask_token": "[MASK]",
|
| 7 |
"model_max_length": 512,
|
| 8 |
-
"never_split": null,
|
| 9 |
"pad_token": "[PAD]",
|
| 10 |
"sep_token": "[SEP]",
|
| 11 |
"strip_accents": null,
|
|
|
|
| 1 |
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
"cls_token": "[CLS]",
|
|
|
|
| 46 |
"do_lower_case": false,
|
| 47 |
"mask_token": "[MASK]",
|
| 48 |
"model_max_length": 512,
|
|
|
|
| 49 |
"pad_token": "[PAD]",
|
| 50 |
"sep_token": "[SEP]",
|
| 51 |
"strip_accents": null,
|