initial commit: add model
Browse files- README.md +68 -0
- config.json +40 -0
- pytorch_model.bin +3 -0
README.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# 📝 Usage
|
| 3 |
+
|
| 4 |
+
This model is a fine-tuned version of **FlauBERT** that classifies French texts into the following categories:
|
| 5 |
+
|
| 6 |
+
> **CULTURE**, **DEBATS_ET_OPINIONS**, **ECONOMIE**, **EDUCATION**, **FAIT_DIVERS**, **INTERNATIONAL**, **LIFESTYLE**, **NUMERIQUE**, **POLITIQUE**, **RELIGION**, **SANTE**, **SCIENCE_ET_ENVIRONNEMENT**, **SOCIETE**, **SPORT**, **INDEFINI**
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## 🚀 Quick Start
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from transformers import AutoModelForSequenceClassification
|
| 14 |
+
|
| 15 |
+
model = AutoModelForSequenceClassification.from_pretrained("juenp/FrenchTextCategorizer")
|
| 16 |
+
model.eval()
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 🔎 Full Example (with Tokenizer, Prediction and Probabilities)
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
| 25 |
+
import torch
|
| 26 |
+
import torch.nn.functional as F
|
| 27 |
+
|
| 28 |
+
# Load model and tokenizer
|
| 29 |
+
model = AutoModelForSequenceClassification.from_pretrained("juenp/FrenchTextCategorizer")
|
| 30 |
+
tokenizer = AutoTokenizer.from_pretrained("juenp/FrenchTextCategorizer")
|
| 31 |
+
model.eval()
|
| 32 |
+
|
| 33 |
+
# Input text
|
| 34 |
+
text = "Ce film est un chef-d'œuvre incroyable, tout était parfait."
|
| 35 |
+
|
| 36 |
+
# Tokenize
|
| 37 |
+
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
|
| 38 |
+
inputs.pop("token_type_ids", None)
|
| 39 |
+
|
| 40 |
+
# Predict
|
| 41 |
+
with torch.no_grad():
|
| 42 |
+
    outputs = model(**inputs)
|
| 43 |
+
|
| 44 |
+
logits = outputs.logits
|
| 45 |
+
probs = F.softmax(logits, dim=-1)
|
| 46 |
+
predicted_class_idx = torch.argmax(probs, dim=-1).item()
|
| 47 |
+
|
| 48 |
+
# Decode predicted class from config
|
| 49 |
+
predicted_class = model.config.id2label[predicted_class_idx]
|
| 50 |
+
prob_percentages = [round(p.item() * 100, 2) for p in probs[0]]
|
| 51 |
+
|
| 52 |
+
# Output
|
| 53 |
+
print(f"Texte : {text}")
|
| 54 |
+
print(f"Classe prédite : {predicted_class}")
|
| 55 |
+
print(f"Probabilités (%) : {prob_percentages}")
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
# 📋 Notes
|
| 61 |
+
|
| 62 |
+
- `model.config.id2label` is automatically loaded from the model's configuration (`config.json`).
|
| 63 |
+
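- To process multiple texts at once, pass a list of texts to the tokenizer. The example above extracts a single prediction (`.item()` and `probs[0]`), so for a batch use `torch.argmax(probs, dim=-1).tolist()` and iterate over the rows of `probs`.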
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
# ✅ Ready for Inference!
|
| 68 |
+
|
config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": ["FlaubertForSequenceClassification"],
|
| 3 |
+
"model_type": "flaubert",
|
| 4 |
+
"num_labels": 15,
|
| 5 |
+
"id2label": {
|
| 6 |
+
"0": "CULTURE",
|
| 7 |
+
"1": "DEBATS_ET_OPINIONS",
|
| 8 |
+
"2": "ECONOMIE",
|
| 9 |
+
"3": "EDUCATION",
|
| 10 |
+
"4": "FAIT_DIVERS",
|
| 11 |
+
"5": "INTERNATIONAL",
|
| 12 |
+
"6": "LIFESTYLE",
|
| 13 |
+
"7": "NUMERIQUE",
|
| 14 |
+
"8": "POLITIQUE",
|
| 15 |
+
"9": "RELIGION",
|
| 16 |
+
"10": "SANTE",
|
| 17 |
+
"11": "SCIENCE_ET_ENVIRONNEMENT",
|
| 18 |
+
"12": "SOCIETE",
|
| 19 |
+
"13": "SPORT",
|
| 20 |
+
"14": "INDEFINI"
|
| 21 |
+
},
|
| 22 |
+
"label2id": {
|
| 23 |
+
"CULTURE": 0,
|
| 24 |
+
"DEBATS_ET_OPINIONS": 1,
|
| 25 |
+
"ECONOMIE": 2,
|
| 26 |
+
"EDUCATION": 3,
|
| 27 |
+
"FAIT_DIVERS": 4,
|
| 28 |
+
"INTERNATIONAL": 5,
|
| 29 |
+
"LIFESTYLE": 6,
|
| 30 |
+
"NUMERIQUE": 7,
|
| 31 |
+
"POLITIQUE": 8,
|
| 32 |
+
"RELIGION": 9,
|
| 33 |
+
"SANTE": 10,
|
| 34 |
+
"SCIENCE_ET_ENVIRONNEMENT": 11,
|
| 35 |
+
"SOCIETE": 12,
|
| 36 |
+
"SPORT": 13,
|
| 37 |
+
"INDEFINI": 14
|
| 38 |
+
}
|
| 39 |
+
}
|
| 40 |
+
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d02f23076b19327716955de2f53867228cbb967551fa0005eb5d365a87284af9
|
| 3 |
+
size 553806078
|