Commit
·
949e803
1
Parent(s):
4747a28
Update README
Browse files
README.md
CHANGED
|
@@ -5,8 +5,7 @@ tags:
|
|
| 5 |
- capitalization
|
| 6 |
- punctuation
|
| 7 |
- token-classification
|
| 8 |
-
|
| 9 |
-
license: mit
|
| 10 |
datasets:
|
| 11 |
- oscar-corpus/OSCAR-2109
|
| 12 |
metrics:
|
|
@@ -32,19 +31,16 @@ import os
|
|
| 32 |
import shutil
|
| 33 |
import sys
|
| 34 |
from huggingface_hub import snapshot_download
|
| 35 |
-
|
| 36 |
cache_dir = "./capu"
|
| 37 |
def download_files(repo_id, cache_dir=None, ignore_regex=None):
|
| 38 |
download_dir = snapshot_download(repo_id=repo_id, cache_dir=cache_dir, ignore_regex=ignore_regex)
|
| 39 |
if cache_dir is None or download_dir == cache_dir:
|
| 40 |
return download_dir
|
| 41 |
-
|
| 42 |
file_names = os.listdir(download_dir)
|
| 43 |
for file_name in file_names:
|
| 44 |
shutil.move(os.path.join(download_dir, file_name), cache_dir)
|
| 45 |
os.rmdir(download_dir)
|
| 46 |
return cache_dir
|
| 47 |
-
|
| 48 |
download_files(repo_id="dragonSwing/vibert-capu", cache_dir=cache_dir, ignore_regex=["*.json", "*.bin"])
|
| 49 |
sys.path.append(cache_dir)
|
| 50 |
```
|
|
@@ -66,14 +62,14 @@ model("theo đó thủ tướng dự kiến tiếp bộ trưởng nông nghiệp
|
|
| 66 |
-----------------------------------------------
|
| 67 |
## 📡 Training data
|
| 68 |
Here is the number of text samples we used for fine-tuning the model:
|
| 69 |
-
| Language | Number of text samples|
|
| 70 |
| -------- | ----------------- |
|
| 71 |
-
| Vietnamese | 5,600,000
|
| 72 |
-----------------------------------------------
|
| 73 |
## 🎯 Accuracy
|
| 74 |
Below is a breakdown of the performance of the model by each label on 120,000 held-out text samples:
|
| 75 |
-
| label | precision | recall | f1-score | support|
|
| 76 |
-
| --------- |
|
| 77 |
| **Upper** | 0.88 | 0.89 | 0.89 | 56497 |
|
| 78 |
| **Complex-Upper** | 0.92 | 0.83 | 0.88 | 480 |
|
| 79 |
| **.** | 0.81 | 0.82 | 0.82 | 18139 |
|
|
|
|
| 5 |
- capitalization
|
| 6 |
- punctuation
|
| 7 |
- token-classification
|
| 8 |
+
license: cc-by-sa-4.0
|
|
|
|
| 9 |
datasets:
|
| 10 |
- oscar-corpus/OSCAR-2109
|
| 11 |
metrics:
|
|
|
|
| 31 |
import shutil
|
| 32 |
import sys
|
| 33 |
from huggingface_hub import snapshot_download
|
|
|
|
| 34 |
cache_dir = "./capu"
|
| 35 |
def download_files(repo_id, cache_dir=None, ignore_regex=None):
|
| 36 |
download_dir = snapshot_download(repo_id=repo_id, cache_dir=cache_dir, ignore_regex=ignore_regex)
|
| 37 |
if cache_dir is None or download_dir == cache_dir:
|
| 38 |
return download_dir
|
|
|
|
| 39 |
file_names = os.listdir(download_dir)
|
| 40 |
for file_name in file_names:
|
| 41 |
shutil.move(os.path.join(download_dir, file_name), cache_dir)
|
| 42 |
os.rmdir(download_dir)
|
| 43 |
return cache_dir
|
|
|
|
| 44 |
download_files(repo_id="dragonSwing/vibert-capu", cache_dir=cache_dir, ignore_regex=["*.json", "*.bin"])
|
| 45 |
sys.path.append(cache_dir)
|
| 46 |
```
|
|
|
|
| 62 |
-----------------------------------------------
|
| 63 |
## 📡 Training data
|
| 64 |
Here is the number of text samples we used for fine-tuning the model:
|
| 65 |
+
| Language | Number of text samples |
|
| 66 |
| -------- | ----------------- |
|
| 67 |
+
| Vietnamese | 5,600,000 |
|
| 68 |
-----------------------------------------------
|
| 69 |
## 🎯 Accuracy
|
| 70 |
Below is a breakdown of the performance of the model by each label on 120,000 held-out text samples:
|
| 71 |
+
| label | precision | recall | f1-score | support |
|
| 72 |
+
| --------- | ------------- | -------- | ---------- | -------- |
|
| 73 |
| **Upper** | 0.88 | 0.89 | 0.89 | 56497 |
|
| 74 |
| **Complex-Upper** | 0.92 | 0.83 | 0.88 | 480 |
|
| 75 |
| **.** | 0.81 | 0.82 | 0.82 | 18139 |
|