Create README.md
Browse files
README.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- ru
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
RUPunct_small — самая маленькая модель из семейства RUPunct. Идеально подходит для несложных текстов и там, где требуется высокая скорость работы на CPU.
|
| 8 |
+
|
| 9 |
+
Код инференса:
|
| 10 |
+
```py
|
| 11 |
+
from transformers import pipeline
|
| 12 |
+
from transformers import AutoTokenizer
|
| 13 |
+
|
| 14 |
+
# Hugging Face model id; the same id is used for the tokenizer and the model.
pt = "RUPunct/RUPunct_small"

# Tokenizer is loaded with accent stripping disabled and prefix-space enabled.
tk = AutoTokenizer.from_pretrained(
    pt,
    strip_accents=False,
    add_prefix_space=True,
)

# Token-classification ("ner") pipeline. With aggregation_strategy="first",
# the transformers docs say each word takes the label of its first sub-token,
# so every result item carries one "word" and one "entity_group".
classifier = pipeline(
    "ner",
    model=pt,
    tokenizer=tk,
    aggregation_strategy="first",
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Punctuation appended to the token, keyed by the label's last component
# (the part after the final underscore).
_PUNCTUATION_BY_SUFFIX = {
    "O": "",
    "PERIOD": ".",
    "COMMA": ",",
    "QUESTION": "?",
    "TIRE": " —",
    "DVOETOCHIE": ":",
    "VOSKL": "!",
    "PERIODCOMMA": ";",
    "DEFIS": "-",
    "MNOGOTOCHIE": "...",
    "QUESTIONVOSKL": "?!",
}

# Case transformation, keyed by everything before the label's final underscore.
_CASE_BY_PREFIX = {
    "LOWER": lambda word: word,
    "UPPER": str.capitalize,
    "UPPER_TOTAL": str.upper,
}


def process_token(token, label):
    """Apply the casing and punctuation encoded in a model label to a token.

    A label has the form ``<CASE>_<PUNCT>``, for example ``LOWER_COMMA`` or
    ``UPPER_TOTAL_QUESTION``:

    * ``LOWER`` leaves the token untouched, ``UPPER`` applies
      ``str.capitalize`` (first character upper-cased, the rest lower-cased)
      and ``UPPER_TOTAL`` applies ``str.upper``.
    * ``<PUNCT>`` selects the text appended after the token; ``O`` appends
      nothing. The dash (``TIRE``) is appended with a leading space for every
      case variant.

    Args:
        token: The word to transform.
        label: The predicted label for the word.

    Returns:
        The transformed token. If the label is not one of the known
        ``<CASE>_<PUNCT>`` combinations (for example a bare ``"O"``), the
        token is returned unchanged rather than ``None``, so the result can
        always be concatenated into the output string.
    """
    # rpartition splits on the LAST underscore, which keeps the two-word
    # prefix "UPPER_TOTAL" intact.
    case_name, _, punct_name = label.rpartition("_")
    recase = _CASE_BY_PREFIX.get(case_name)
    punctuation = _PUNCTUATION_BY_SUFFIX.get(punct_name)
    if recase is None or punctuation is None:
        return token
    return recase(token) + punctuation
|
| 87 |
+
|
| 88 |
+
# Interactive demo: read a line, restore its casing and punctuation, print it.
while True:
    try:
        input_text = input(":> ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session instead of dumping a traceback.
        break

    pieces = []
    for item in classifier(input_text):
        word = item["word"].strip()
        if item["word"] == ".":
            # A period that is already in the input is passed through as-is;
            # no label is applied to it.
            pieces.append(word)
            continue
        restored = process_token(word, item["entity_group"])
        # Guard against a None result so an unexpected label cannot break
        # the string concatenation below.
        pieces.append(word if restored is None else restored)

    # Each piece is prefixed with a single space, which keeps the printed
    # format identical to building the string with `output += " " + piece`.
    print(">>>", "".join(" " + piece for piece in pieces))
|
| 97 |
+
```
|