Spaces:
Runtime error
Runtime error
Add files
Browse files- test_run.py +181 -0
test_run.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# !pip install gr-nlp-toolkit
|
| 2 |
+
|
| 3 |
+
from gr_nlp_toolkit import Pipeline
|
| 4 |
+
|
| 5 |
+
# Instantiate the Pipeline
|
| 6 |
+
# Shared pipeline instance used by every helper function in this file. The
# processor string requests POS tagging ("pos"), named-entity recognition
# ("ner"), dependency parsing ("dp") and Greeklish-to-Greek conversion ("g2g").
# NOTE(review): presumably this loads (and possibly downloads) the models at
# import time — confirm before importing this module in latency-sensitive code.
nlp_pos_ner_dp_with_g2g = Pipeline("pos,ner,dp,g2g")
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def greeklish_to_greek(text: str) -> str:
    """
    Transliterate Greeklish (Greek typed with Latin characters) into Greek.

    The text is run through the shared pipeline and the resulting tokens'
    text is stitched back together with single spaces.

    Args:
        text (str): The Greeklish text to convert, e.g. "larisa".

    Returns:
        str: The converted tokens joined by one space each, e.g. "λαρισα".

    Examples:
        >>> greeklish_to_greek("H thessaloniki einai wraia polh")
        'η θεσσαλονικη ειναι ωραια πολη'
    """
    processed = nlp_pos_ner_dp_with_g2g(text)
    words = [tok.text for tok in processed.tokens]
    return " ".join(words)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def process_ner(text: str) -> dict:
    """
    Run the shared pipeline on ``text`` and collect each token's NER label.

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to its NER label, in order of first
        appearance.

    Note:
        The result is keyed by token text, so a word that occurs more than
        once yields a single entry holding the label of its LAST occurrence.

    Examples:
        Repeated words ('το' below) appear only once in the result.

        >>> process_ner("Η Αργεντινή κέρδισε το Παγκόσμιο Κύπελλο το 2022")
        {
            'η': 'O',
            'αργεντινη': 'S-ORG',
            'κερδισε': 'O',
            'το': 'O',
            'παγκοσμιο': 'B-EVENT',
            'κυπελλο': 'E-EVENT',
            '2022': 'S-DATE'
        }

    Possible NER labels:
        ner_labels = [
            'O', 'S-GPE', 'S-ORG', 'S-CARDINAL', 'B-ORG', 'E-ORG', 'B-DATE', 'E-DATE', 'S-NORP',
            'B-GPE', 'E-GPE', 'S-EVENT', 'S-DATE', 'S-PRODUCT', 'S-LOC', 'I-ORG', 'S-PERSON',
            'S-ORDINAL', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'B-LAW', 'I-LAW', 'E-LAW', 'B-MONEY',
            'I-MONEY', 'E-MONEY', 'B-EVENT', 'I-EVENT', 'E-EVENT', 'B-FAC', 'E-FAC', 'I-DATE',
            'S-PERCENT', 'B-QUANTITY', 'E-QUANTITY', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'E-WORK_OF_ART',
            'I-FAC', 'S-LAW', 'S-TIME', 'B-LOC', 'E-LOC', 'I-LOC', 'S-FAC', 'B-TIME', 'E-TIME',
            'S-WORK_OF_ART', 'B-PRODUCT', 'E-PRODUCT', 'B-CARDINAL', 'E-CARDINAL', 'S-MONEY',
            'S-LANGUAGE', 'I-TIME', 'I-PRODUCT', 'I-GPE', 'I-QUANTITY', 'B-NORP', 'E-NORP',
            'S-QUANTITY', 'B-PERCENT', 'I-PERCENT', 'E-PERCENT', 'I-CARDINAL', 'B-ORDINAL',
            'I-ORDINAL', 'E-ORDINAL'
        ]
    """
    labels_by_word = {}
    for tok in nlp_pos_ner_dp_with_g2g(text).tokens:
        # Read key before value, mirroring a dict comprehension's evaluation order.
        word, label = tok.text, tok.ner
        labels_by_word[word] = label
    return labels_by_word
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def process_pos(text: str) -> dict:
    """
    Run the shared pipeline on ``text`` and collect Part-of-Speech information
    (UPOS tag and morphological features) for each token.

    UPOS tags (see https://universaldependencies.org/u/pos/ and
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py):
        ADJ: adjective
        ADP: adposition
        ADV: adverb
        AUX: auxiliary
        CCONJ: coordinating conjunction
        DET: determiner
        INTJ: interjection
        NOUN: noun
        NUM: numeral
        PART: particle
        PRON: pronoun
        PROPN: proper noun
        PUNCT: punctuation
        SCONJ: subordinating conjunction
        SYM: symbol
        VERB: verb
        X: other

    Morphological features (the complete list is in
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py).
    Because there are many, only the most common ones are listed here:
        - Aspect
        - Case
        - Definite
        - Mood
        - Number
        - Person
        - PronType
        - Tense
        - Gender
        - VerbForm
        - Voice

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to a dictionary with two keys:
        'UPOS' (the token's ``upos`` value) and 'Morphological_Features'
        (the token's ``feats`` value).

    Note:
        The result is keyed by token text, so a word that occurs more than
        once yields a single entry holding the data of its LAST occurrence.

    Examples:
        >>> process_pos("Μου αρέσει να διαβάζω τα post του Andrew Ng στο Twitter.")
        {
            'μου': {'UPOS': 'PRON', 'Morphological_Features': {'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '1', 'Poss': '_', 'PronType': 'Prs'}},
            'αρεσει': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'να': {'UPOS': 'AUX', 'Morphological_Features': {'Aspect': '_', 'Mood': '_', 'Number': '_', 'Person': '_', 'Tense': '_', 'VerbForm': '_', 'Voice': '_'}},
            'διαβαζω': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'τα': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Acc', 'Definite': 'Def', 'Gender': 'Neut', 'Number': 'Plur', 'PronType': 'Art'}},
            'post': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'του': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Gen', 'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', 'PronType': 'Art'}},
            'andrew': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'ng': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'στο': {'UPOS': '_', 'Morphological_Features': {}},
            'twitter': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            '.': {'UPOS': 'PUNCT', 'Morphological_Features': {}}
        }
    """
    analysis = nlp_pos_ner_dp_with_g2g(text)
    tagged = {}
    for tok in analysis.tokens:
        word = tok.text
        tagged[word] = {"UPOS": tok.upos, "Morphological_Features": tok.feats}
    return tagged
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def process_dp(text: str) -> dict:
    """
    Run the shared pipeline on ``text`` and collect Dependency Parsing
    information (syntactic head and dependency relation) for each token.

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to a dictionary containing:
            - 'Head': The position of the token's syntactic head
              (0 indicates the root).
            - 'Deprel': The dependency relation to the head.

    Note:
        The result is keyed by token text, so a word that occurs more than
        once yields a single entry holding the data of its LAST occurrence.
        'Head' values refer to token positions, which cannot be recovered
        from the dictionary alone when words repeat.

    Examples:
        'την' occurs twice in the sentence below; only its last occurrence
        is kept.

        >>> process_dp("Προτιμώ την πρωινή πτήση από την Αθήνα στη Θεσσαλονίκη.")
        {
            'προτιμω': {'Head': 0, 'Deprel': 'root'},
            'την': {'Head': 7, 'Deprel': 'det'},
            'πρωινη': {'Head': 4, 'Deprel': 'amod'},
            'πτηση': {'Head': 1, 'Deprel': 'obj'},
            'απο': {'Head': 7, 'Deprel': 'case'},
            'αθηνα': {'Head': 4, 'Deprel': 'nmod'},
            'στη': {'Head': 9, 'Deprel': 'case'},
            'θεσσαλονικη': {'Head': 4, 'Deprel': 'nmod'},
            '.': {'Head': 1, 'Deprel': 'punct'}
        }

    Possible dependency relation labels:
        dp_labels = [
            'obl', 'obj', 'dep', 'mark', 'case', 'flat', 'nummod', 'obl:arg', 'punct', 'cop',
            'acl:relcl', 'expl', 'nsubj', 'csubj:pass', 'root', 'advmod', 'nsubj:pass', 'ccomp',
            'conj', 'amod', 'xcomp', 'aux', 'appos', 'csubj', 'fixed', 'nmod', 'iobj', 'parataxis',
            'orphan', 'det', 'advcl', 'vocative', 'compound', 'cc', 'discourse', 'acl', 'obl:agent'
        ]
    """
    parsed = nlp_pos_ner_dp_with_g2g(text)
    # dict() over (key, value) pairs keeps first-seen key order and lets later
    # duplicates overwrite earlier ones, exactly like a dict comprehension.
    return dict(
        (tok.text, {"Head": tok.head, "Deprel": tok.deprel})
        for tok in parsed.tokens
    )
|