Bohui Zhang committed on
Commit ·
4389c07
1
Parent(s): 4975487
Adjust parsers and prefix mapping
Browse files- esgen/utils.py +116 -18
esgen/utils.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import requests
|
| 2 |
|
| 3 |
from esgen.config import N_SEARCH_RESULTS
|
|
@@ -26,7 +27,7 @@ def wikidata_api_search(inputs: str = " ", search_type: str = "item") -> list:
|
|
| 26 |
choices = list()
|
| 27 |
for item in data:
|
| 28 |
try:
|
| 29 |
-
name = f"{item['display']['label']['value']} ({item['id']}) {item['display']['description']['value']}"
|
| 30 |
value = f"{item['id']}"
|
| 31 |
# (name: the displayed name of the checkbox button, value: the value to be passed to the function)
|
| 32 |
choices.append((name, value))
|
|
@@ -43,28 +44,125 @@ def unify_comment(comment: str) -> str:
|
|
| 43 |
:param comment:
|
| 44 |
:return:
|
| 45 |
"""
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
else:
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
def
|
| 58 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
:param
|
| 61 |
-
:param prefix:
|
| 62 |
-
:param prefix_type:
|
| 63 |
:return:
|
| 64 |
"""
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
else:
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
import requests
|
| 3 |
|
| 4 |
from esgen.config import N_SEARCH_RESULTS
|
|
|
|
| 27 |
choices = list()
|
| 28 |
for item in data:
|
| 29 |
try:
|
| 30 |
+
name = f"{item['display']['label']['value']} ({item['id']}): {item['display']['description']['value']}"
|
| 31 |
value = f"{item['id']}"
|
| 32 |
# (name: the displayed name of the checkbox button, value: the value to be passed to the function)
|
| 33 |
choices.append((name, value))
|
|
|
|
| 44 |
:param comment:
|
| 45 |
:return:
|
| 46 |
"""
|
| 47 |
+
return comment.strip().lstrip("#").strip()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def prefix_mapping(prefix_mappings, entity_string):
    """
    Abbreviate an entity string using Wikidata prefixes or the given mappings.

    :param prefix_mappings: dict mapping a prefix abbreviation (without the
        trailing colon) to the namespace URL it stands for
    :param entity_string: entity identifier, prefixed name, or full URL
    :return: ``entity_string`` unchanged if it already starts with ``wdt:`` or
        ``wd:``; ``wdt:``/``wd:`` prepended if it starts with ``P<digits>`` /
        ``Q<digits>``; otherwise the string with every known namespace URL
        replaced by ``<abbr>:``
    """
    # Already carries a Wikidata prefix: nothing to do.
    if entity_string.startswith(("wdt:", "wd:")):
        return entity_string
    # Bare Wikidata IDs; only the start of the string is checked.
    if re.match(r"P[0-9]+", entity_string):
        return f"wdt:{entity_string}"
    if re.match(r"Q[0-9]+", entity_string):
        return f"wd:{entity_string}"

    # Apply the longest namespace URLs first so that a short namespace that is
    # a leading part of a longer one (e.g. ".../" vs ".../entity/") cannot
    # shadow the more specific mapping, whatever the dict order is.
    by_specificity = sorted(
        prefix_mappings.items(), key=lambda pair: len(pair[1]), reverse=True
    )
    for abbr, prefix in by_specificity:
        # str.replace("", x) would insert x between every character.
        if prefix:
            entity_string = entity_string.replace(prefix, abbr + ":")
    return entity_string
|
| 67 |
|
| 68 |
|
| 69 |
+
# parsers
|
| 70 |
+
def shex_blocks_parser(inputs: str) -> list:
    """
    Extract the text between each ``{`` and the next ``}``.

    assumptions:
    only has patterns like:
    ```
    <a> {
        ... (no brackets inside)
    }

    <b> {
        ... (no brackets inside)
    }
    ```

    :param inputs: ShEx text containing zero or more ``{ ... }`` blocks
    :return: list with the raw content of every block (braces excluded), in
        order of appearance; empty list when no block is found
    """
    # DOTALL lets "." span every character, including "\r" (CRLF input) and
    # other whitespace that an explicit character class would miss and thereby
    # drop the whole block. The match stays non-greedy so it stops at the
    # first closing brace.
    return re.findall(r"\{(.*?)\}", inputs, flags=re.DOTALL)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def prefix_parser(inputs: str) -> tuple:
    """
    Split a prefix declaration into its abbreviation and namespace URL.

    Example: ``PREFIX wd: <http://www.wikidata.org/entity/>`` gives
    ``("wd", "http://www.wikidata.org/entity/")``.

    :param inputs: one prefix declaration line
    :return: tuple ``(prefix_abbr, prefix_url)``; the URL has its surrounding
        angle brackets removed
    :raises ValueError: if the declaration contains no ``:``
    """
    # Drop only the leading keyword (in any letter case); a blanket
    # str.replace would also delete "PREFIX" from inside the URL.
    declaration = re.sub(
        r"^\s*PREFIX\b", "", inputs, count=1, flags=re.IGNORECASE
    )
    # The first colon separates the abbreviation from the URL.
    prefix_abbr, colon, prefix_url = declaration.partition(":")
    if not colon:
        raise ValueError(f"no ':' found in prefix declaration: {inputs!r}")
    return prefix_abbr.strip(), prefix_url.strip().lstrip("<").rstrip(">")
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def value_shape_name_parser(inputs: str) -> list:
    """
    Collect the names of the shapes declared in a ShEx text.

    A shape declaration is recognised as a line that starts with ``<name>``.

    :param inputs: ShEx text, one statement per line
    :return: list of shape names (without angle brackets), in order of
        appearance; at most one name per line
    """
    value_shape_names = []
    for line in inputs.split("\n"):
        # Only the leading "<name>" is taken. Excluding "<" and ">" from the
        # name keeps the match from running on to a later ">" on the same
        # line (e.g. an inline "@<other>" reference).
        declared = re.match(r"<([^<>]+)>", line)
        if declared:
            value_shape_names.append(declared.group(1))
    return value_shape_names
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _comment_start(line):
    """Return the index of the first ``#`` outside ``<...>``, or -1 if none."""
    inside_iri = False
    for index, char in enumerate(line):
        if char == "<":
            inside_iri = True
        elif char == ">":
            inside_iri = False
        elif char == "#" and not inside_iri:
            return index
    return -1


def triple_constraint_parser(inputs: str) -> tuple:
    """
    Parse one ShEx triple constraint line into its components.

    case: wdt:P31 [ wd:Q5] ;

    :param inputs: a single triple constraint, optionally followed by ``;``
        and/or a ``# comment``
    :return: tuple ``(prop, value, cardinality, comment, constraint_type)``.
        ``value`` is ``"."`` for a wildcard, a list of strings for a value
        set, the shape name for an ``@<shape>`` reference, or otherwise the
        second token of the line. ``cardinality`` is ``"?"``, ``"*"``,
        ``"+"``, a ``{...}`` range, or ``""``. ``constraint_type`` is
        ``"value_shape"`` for shape references and ``"triple"`` otherwise.
        Missing parts (e.g. empty input) are returned as ``""``.
    """
    inputs = inputs.strip().rstrip(";").strip()
    inputs = re.sub(r"\s+", " ", inputs)

    # Separate the trailing comment first so that its text can never be
    # mistaken for a value or a cardinality marker. A "#" inside an IRI
    # (<...#fragment>) is not a comment start.
    hash_index = _comment_start(inputs)
    if hash_index >= 0:
        comment = unify_comment(inputs[hash_index:])
        body = inputs[:hash_index].strip().rstrip(";").strip()
    else:
        comment = ""
        body = inputs

    tokens = body.split()
    constraint_type = "triple"
    # prop
    prop = tokens[0] if tokens else ""

    # value
    value_set = re.search(r"\[([^\[\]]*)\]", body)
    value_shape = re.search(r"@<([^<>]+)>", body)
    if "." in tokens[1:]:
        value = "."
    elif value_set:  # value sets
        value = value_set.group(1).split()  # list of values
    elif value_shape:  # value shape
        value = value_shape.group(1)
        constraint_type = "value_shape"
    else:
        value = tokens[1] if len(tokens) > 1 else ""

    # cardinality: ignore value-set contents and IRIs, where "?", "*" or "+"
    # are ordinary characters (e.g. a URL query string).
    scan = re.sub(r"\[[^\[\]]*\]|<[^<>]*>", " ", body)
    if "?" in scan:
        cardinality = "?"
    elif "*" in scan:
        cardinality = "*"
    elif "+" in scan:
        cardinality = "+"
    else:
        repeat_range = re.search(r"\{[^{}]+\}", scan)
        cardinality = repeat_range.group(0) if repeat_range else ""

    return prop, value, cardinality, comment, constraint_type
|