Bohui Zhang committed on
Commit
4389c07
·
1 Parent(s): 4975487

Adjust parsers and prefix mapping

Browse files
Files changed (1) hide show
  1. esgen/utils.py +116 -18
esgen/utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import requests
2
 
3
  from esgen.config import N_SEARCH_RESULTS
@@ -26,7 +27,7 @@ def wikidata_api_search(inputs: str = " ", search_type: str = "item") -> list:
26
  choices = list()
27
  for item in data:
28
  try:
29
- name = f"{item['display']['label']['value']} ({item['id']}) {item['display']['description']['value']}"
30
  value = f"{item['id']}"
31
  # (name: the displayed name of the checkbox button, value: the value to be passed to the function)
32
  choices.append((name, value))
@@ -43,28 +44,125 @@ def unify_comment(comment: str) -> str:
43
  :param comment:
44
  :return:
45
  """
46
- if comment.startswith(" #"):
47
- return comment
48
- elif comment.startswith("#"):
49
- return " " + comment
50
- elif comment.startswith(" "):
51
- return " #" + comment
 
 
 
 
 
 
 
 
 
 
52
  else:
53
- return " # " + comment
 
 
54
 
55
 
56
- # TODO: INCOMPLETE
57
- def prefix_mapping(text: str, prefix: dict, prefix_type: str) -> str:
58
  """
 
 
 
 
 
 
 
 
 
 
 
59
 
60
- :param text:
61
- :param prefix:
62
- :param prefix_type:
63
  :return:
64
  """
65
- if prefix_type == "property":
66
- return "wdt:{}".format(text)
67
- elif prefix_type == "class":
68
- return "wd:{}".format(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  else:
70
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
  import requests
3
 
4
  from esgen.config import N_SEARCH_RESULTS
 
27
  choices = list()
28
  for item in data:
29
  try:
30
+ name = f"{item['display']['label']['value']} ({item['id']}): {item['display']['description']['value']}"
31
  value = f"{item['id']}"
32
  # (name: the displayed name of the checkbox button, value: the value to be passed to the function)
33
  choices.append((name, value))
 
44
  :param comment:
45
  :return:
46
  """
47
+ return comment.strip().lstrip("#").strip()
48
+
49
+
50
def prefix_mapping(prefix_mappings, entity_string):
    """
    Abbreviate an entity string into its prefixed form.

    Strings already starting with ``wdt:`` or ``wd:`` are returned unchanged.
    A bare ``P<digits>`` id becomes ``wdt:P...`` and a bare ``Q<digits>`` id
    becomes ``wd:Q...``.  Anything else has every namespace IRI found in
    ``prefix_mappings`` replaced by ``<abbr>:``.

    :param prefix_mappings: dict mapping prefix abbreviation -> namespace IRI
    :param entity_string: the entity id, prefixed name or IRI to abbreviate
    :return: the abbreviated entity string
    """
    if entity_string.startswith(("wdt:", "wd:")):
        return entity_string
    # fullmatch: only a complete id qualifies, "P31x" is not a property id
    if re.fullmatch(r"P[0-9]+", entity_string):
        return f"wdt:{entity_string}"
    if re.fullmatch(r"Q[0-9]+", entity_string):
        return f"wd:{entity_string}"

    # Replace the longest namespace first so that a namespace which is itself
    # the beginning of another one (".../prop/" vs ".../prop/direct/") cannot
    # consume part of the longer IRI.
    longest_first = sorted(
        prefix_mappings.items(), key=lambda pair: len(pair[1]), reverse=True
    )
    for abbr, prefix in longest_first:
        # str.replace("", x) would insert x between every character
        if prefix:
            entity_string = entity_string.replace(prefix, abbr + ":")
    return entity_string
67
 
68
 
69
+ # parsers
70
def shex_blocks_parser(inputs: str) -> list:
    """
    Extract the text between every ``{`` and the next ``}``.

    assumptions:
    only has patterns like:
    ```
    <a> {
      ... (no brackets inside)
    }

    <b> {
      ... (no brackets inside)
    }
    ```
    A ``}`` inside a block (for example a ``{m,n}`` cardinality) ends that
    block early, because each match stops at the first closing brace.

    :param inputs: ShEx text containing zero or more ``{ ... }`` blocks
    :return: list of block bodies in order of appearance, braces excluded and
        inner whitespace preserved
    """
    # DOTALL lets "." cover every character, including carriage returns, so
    # blocks in CRLF input are no longer silently skipped.
    return re.findall(r"\{(.*?)\}", inputs, flags=re.DOTALL)
90
+
91
+
92
def prefix_parser(inputs: str) -> tuple:
    """
    Split one prefix declaration into its abbreviation and namespace IRI.

    case: PREFIX wd: <http://www.wikidata.org/entity/>

    :param inputs: a single prefix declaration, with or without the leading
        ``PREFIX`` keyword (matched case-insensitively)
    :return: tuple ``(prefix_abbr, prefix_url)``; the IRI is returned without
        its surrounding angle brackets
    :raises ValueError: if the declaration contains no ``:``
    """
    # Drop only the leading keyword; a "PREFIX" occurring later (e.g. inside
    # the IRI) is part of the data and must be kept.
    declaration = re.sub(r"^\s*PREFIX\b", "", inputs, count=1, flags=re.IGNORECASE)
    # The first ":" ends the abbreviation; later colons belong to the IRI.
    prefix_abbr, colon, prefix_url = declaration.partition(":")
    if not colon:
        raise ValueError(f"no ':' in prefix declaration: {inputs!r}")
    prefix_url = prefix_url.strip().lstrip("<").rstrip(">")
    return prefix_abbr.strip(), prefix_url
103
+
104
+
105
def value_shape_name_parser(inputs: str) -> list:
    """
    Collect the shape labels that open a line, e.g. ``<human> {``.

    Only lines whose first character is ``<`` are considered, so indented
    ``@<shape>`` references and ``start = @<shape>`` lines are ignored.

    :param inputs: ShEx text, one statement per line
    :return: list of labels (without the angle brackets) in order of
        appearance; empty if no line starts with a label
    """
    value_shape_names = []
    for line in inputs.split("\n"):
        # Anchored, non-greedy on brackets: take just the leading <label>
        # even if more "<...>" follow on the same line.
        label = re.match(r"<([^<>]+)>", line)
        if label:
            value_shape_names.append(label.group(1))
    return value_shape_names
123
+
124
+
125
def triple_constraint_parser(inputs: str) -> tuple:
    """
    Parse one triple constraint line into its components.

    case: wdt:P31 [ wd:Q5] ;

    The first ``#`` on the line starts the comment; property, value and
    cardinality are read only from the text before it, so punctuation inside
    the comment cannot be mistaken for a wildcard or a cardinality.

    :param inputs: a single constraint line, optionally ending in ``;`` and/or
        a ``# comment``
    :return: tuple ``(prop, value, cardinality, comment, constraint_type)``.
        ``value`` is ``"."`` for a wildcard, a list of strings for a value set
        ``[ ... ]``, the shape label for ``@<label>`` (``constraint_type`` is
        then ``"value_shape"``, otherwise ``"triple"``), or else the second
        token. ``cardinality`` is ``"?"``, ``"*"``, ``"+"``, a ``{...}``
        range, or ``""``.
    :raises IndexError: if the line has no property, or no value token
    """
    text = inputs.strip().rstrip(";").strip()
    text = re.sub(r"\s+", " ", text)

    # comment
    body, hash_mark, remark = text.partition("#")
    comment = unify_comment(hash_mark + remark) if hash_mark else ""
    # the ";" terminator may sit between the constraint and the comment
    body = body.strip().rstrip(";").strip()

    # constraint type
    constraint_type = "triple"
    # prop
    tokens = body.split()
    prop = tokens[0]
    # value
    value_set = re.search(r"\[([^\[\]]*)\]", body)
    shape_ref = re.search(r"@<([^<>]+)>", body)
    if "." in tokens[1:]:
        value = "."
    elif value_set:  # value sets
        value = value_set.group(1).split()  # list of values
    elif shape_ref:  # value shape
        value = shape_ref.group(1)
        constraint_type = "value_shape"
    else:
        value = tokens[1]

    # cardinality: look only at what follows the property, ignoring value
    # sets and IRIs, whose contents may legitimately contain ? * + { }
    tail = re.sub(r"\[[^\[\]]*\]|<[^<>]*>", " ", body[len(prop):])
    for mark in ("?", "*", "+"):
        if mark in tail:
            cardinality = mark
            break
    else:
        repeat = re.search(r"\{[^{}]+\}", tail)
        cardinality = repeat.group(0) if repeat else ""

    return prop, value, cardinality, comment, constraint_type