Bohui Zhang committed on
Commit
1c5a4fe
·
1 Parent(s): 5f6f3a8

Upload the second version

Browse files
Files changed (4) hide show
  1. esgen/functions.py +11 -3
  2. esgen/model.py +27 -6
  3. esgen/queries.py +7 -4
  4. esgen/verbalizer.py +5 -2
esgen/functions.py CHANGED
@@ -9,16 +9,17 @@ from esgen.verbalizer import chatbot_verbaliser, init_comment_verbaliser
9
  from esgen.utils import wikidata_api_search
10
 
11
 
12
- def entity_schema_generation(name, class_id, threshold):
13
  """
14
 
15
  :param name:
16
  :param class_id:
17
  :param threshold:
 
18
  :return:
19
  """
20
  try:
21
- es = EntitySchema(name, class_id, threshold / 100)
22
  es.properties, es.pending = es.generate_initial_schema()
23
  prop = es.pending[0]
24
  prop["examples"] = collect_examples(es.class_id, prop["id"])
@@ -91,7 +92,14 @@ def input_no(es_json, history):
91
  es.load_es_json(es_json)
92
  return str(es), es.__json__(), history
93
  # add the pending property to rejected properties
94
- es_json["rejected"].append(es_json["pending"].pop(0))
 
 
 
 
 
 
 
95
  es = EntitySchema(name="", class_id="")
96
  es.load_es_json(es_json)
97
  try:
 
9
  from esgen.utils import wikidata_api_search
10
 
11
 
12
+ def entity_schema_generation(name, class_id, threshold, property_types):
13
  """
14
 
15
  :param name:
16
  :param class_id:
17
  :param threshold:
18
+ :param property_types:
19
  :return:
20
  """
21
  try:
22
+ es = EntitySchema(name, class_id, threshold / 100, property_types)
23
  es.properties, es.pending = es.generate_initial_schema()
24
  prop = es.pending[0]
25
  prop["examples"] = collect_examples(es.class_id, prop["id"])
 
92
  es.load_es_json(es_json)
93
  return str(es), es.__json__(), history
94
  # add the pending property to rejected properties
95
+ rejected_prop = es_json["pending"].pop(0)
96
+ constraint_json = {
97
+ "prop": "wdt:" + rejected_prop["id"],
98
+ "value": ".", # with any value
99
+ "cardinality": "{0}", # has no matching statements
100
+ "comment": init_comment_verbaliser(rejected_prop["label"])
101
+ }
102
+ es_json["rejected"].append(constraint_json)
103
  es = EntitySchema(name="", class_id="")
104
  es.load_es_json(es_json)
105
  try:
esgen/model.py CHANGED
@@ -50,6 +50,7 @@ class EntitySchema:
50
  threshold=0.5,
51
  extra=False,
52
  prefixes=None,
 
53
  ):
54
  """
55
 
@@ -58,6 +59,7 @@ class EntitySchema:
58
  :param extra: may have other statements
59
  :param prefixes:
60
  :param threshold:
 
61
  """
62
  self.name = name
63
  self.class_id = class_id
@@ -72,6 +74,8 @@ class EntitySchema:
72
  self.prefixes = default_prefixes
73
  else:
74
  self.prefixes = prefixes
 
 
75
  self.properties = list()
76
  self.pending = list()
77
  self.optional = list()
@@ -82,12 +86,12 @@ class EntitySchema:
82
  self.parent_relation = None # relation connect es to its parent es, usually used for sub es
83
 
84
  def __str__(self):
85
- prefix_list = [prefix_verbaliser(abbr, url)for abbr, url in self.prefixes.items()]
86
  prefix_str = "".join(prefix_list)
87
 
88
  start_str = f"start = @<{self.name}>\n"
89
 
90
- constraint_list = self.properties + self.optional
91
  constraint_str = value_shape_verbaliser(self.name, constraint_list, self.extra)
92
 
93
  shex_script = '\n'.join([prefix_str, start_str, constraint_str])
@@ -96,7 +100,12 @@ class EntitySchema:
96
 
97
  def generate_initial_schema(self):
98
  schema_components, pending = list(), list()
99
- properties = filter_properties(class_id=self.class_id, num_instances=NUM_ITEMS, threshold=self.threshold)
 
 
 
 
 
100
  # print(properties)
101
  for prop_id, prop in properties.items():
102
  if prop["frequency_level"] == "frequent":
@@ -197,11 +206,16 @@ class NestedEntitySchema(EntitySchema):
197
 
198
  start_str = f"\nstart = @<{self.name}>\n"
199
 
200
- constraint_list = self.properties + self.optional
 
201
  constraint_str = value_shape_verbaliser(self.name, constraint_list, self.extra)
202
 
203
  value_shapes_str = "\n".join(
204
- value_shape_verbaliser(value_shape_name, value_shape.properties + value_shape.optional, extra=False)
 
 
 
 
205
  for value_shape_name, value_shape in self.value_shapes.items()
206
  )
207
 
@@ -347,4 +361,11 @@ class NestedEntitySchema(EntitySchema):
347
  self.properties.append(constraint)
348
 
349
  def sort_constraints(self):
350
- pass
 
 
 
 
 
 
 
 
50
  threshold=0.5,
51
  extra=False,
52
  prefixes=None,
53
+ property_types=None,
54
  ):
55
  """
56
 
 
59
  :param extra: may have other statements
60
  :param prefixes:
61
  :param threshold:
62
+ :param property_types:
63
  """
64
  self.name = name
65
  self.class_id = class_id
 
74
  self.prefixes = default_prefixes
75
  else:
76
  self.prefixes = prefixes
77
+ if property_types is None:
78
+ self.property_types = ["wikibase:WikibaseItem"]
79
  self.properties = list()
80
  self.pending = list()
81
  self.optional = list()
 
86
  self.parent_relation = None # relation connect es to its parent es, usually used for sub es
87
 
88
  def __str__(self):
89
+ prefix_list = [prefix_verbaliser(abbr, url) for abbr, url in self.prefixes.items()]
90
  prefix_str = "".join(prefix_list)
91
 
92
  start_str = f"start = @<{self.name}>\n"
93
 
94
+ constraint_list = self.properties + self.optional + self.rejected
95
  constraint_str = value_shape_verbaliser(self.name, constraint_list, self.extra)
96
 
97
  shex_script = '\n'.join([prefix_str, start_str, constraint_str])
 
100
 
101
  def generate_initial_schema(self):
102
  schema_components, pending = list(), list()
103
+ properties = filter_properties(
104
+ class_id=self.class_id,
105
+ num_instances=NUM_ITEMS,
106
+ threshold=self.threshold,
107
+ property_types=self.property_types
108
+ )
109
  # print(properties)
110
  for prop_id, prop in properties.items():
111
  if prop["frequency_level"] == "frequent":
 
206
 
207
  start_str = f"\nstart = @<{self.name}>\n"
208
 
209
+ self.sort_constraints() # sort constraints before verbalization
210
+ constraint_list = self.properties + self.optional + self.rejected
211
  constraint_str = value_shape_verbaliser(self.name, constraint_list, self.extra)
212
 
213
  value_shapes_str = "\n".join(
214
+ value_shape_verbaliser(
215
+ value_shape_name,
216
+ value_shape.properties + value_shape.optional + value_shape.rejected,
217
+ extra=False
218
+ )
219
  for value_shape_name, value_shape in self.value_shapes.items()
220
  )
221
 
 
361
  self.properties.append(constraint)
362
 
363
  def sort_constraints(self):
364
+ # rule 1: keep "instance of" property in the first place
365
+ sorted_properties = list()
366
+ for constraint in self.properties:
367
+ if constraint.prop == "wdt:P31":
368
+ sorted_properties.insert(0, constraint)
369
+ else:
370
+ sorted_properties.append(constraint)
371
+ self.properties = sorted_properties
esgen/queries.py CHANGED
@@ -55,10 +55,11 @@ def collect_instances(class_id: str, instances_num: int) -> list:
55
  return instances
56
 
57
 
58
- def collect_properties(instances: list) -> dict:
59
  """
60
 
61
  :param instances:
 
62
  :return:
63
  """
64
  # properties = []
@@ -69,8 +70,9 @@ def collect_properties(instances: list) -> dict:
69
  wd:%s ?property ?value .
70
  ?prop wikibase:directClaim ?property .
71
  ?prop wikibase:propertyType ?propType .
 
72
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
73
- }""" % instance
74
  results = get_sparql_results(query)
75
  for result in results["results"]["bindings"]:
76
  # property_count = int(result["count"]["value"])
@@ -94,18 +96,19 @@ def collect_properties(instances: list) -> dict:
94
  return properties
95
 
96
 
97
- def filter_properties(class_id: str, num_instances: int, threshold: float) -> dict:
98
  """
99
  filter properties based on the threshold
100
 
101
  :param class_id:
102
  :param num_instances:
103
  :param threshold:
 
104
  :return:
105
  """
106
  # instance_count = count_instances(class_id)
107
  instances = collect_instances(class_id, num_instances)
108
- properties = collect_properties(instances)
109
  for key, value in properties.items():
110
  # prop_count = count_properties(instances, prop["property"])
111
  # prop["count"] = prop_count
 
55
  return instances
56
 
57
 
58
+ def collect_properties(instances: list, property_types: list) -> dict:
59
  """
60
 
61
  :param instances:
62
+ :param property_types:
63
  :return:
64
  """
65
  # properties = []
 
70
  wd:%s ?property ?value .
71
  ?prop wikibase:directClaim ?property .
72
  ?prop wikibase:propertyType ?propType .
73
+ FILTER ( ?propType IN ( %s ) )
74
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
75
+ }""" % (instance, ",".join(property_types))
76
  results = get_sparql_results(query)
77
  for result in results["results"]["bindings"]:
78
  # property_count = int(result["count"]["value"])
 
96
  return properties
97
 
98
 
99
+ def filter_properties(class_id: str, num_instances: int, threshold: float, property_types: list) -> dict:
100
  """
101
  filter properties based on the threshold
102
 
103
  :param class_id:
104
  :param num_instances:
105
  :param threshold:
106
+ :param property_types:
107
  :return:
108
  """
109
  # instance_count = count_instances(class_id)
110
  instances = collect_instances(class_id, num_instances)
111
+ properties = collect_properties(instances, property_types)
112
  for key, value in properties.items():
113
  # prop_count = count_properties(instances, prop["property"])
114
  # prop["count"] = prop_count
esgen/verbalizer.py CHANGED
@@ -1,5 +1,8 @@
1
- def init_comment_verbaliser(prop_label, freq):
2
- comment = f"{prop_label}, {freq * 100:.2f}%"
 
 
 
3
  return comment
4
 
5
 
 
1
def init_comment_verbaliser(prop_label, freq=None):
    """
    Build the comment text attached to a property constraint.

    :param prop_label: label of the property, used verbatim in the comment
    :param freq: optional frequency as a fraction; it is multiplied by 100
        and rendered with two decimals (e.g. 0.5 -> "50.00%")
    :return: "<label>, <percentage>%" when a frequency is given, otherwise
        just the label
    """
    # Compare against None explicitly: a frequency of 0 is a real value and
    # must still be reported, whereas a plain truthiness test would drop it.
    if freq is not None:
        return f"{prop_label}, {freq * 100:.2f}%"
    return f"{prop_label}"
7
 
8