sntcristian committed on
Commit
f7d44dc
·
verified ·
1 Parent(s): 5e08dfe

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ faiss.index filter=lfs diff=lfs merge=lfs -text
37
+ index.txt filter=lfs diff=lfs merge=lfs -text
38
+ knowledge_base.sqlite filter=lfs diff=lfs merge=lfs -text
create_bela_db.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import sqlite3
import numpy as np
import re
from tqdm import tqdm
from collections import defaultdict
import concurrent.futures
import multiprocessing as mp

# File paths
index_path = 'index.txt'
db_path = 'knowledge_base_final.sqlite'
type2class_path = "type2classes.json"
rdf1_path = "types_and_dates.rdf"
rdf2_path = "labels.rdf"
rdf3_path = "descriptions.rdf"


# Load the coarse-type -> Wikidata-classes mapping once at import time.
with open(type2class_path, "r", encoding="utf-8") as f2:
    type2class = json.load(f2)

# Invert it: each Wikidata class QID maps back to its coarse entity type.
class2type = {cls: tag for tag, classes in type2class.items() for cls in classes}
def preprocess_types_and_dates(rdf_path, required_qids):
    """
    Scan an RDF dump and collect, for the QIDs in *required_qids* only,
    their P31 (instance-of) classes mapped through ``class2type`` plus any
    xsd:dateTime literal values.

    Returns two dicts: QID -> list of coarse type tags, and
    QID -> list of raw date strings.
    """
    qid_to_types = defaultdict(list)
    qid_to_dates = defaultdict(list)

    # Compile once; these run on every surviving line.
    type_pattern = re.compile(r'wd:(Q\d+)\s+wdt:P31\s+wd:(Q\d+) \.')
    date_pattern = re.compile(r'wd:(Q\d+)\s+wdt:P\d+\s+"(.*?)"\^\^xsd:dateTime \.')
    qid_prefix = re.compile(r'wd:(Q\d+) ')

    print("Preprocessing RDF types and dates...")
    with open(rdf_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Processing RDF lines"):
            # Gate: only lines about a required QID survive.  Lines not
            # starting with 'wd:Q' can never match the anchored patterns
            # below, so skipping them is equivalent to the full scan.
            if not line.startswith('wd:Q'):
                continue
            head = qid_prefix.match(line)
            if head is None or head.group(1) not in required_qids:
                continue

            # Instance-of triple -> coarse type tag.
            type_hit = type_pattern.match(line)
            if type_hit:
                entity_qid, class_qid = type_hit.groups()
                if class_qid in class2type:
                    qid_to_types[entity_qid].append(class2type[class_qid])

            # Any dateTime-valued property -> candidate date.
            date_hit = date_pattern.match(line)
            if date_hit:
                entity_qid, date_str = date_hit.groups()
                qid_to_dates[entity_qid].append(date_str)

    return qid_to_types, qid_to_dates
def preprocess_labels(rdf_path, required_qids):
    """
    Extract schema:name triples for the QIDs in *required_qids*.

    Returns a dict mapping QID -> {language code -> label}.
    """
    qid_to_labels = defaultdict(dict)

    # Compiled once; runs on every line of the dump.
    labels_pattern = re.compile(r'wd:(Q\d+)\s+schema:name\s+"(.*?)"@([a-z]+)\s*\.')

    print("Preprocessing RDF labels...")
    with open(rdf_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Processing RDF lines"):
            hit = labels_pattern.match(line)
            if hit is None:
                continue
            qid, label, lang = hit.groups()
            if qid in required_qids:
                qid_to_labels[qid][lang] = label

    return qid_to_labels
def preprocess_descriptions(rdf_path, required_qids):
    """
    Extract schema:description triples for the QIDs in *required_qids*.

    Returns a dict mapping QID -> {language code -> description}.
    """
    qid_to_descriptions = defaultdict(dict)

    # Compiled once; runs on every line of the dump.
    descriptions_pattern = re.compile(r'wd:(Q\d+)\s+schema:description\s+"(.*?)"@([a-z]+)\s*\.')

    print("Preprocessing RDF descriptions...")
    with open(rdf_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Processing RDF lines"):
            hit = descriptions_pattern.match(line)
            if hit is None:
                continue
            qid, description, lang = hit.groups()
            if qid in required_qids:
                qid_to_descriptions[qid][lang] = description

    return qid_to_descriptions
def determine_entity_type(type_list):
    """Return the most frequent tag among PER/LOC/WORK/ORG, or None.

    Tags outside that set are ignored.  Ties break in the fixed order
    PER, LOC, WORK, ORG.  Returns None for an empty list or when no
    recognized tag is present.
    """
    if not type_list:
        return None

    tags = ("PER", "LOC", "WORK", "ORG")
    counts = {tag: type_list.count(tag) for tag in tags}

    best = max(counts.values())
    if best == 0:
        return None

    # First tag (in declaration order) reaching the maximum wins.
    return next(tag for tag in tags if counts[tag] == best)
def parse_date_string(date_str):
    """Convert an xsd:dateTime string into ``np.datetime64``.

    NOTE(review): assumes the date portion carries a leading sign
    character ('+'/'-') as in Wikidata exports -- confirm against the
    dump format.  Returns None when the date portion does not split into
    three components and no parsing error occurs.
    """
    date_part = date_str.split("T")[0]
    sign, actual_date = date_part[:1], date_part[1:]
    try:
        pieces = actual_date.split("-")
        if len(pieces) == 3:
            year, month, day = pieces
            # "00" month/day mark unknown precision; clamp to January 1st.
            month = "01" if month == "00" else month
            day = "01" if day == "00" else day

            prefix = "-" if sign == '-' else ""
            return np.datetime64(f"{prefix}{year}-{month}-{day}")

    except Exception:
        # Anything unparseable degrades to January 1st of the same year.
        year = actual_date.split("-")[0]
        prefix = "-" if sign == '-' else ""
        return np.datetime64(f"{prefix}{year}-01-01")
def find_minimum_date(date_list):
    """Return the earliest parseable date in *date_list*, or None.

    Unparseable entries (those for which ``parse_date_string`` yields
    None) are dropped before taking the minimum.
    """
    if not date_list:
        return None

    parsed = [parse_date_string(s) for s in date_list]
    candidates = [d for d in parsed if d is not None]
    return min(candidates) if candidates else None
def process_entity_batch(batch_data):
    """Process a batch of entities; safe to run in a worker process.

    *batch_data* is a 5-tuple ``(entities, qid_to_types, qid_to_dates,
    qid_to_labels, qid_to_descriptions)`` where *entities* is a list of
    ``(row_id, QID)`` pairs and the remaining items map QID -> data.

    Returns a dict with an ``"entities"`` row list (one row per input
    entity) plus one ``"<lang>wiki"`` row list per supported language; a
    language row is emitted only when the entity has a label or a
    description in that language.

    Improvement over the previous version: the seven per-language
    copy-paste stanzas are collapsed into a single loop over the
    language codes (identical output).
    """
    entities, qid_to_types, qid_to_dates, qid_to_labels, qid_to_descriptions = batch_data

    languages = ("en", "de", "fr", "it", "nl", "sv", "fi")
    results = {"entities": []}
    for lang in languages:
        results[f"{lang}wiki"] = []

    for idx, wikidata_qid in entities:
        # Coarse type and earliest date from the preprocessed RDF data.
        _type = determine_entity_type(qid_to_types.get(wikidata_qid, []))
        min_date = find_minimum_date(qid_to_dates.get(wikidata_qid, []))

        # np.datetime64 is not SQLite-serializable; store its ISO string.
        if isinstance(min_date, np.datetime64):
            min_date = str(min_date)

        results["entities"].append((idx, wikidata_qid, _type, min_date))

        labels = qid_to_labels.get(wikidata_qid, {})
        descrs = qid_to_descriptions.get(wikidata_qid, {})
        for lang in languages:
            label = labels.get(lang, "")
            descr = descrs.get(lang, "")
            # Skip entities with neither a label nor a description.
            if label or descr:
                results[f"{lang}wiki"].append((idx, label, descr))

    return results
def main():
    """Build the SQLite knowledge base from the FAISS index order.

    Reads the QID list (one per line, in the same order as the FAISS
    index, so the line number doubles as the row id), enriches each
    entity with type/date/label/description data extracted from the RDF
    dumps, and writes everything into ``db_path``.

    Fixes over the previous version:
    - the final report printed ``len(all_results)`` (always 8, the number
      of dict keys) instead of the entity count;
    - ``future.result()`` was called eight times per future instead of
      once;
    - stale comment claimed the 'Q' prefix was removed (it is not);
    - seven duplicated CREATE TABLE statements and seven duplicated
      insert loops collapsed into loops;
    - ``chunk_size`` guarded against zero.
    """
    wiki_tables = ("enwiki", "dewiki", "frwiki", "itwiki", "nlwiki", "svwiki", "fiwiki")

    # Step 1: load the entity list.
    print("Loading entities...")
    entities = []
    required_qids = set()
    with open(index_path, 'r', encoding='utf-8') as txt_file:
        for idx, line in enumerate(txt_file):
            qid = line.strip()
            if qid.startswith("Q"):
                entities.append((idx, qid))
                required_qids.add(qid)

    print(f"Found {len(entities)} entities, {len(required_qids)} unique QIDs")

    # Step 2: preprocess the RDF dumps, keeping only the required QIDs.
    qid_to_types, qid_to_dates = preprocess_types_and_dates(rdf1_path, required_qids)
    qid_to_labels = preprocess_labels(rdf2_path, required_qids)
    qid_to_descriptions = preprocess_descriptions(rdf3_path, required_qids)

    # Step 3: set up the database schema.
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS entities (
        id INTEGER PRIMARY KEY,
        wikidata_qid TEXT,
        type_ TEXT,
        min_date TEXT
    )''')

    # One identical label/description table per language.
    for table in wiki_tables:
        cursor.execute(f'''
        CREATE TABLE IF NOT EXISTS {table} (
            id INTEGER PRIMARY KEY,
            label TEXT,
            descr TEXT,
            FOREIGN KEY (id) REFERENCES entities(id)
        )''')

    # Step 4: process entities (multiprocessing only pays off on large inputs).
    shared = (qid_to_types, qid_to_dates, qid_to_labels, qid_to_descriptions)
    use_multiprocessing = len(entities) > 1000

    if use_multiprocessing:
        print("Processing entities with multiprocessing...")
        num_processes = min(mp.cpu_count(), 4)  # cap worker count
        chunk_size = max(1, len(entities) // num_processes)
        chunks = [
            (entities[i:i + chunk_size],) + shared
            for i in range(0, len(entities), chunk_size)
        ]

        all_results = {key: [] for key in ("entities",) + wiki_tables}
        with concurrent.futures.ProcessPoolExecutor(max_workers=num_processes) as executor:
            futures = [executor.submit(process_entity_batch, chunk) for chunk in chunks]
            for future in tqdm(concurrent.futures.as_completed(futures),
                               total=len(futures), desc="Processing chunks"):
                result = future.result()  # fetch once, not once per key
                for key in all_results:
                    all_results[key] += result[key]
    else:
        print("Processing entities sequentially...")
        all_results = process_entity_batch((entities,) + shared)

    # Step 5: bulk-insert in batches, committing after each batch so a
    # crash mid-run loses at most one batch.
    print("Inserting into database...")
    batch_size = 1000

    def _insert_batches(table, sql, rows):
        # Insert *rows* into *table* in batches of batch_size.
        for i in tqdm(range(0, len(rows), batch_size), desc=f"Inserting {table} batches"):
            cursor.executemany(sql, rows[i:i + batch_size])
            conn.commit()

    print("Inserting into entities table...")
    _insert_batches("entities", '''
        INSERT INTO entities (id, wikidata_qid, type_, min_date)
        VALUES (?, ?, ?, ?)
    ''', all_results["entities"])

    for table in wiki_tables:
        _insert_batches(table, f'''
            INSERT INTO {table} (id, label, descr)
            VALUES (?, ?, ?)
        ''', all_results[table])

    # Final commit and close.
    conn.commit()
    conn.close()

    # Report the entity row count (the old code printed the dict key count).
    print(f"Database '{db_path}' created successfully with {len(all_results['entities'])} entities.")


if __name__ == "__main__":
    main()
faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17e71167c992ec4ecc6e7747b6d78a3efa3d32d2a89856b2fc0b465cbc3e4575
3
+ size 19765027245
index.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65403dd2c236629cba026156acc8fb31c1907525824d512a90679d16ea1fe124
3
+ size 158216234
knowledge_base.sqlite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af09701a1f4c7b78c92a87b2c55747cf4c097dbde4bbd15f351a2a6a8f043068
3
+ size 3159400448
model_wiki.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b13ec79092a6a60db30e3e366aa1e0dc55ecc85a02ab4427ef4d163847a7dd26
3
+ size 2243434498
type2classes.json ADDED
The diff for this file is too large to render. See raw diff