"""Label Wikipedia entities as human / non-human using the Wikidata API.

The script reads the keys of the ``.npz`` archive at ``ENTITY_PATH``, asks
Wikidata's ``wbgetentities`` endpoint for the claims of the enwiki page named
by each key, and records whether any ``P31`` ("instance of") claim points at
``Q5`` ("human").  Results are logged to ``human.log`` and pickled to
``human.pkl`` as a list of ``(name, is_human)`` tuples.
"""

import logging
import pickle
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Tuple

import numpy as np
import requests
from tqdm import tqdm

ENTITY_PATH = '/data/jcherian/wikipedia_entity_map.npz'
WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
OUTPUT_PATH = 'human.pkl'
# Seconds to wait for Wikidata before giving up on a request; without a
# timeout a stalled connection would block a worker thread forever.
REQUEST_TIMEOUT = 60

logger = logging.getLogger(__name__)
logging.basicConfig(filename='human.log', level=logging.INFO)


def get_id(response: Dict) -> Optional[str]:
    """Return the single entity id contained in a ``wbgetentities`` reply.

    Args:
        response: Decoded JSON body of the API reply.

    Returns:
        The only key of ``response["entities"]``, or ``None`` when the reply
        has no ``"entities"`` field (or it is ``None``).

    Raises:
        AssertionError: If ``"entities"`` does not hold exactly one entry.
    """
    entities = response.get("entities")
    if entities is None:
        return None
    wikidata_codes = list(entities)
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; the exception type is unchanged.
    if len(wikidata_codes) != 1:
        raise AssertionError(
            f"expected exactly one entity, got {len(wikidata_codes)}"
        )
    return wikidata_codes[0]


def is_human(response: Dict, id: str) -> bool:
    """Tell whether entity ``id`` has a ``P31`` claim whose value is ``Q5``.

    Args:
        response: Decoded JSON body of a ``wbgetentities`` reply.
        id: Key of the entity inside ``response["entities"]``.

    Returns:
        ``True`` if any ``P31`` claim targets ``Q5``, ``False`` otherwise
        (including when the entity has no ``P31`` claims at all).

    Raises:
        KeyError: If the entity or its ``"claims"`` field is missing.
    """
    instances = response['entities'][id]['claims'].get('P31', [])
    for inst in instances:
        # A snak may carry no "datavalue" at all; such a claim cannot be Q5,
        # so skip it instead of failing and hiding a later Q5 claim.
        datavalue = inst['mainsnak'].get('datavalue')
        if datavalue is None:
            continue
        if datavalue['value']['id'] == 'Q5':
            return True
    return False


def validate_entity(k: str) -> Tuple[str, bool]:
    """Query Wikidata for one entity key and classify it as human or not.

    Args:
        k: Entity key; only the part after the last ``/`` is used as the
            enwiki page title.

    Returns:
        ``(name, human)`` where ``name`` is the extracted title and ``human``
        is ``False`` whenever the reply cannot be decoded, contains no
        entities, or lacks the expected claim structure.

    Raises:
        requests.RequestException: If the HTTP request itself fails (after
            the adapter's retries) or times out.
        AssertionError: If the reply does not contain exactly one entity.
    """
    name = k.split('/')[-1]
    adapter = requests.adapters.HTTPAdapter(max_retries=10)
    with requests.session() as s:
        s.mount("https://", adapter)
        response = s.get(
            url=WIKIDATA_URL,
            params={
                "action": "wbgetentities",
                "sites": "enwiki",
                "titles": name,
                "normalize": "1",
                "languages": "en",
                "format": "json",
                "props": "claims",
            },
            timeout=REQUEST_TIMEOUT,
        )

    try:
        payload = response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page).  Previously the
        # raw Response object was passed on and crashed in get_id().
        logger.warning(
            "Non-JSON reply for %s (HTTP %s): %s",
            name, response.status_code, response.text,
        )
        return name, False

    wiki_id = get_id(payload)
    if wiki_id is None:
        return name, False

    try:
        human = is_human(payload, wiki_id)
    except (KeyError, TypeError, AttributeError):
        # The entity has no usable claim structure (presumably an unresolved
        # title); treat it as "not human", as the script always has.
        return name, False

    logger.info("%s, %s", name, human)
    return name, human


def main() -> None:
    """Classify every entity in ``ENTITY_PATH`` and pickle the results.

    Results are appended as they arrive, so whatever was collected before a
    failure or Ctrl-C is still written to ``OUTPUT_PATH``.  An interactive
    IPython shell is opened afterwards for inspection.
    """
    res: List[Tuple[str, bool]] = []
    with np.load(ENTITY_PATH) as wiki_entities:
        entity_names = list(wiki_entities.keys())
        try:
            with ThreadPoolExecutor(max_workers=5) as executor:
                outcomes = executor.map(validate_entity, entity_names)
                for outcome in tqdm(outcomes, total=len(entity_names)):
                    res.append(outcome)
        except (Exception, KeyboardInterrupt):
            # Keep going so the partial results gathered so far are saved.
            logger.exception(
                "Stopped after %d of %d entities",
                len(res), len(entity_names),
            )

        with open(OUTPUT_PATH, 'wb') as fp:
            pickle.dump(res, fp)

        # Kept inside the ``with`` so the archive is still open in the shell.
        import IPython
        IPython.embed()


if __name__ == "__main__":
    main()