Spaces:

ValadisCERTH
/

CountriesCitiesModuleSerco

Build error

File size: 16,165 Bytes

import spacy

from geopy.geocoders import Nominatim
import geonamescache
import pycountry

from geotext import GeoText

import re

from transformers import BertTokenizer, BertModel
import torch


# initial loads

# load the spacy model; download it only when it is not installed yet, instead of
# re-running the download on every import (spacy.load raises OSError when the
# model package cannot be found)
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

# load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')

# Load valid city names from geonamescache (a set, so membership tests are O(1))
gc = geonamescache.GeonamesCache()
city_names = {city['name'] for city in gc.get_cities().values()}


def flatten(lst):
    """
    Lazily walk a (possibly nested) list and yield its leaf items in order.

    Only values that are instances of ``list`` are descended into; anything else
    (tuples and strings included) is yielded unchanged.
    """

    for element in lst:
        if not isinstance(element, list):
            yield element
            continue

        # nested list: hand over to a recursive generator
        yield from flatten(element)


def is_country(reference):
    """
    Tell whether pycountry's fuzzy country search finds a match for the reference.

    Returns True when ``search_fuzzy`` yields a first result and False when the
    lookup raises LookupError.
    """

    try:
        # only the existence of a first hit matters; the matched record is not used
        pycountry.countries.search_fuzzy(reference)[0]
    except LookupError:
        return False

    return True


def is_city(reference):
    """
    Check if a given reference is a valid city name.

    The reference is first looked up in the module-level ``city_names`` collection.
    When it is not found there, Nominatim (OpenStreetMap) is queried and the 'type'
    of the returned place decides the outcome.

    Args:
        reference: the candidate place name.

    Returns:
        True if the reference is considered a city, False otherwise (including the
        case where Nominatim has no result for it).
    """

    # Cheap local check first: no network call needed for known city names
    if reference in city_names:
        return True

    # Load the Nominatim (open street maps) api
    geolocator = Nominatim(user_agent="certh_serco_validate_city_app")
    location = geolocator.geocode(reference, language="en")

    # geocode() returns None when nothing matches; without this guard the attribute
    # access below raised AttributeError and aborted the validation of every candidate
    if location is None:
        return False

    place_type = location.raw.get('type')

    # If a reference is identified as a 'city', 'town', or 'village', then it is indeed a city
    if place_type in ('city', 'town', 'village'):
        return True

    # If a reference is identified as 'administrative' (e.g. administrative area),
    # a single-part display name is treated as a country and a multi-part one as a city.
    # This separates the cases where small cities were identified as administrative areas.
    if place_type == 'administrative':
        return len(location.raw.get('display_name', '').split(",")) > 1

    return False


def validate_locations(locations):
    """
    Tag every candidate reference as a 'city' or a 'country', dropping the rest.

    Returns a list of (name, tag) tuples. A candidate that is neither a city nor a
    country as a whole is retried on its word suffixes ("a b c", "b c", "c"); the
    first suffix that validates is kept in its place.
    """

    validated_loc = []

    for candidate in locations:

        # the whole reference is checked as a city first, then as a country
        if is_city(candidate):
            validated_loc.append((candidate, 'city'))
            continue

        if is_country(candidate):
            validated_loc.append((candidate, 'country'))
            continue

        # only multi-word references get the suffix fallback
        words = candidate.split()
        if len(words) <= 1:
            continue

        # for suffixes the order is reversed: country check first, then city
        for start in range(len(words)):
            suffix = ' '.join(words[start:])

            if is_country(suffix):
                validated_loc.append((suffix, 'country'))
                break

            if is_city(suffix):
                validated_loc.append((suffix, 'city'))
                break

    return validated_loc



def identify_loc_ner(sentence):
    """
    Collect the texts of the location-like entities that spaCy finds in the sentence.

    Entities labelled 'GPE' or 'LOC' are considered. Multi-word entities are always
    kept; a single-word entity is kept only if one of its tokens has entity type 'GPE'.
    """

    location_labels = ['GPE', 'LOC']
    ner_locations = []

    for entity in nlp(sentence).ents:
        if entity.label_ not in location_labels:
            continue

        is_multiword = len(entity.text.split()) > 1
        if is_multiword or any(tok.ent_type_ == 'GPE' for tok in entity):
            ner_locations.append(entity.text)

    return ner_locations



def identify_loc_geoparselibs(sentence):
    """
    Identify cities and countries with 3 different geoparsing libraries.

    Args:
        sentence: the text to scan.

    Returns:
        A tuple ``(geoparse_locations, countries, cities)`` where ``geoparse_locations``
        holds every match of the three libraries (duplicates possible), and
        ``countries`` / ``cities`` are the lists found by GeoText alone.
    """

    geoparse_locations = []

    # Geoparsing library 1

    # Known city and country names from geonamescache, kept in one set: every word
    # sequence of the sentence is tested below, so membership has to be cheap
    geonames = geonamescache.GeonamesCache()
    known_names = {country['name'] for country in geonames.get_countries().values()}
    known_names.update(city['name'] for city in geonames.get_cities().values())

    # if any word sequence in our sentence is one of those countries/cities identify it
    words = sentence.split()
    for i in range(len(words)):
        for j in range(i + 1, len(words) + 1):
            word_seq = ' '.join(words[i:j])
            if word_seq in known_names:
                geoparse_locations.append(word_seq)

    # Geoparsing library 2

    # similarly with the pycountry library (plain substring test on the raw sentence)
    geoparse_locations.extend(
        country.name for country in pycountry.countries if country.name in sentence
    )

    # Geoparsing library 3

    # similarly with the geotext library
    places = GeoText(sentence)
    cities = list(places.cities)
    countries = list(places.countries)

    geoparse_locations += cities
    geoparse_locations += countries

    return (geoparse_locations, countries, cities)



def identify_loc_regex(sentence):
    """
    Identify cities and countries with regular expression matching.

    Args:
        sentence: the text to scan.

    Returns:
        A list with the text that follows each standalone 'in', 'from' or 'of'
        (a run of word characters and whitespace), stripped of surrounding
        whitespace. Captures that contain only whitespace are dropped.
    """

    # Country and cities references can be preceded by 'in', 'from' or 'of'
    pattern = r"\b(in|from|of)\b\s([\w\s]+)"

    regex_locations = []
    for _preposition, captured in re.findall(pattern, sentence):
        # the character class also swallows trailing whitespace (e.g. "Rome " before a
        # comma), which would make exact name lookups on the candidate fail
        candidate = captured.strip()
        if candidate:
            regex_locations.append(candidate)

    return regex_locations



def identify_loc_embeddings(sentence, countries, cities):
    """
    Identify cities and countries with the BERT pre-trained embeddings matching.

    Args:
        sentence: the text to scan.
        countries: country names to look for (supplied by the caller).
        cities: city names to look for (supplied by the caller).

    Returns:
        A list of tokens of the sentence that are known country/city names, plus known
        names found among the five most similar tokens of every other token
        (cosine similarity above 0.5). Duplicates are possible.
    """

    embd_locations = []

    # Known names, plus an underscore-joined variant of every multi-word name
    names = list(countries) + list(cities)
    known_names = set(names)
    known_names.update(name.replace(' ', '_') for name in names if ' ' in name)

    # Preprocess the input sentence
    tokens = tokenizer.tokenize(sentence)

    # nothing to embed: an empty id tensor cannot be fed to the model
    if not tokens:
        return embd_locations

    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    # Get the BERT embeddings for the input sentence (first model output, first batch item)
    with torch.no_grad():
        embeddings = model(input_ids)[0][0]

    # Find the country and city names in the input sentence
    for i, token in enumerate(tokens):
        if token in known_names:
            embd_locations.append(token)
            continue

        similarity_scores = torch.nn.functional.cosine_similarity(embeddings[i].unsqueeze(0), embeddings)

        # rank 0 is skipped (presumably the token itself); the next five are inspected
        for j in similarity_scores.argsort(descending=True)[1:6]:
            word = tokens[j]
            # read the score at the neighbour's own position; looking it up through
            # tokens.index(word) picked the first occurrence of a repeated token
            if word in known_names and similarity_scores[j] > 0.5:
                embd_locations.append(word)

    # Convert back multi-word country and city names to original form
    return [loc.replace('_', ' ') for loc in embd_locations]



def _drop_contained_names(names):
    """
    Sort ``names`` longest first (in place) and return the entries that are neither
    case-insensitive duplicates nor contained in another entry.
    """

    names.sort(key=len, reverse=True)

    # pass 1: skip a name when an already kept name occurs inside it (case-insensitive).
    # Kept names are at least as long, so this removes case-insensitive duplicates
    # and keeps the first spelling encountered.
    deduplicated = []
    for name in names:
        lowered = name.lower()
        if not any(kept.lower() in lowered for kept in deduplicated):
            deduplicated.append(name)

    # pass 2: skip a name that occurs inside any *other* surviving name
    lowered_names = [name.lower() for name in deduplicated]
    return [
        name
        for i, name in enumerate(deduplicated)
        if not any(lowered_names[i] in other for j, other in enumerate(lowered_names) if j != i)
    ]


def multiple_country_city_identifications_solve(country_city_dict):
    """
    Solve the appearance of multiple identifications of the same country or city.

    For the 'country' and the 'city' entry of the input dictionary, every element that
    exists as a substring (ignoring case) inside another element of the same entry is
    deleted, and so are case-insensitive duplicates. In that sense, a dictionary of the sort
    {'city': ['Port moresby', 'Port'], 'country': ['Guinea', 'Papua new guinea']} will be converted into
    {'city': ['Port moresby'], 'country': ['Papua new guinea']}.

    The reason for that function is that such inconsistencies were identified during
    country/city identification, probably relevant to the geoparsing libraries in use.

    Args:
        country_city_dict: dictionary with optional 'country' and 'city' lists of names.
            It is modified in place; processed lists end up sorted longest first.

    Returns:
        The same dictionary when it is non-empty, None when it is empty, or the tuple
        (0, "LOCATION", "unknown_error") when the input cannot be processed.
    """

    try:
        # only entries holding more than one name can contain such overlaps;
        # both entries are inspected before anything is modified
        keys_to_clean = [
            key for key in ('country', 'city')
            if key in country_city_dict and len(country_city_dict[key]) > 1
        ]

        for key in keys_to_clean:
            country_city_dict[key] = _drop_contained_names(country_city_dict[key])

        # return the final dictionary
        if country_city_dict:
            return country_city_dict
        return None

    except Exception:
        return (0, "LOCATION", "unknown_error")



def identify_locations(sentence):
    """
    Identify the Country and City references in the given sentence, using different approaches in a hybrid manner.

    Args:
        sentence: the text to analyse.

    Returns:
        On success, a dictionary with a 'country' list holding exactly one name and,
        optionally, a 'city' list holding exactly one name.
        Otherwise an error tuple (0, "LOCATION", reason) where reason is one of
        "no_country", "more_country", "more_city_or_country" or "unknown_error".
    """

    try:

        # there were cases where a city followed by a comma was not understood by the system,
        # so commas are replaced by a marker token surrounded by spaces
        sentence = sentence.replace(",", " x$x ")

        # ner
        ner_list = identify_loc_ner(sentence)

        # geoparse libs
        geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)

        # the candidates that get validated: ner + geoparsing results
        candidates = list(flatten([ner_list, geoparse_list]))

        # NOTE(review): the regex and embeddings extractors have always been invoked here, but
        # their output was appended to intermediate lists that were never read, so it never
        # reached validation. That behaviour is kept (including any exception they raise),
        # because merging their candidates would change the results; decide explicitly
        # whether these results should be merged or the calls removed.
        identify_loc_regex(sentence)
        identify_loc_embeddings(sentence, countries, cities)

        # remove duplicates regardless of capitalization (e.g. 'italy' and 'Italy' are the same
        # reference); first-seen order is kept so that the outcome is deterministic
        loc_capitalization = list(dict.fromkeys(loc.capitalize() for loc in candidates))

        # validate that indeed each one of the countries/cities are indeed countries/cities
        validated_locations = validate_locations(loc_capitalization)

        # create a proper dictionary with country/city tags and the relevant entries as a result
        loc_dict = {}
        for location, loc_type in validated_locations:
            loc_dict.setdefault(loc_type, []).append(location)

        # cope with cases of iterative country or city reference due to geoparse lib issues
        locations_dict = multiple_country_city_identifications_solve(loc_dict)

        # the helper reports its own failure as an error tuple: pass it on unchanged
        if isinstance(locations_dict, tuple):
            return locations_dict

        # it is mandatory that a country will exist; the helper returns None when
        # nothing was validated at all, which is also a "no country" outcome
        if not locations_dict or 'country' not in locations_dict:
            return (0, "LOCATION", "no_country")

        found_countries = locations_dict['country']
        has_city = 'city' in locations_dict

        # we accept exactly one country and at most one city
        if len(found_countries) != 1 or (has_city and len(locations_dict['city']) > 1):
            return (0, "LOCATION", "more_city_or_country" if has_city else "more_country")

        # capitalize because there may be cases that it will return 'italy'
        found_countries[0] = found_countries[0].capitalize()
        return locations_dict

    except Exception:
        # handle the exception if any errors occur while identifying a country/city
        return (0, "LOCATION", "unknown_error")