ValadisCERTH committed on
Commit
13d7c9d
·
1 Parent(s): 99568fb

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +112 -19
helper.py CHANGED
@@ -11,7 +11,6 @@ import re
11
  from transformers import BertTokenizer, BertModel
12
  import torch
13
 
14
-
15
  # initial loads
16
 
17
  # load the spacy model
@@ -19,8 +18,8 @@ spacy.cli.download("en_core_web_lg")
19
  nlp = spacy.load("en_core_web_lg")
20
 
21
  # load the pre-trained BERT tokenizer and model
22
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
23
- model = BertModel.from_pretrained('bert-base-uncased')
24
 
25
  # Load valid city names from geonamescache
26
  gc = geonamescache.GeonamesCache()
@@ -54,7 +53,7 @@ def is_country(reference):
54
 
55
  def is_city(reference):
56
  """
57
- Check if the given reference is a valid city name
58
  """
59
 
60
  # Check if the reference is a valid city name
@@ -77,7 +76,7 @@ def is_city(reference):
77
  return True
78
 
79
  return False
80
-
81
 
82
  def validate_locations(locations):
83
  """
@@ -87,20 +86,28 @@ def validate_locations(locations):
87
  validated_loc = []
88
 
89
  for location in locations:
 
 
90
  if is_city(location):
91
  validated_loc.append((location, 'city'))
 
 
92
  elif is_country(location):
93
  validated_loc.append((location, 'country'))
 
94
  else:
95
  # Check if the location is a multi-word name
96
  words = location.split()
97
  if len(words) > 1:
 
98
  # Try to find the country or city name among the words
99
  for i in range(len(words)):
100
  name = ' '.join(words[i:])
 
101
  if is_country(name):
102
  validated_loc.append((name, 'country'))
103
  break
 
104
  elif is_city(name):
105
  validated_loc.append((name, 'city'))
106
  break
@@ -120,10 +127,11 @@ def identify_loc_ner(sentence):
120
  # GPE and LOC are the labels for location entities in spaCy
121
  for ent in doc.ents:
122
  if ent.label_ in ['GPE', 'LOC']:
 
123
  if len(ent.text.split()) > 1:
124
  ner_locations.append(ent.text)
125
  else:
126
- for token in ent:
127
  if token.ent_type_ == 'GPE':
128
  ner_locations.append(ent.text)
129
  break
@@ -187,7 +195,7 @@ def identify_loc_regex(sentence):
187
 
188
  regex_locations = []
189
 
190
- # Country references can be preceded by 'in', 'from' or 'of'
191
  pattern = r"\b(in|from|of)\b\s([\w\s]+)"
192
  additional_refs = re.findall(pattern, sentence)
193
 
@@ -246,8 +254,6 @@ def identify_locations(sentence):
246
 
247
  locations = []
248
 
249
- # add all the identified country/cities results in a list
250
-
251
  try:
252
 
253
  # ner
@@ -272,24 +278,111 @@ def identify_locations(sentence):
272
  # flatten the embeddings list
273
  locations_flat_3 = list(flatten(locations))
274
 
275
- # acquire the unique country/city names (because it is possible that many different approaches will capture the same countries/cities)
276
- flat_loc_list = set(locations_flat_3)
277
-
 
 
 
 
278
  # validate that indeed each one of the countries/cities are indeed countries/cities
279
- validated_locations = validate_locations(flat_loc_list)
280
 
281
  # create a proper dictionary with country/city tags and the relevant entries as a result
282
  locations_dict = {}
283
-
284
  for location, loc_type in validated_locations:
285
  if loc_type not in locations_dict:
286
  locations_dict[loc_type] = []
287
  locations_dict[loc_type].append(location)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- return locations_dict
 
 
290
 
291
- except:
292
 
293
- # handle the exception if any errors occur while identifying a country/city
294
- print(f"An error occurred while checking if a city or country exists")
295
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from transformers import BertTokenizer, BertModel
12
  import torch
13
 
 
14
  # initial loads
15
 
16
  # load the spacy model
 
18
  nlp = spacy.load("en_core_web_lg")
19
 
20
  # load the pre-trained BERT tokenizer and model
21
+ tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
22
+ model = BertModel.from_pretrained('bert-base-cased')
23
 
24
  # Load valid city names from geonamescache
25
  gc = geonamescache.GeonamesCache()
 
53
 
54
  def is_city(reference):
55
  """
56
+ Check if a given reference is a valid city name
57
  """
58
 
59
  # Check if the reference is a valid city name
 
76
  return True
77
 
78
  return False
79
+
80
 
81
  def validate_locations(locations):
82
  """
 
86
  validated_loc = []
87
 
88
  for location in locations:
89
+
90
+ # validate whether it is a city
91
  if is_city(location):
92
  validated_loc.append((location, 'city'))
93
+
94
+ # validate whether it is a country
95
  elif is_country(location):
96
  validated_loc.append((location, 'country'))
97
+
98
  else:
99
  # Check if the location is a multi-word name
100
  words = location.split()
101
  if len(words) > 1:
102
+
103
  # Try to find the country or city name among the words
104
  for i in range(len(words)):
105
  name = ' '.join(words[i:])
106
+
107
  if is_country(name):
108
  validated_loc.append((name, 'country'))
109
  break
110
+
111
  elif is_city(name):
112
  validated_loc.append((name, 'city'))
113
  break
 
127
  # GPE and LOC are the labels for location entities in spaCy
128
  for ent in doc.ents:
129
  if ent.label_ in ['GPE', 'LOC']:
130
+
131
  if len(ent.text.split()) > 1:
132
  ner_locations.append(ent.text)
133
  else:
134
+ for token in ent:
135
  if token.ent_type_ == 'GPE':
136
  ner_locations.append(ent.text)
137
  break
 
195
 
196
  regex_locations = []
197
 
198
 + # Country and city references can be preceded by 'in', 'from' or 'of'
199
  pattern = r"\b(in|from|of)\b\s([\w\s]+)"
200
  additional_refs = re.findall(pattern, sentence)
201
 
 
254
 
255
  locations = []
256
 
 
 
257
  try:
258
 
259
  # ner
 
278
  # flatten the embeddings list
279
  locations_flat_3 = list(flatten(locations))
280
 
281
 + # remove duplicates while taking capitalization into account (e.g. 'Italy' and 'italy' refer to the same location)
282
+ # Lowercase the words and get their unique references using set()
283
+ loc_unique = set([loc.lower() for loc in locations_flat_3])
284
+
285
+ # Create a new list of locations with initial capitalization, removing duplicates
286
+ loc_capitalization = list(set([loc.capitalize() if loc.lower() in loc_unique else loc.lower() for loc in locations_flat_3]))
287
+
288
  # validate that indeed each one of the countries/cities are indeed countries/cities
289
+ validated_locations = validate_locations(loc_capitalization)
290
 
291
  # create a proper dictionary with country/city tags and the relevant entries as a result
292
  locations_dict = {}
 
293
  for location, loc_type in validated_locations:
294
  if loc_type not in locations_dict:
295
  locations_dict[loc_type] = []
296
  locations_dict[loc_type].append(location)
297
+
298
+ # conditions for multiple references
299
+ # it is mandatory that a country will exist
300
+ if locations_dict['country']:
301
+
302
+ # if a city exists
303
+ if 'city' in locations_dict:
304
+
305
+ # we accept one country and one city
306
+ if len(locations_dict['country']) == 1 and len(locations_dict['city']) == 1:
307
+
308
+ # capitalize because there may be cases that it will return 'italy'
309
+ locations_dict['country'][0] = locations_dict['country'][0].capitalize()
310
+ return locations_dict
311
+
312
+ # we can accept an absence of city but a country is always mandatory
313
+ elif len(locations_dict['country']) == 1 and len(locations_dict['city']) == 0:
314
+ locations_dict['country'][0] = locations_dict['country'][0].capitalize()
315
+ return locations_dict
316
+
317
+ # error if more than one country or city
318
+ else:
319
+ return (0, "LOCATION", "more_city_or_country")
320
+
321
+
322
+ # if a city does not exist
323
+ else:
324
+ # we only accept for one country
325
+ if len(locations_dict['country']) == 1:
326
+ locations_dict['country'][0] = locations_dict['country'][0].capitalize()
327
+ return locations_dict
328
+
329
+ # error if more than one country
330
+ else:
331
+ return (0, "LOCATION", "more_country")
332
+
333
+ # error if no country is referred
334
+ else:
335
+ return (0, "LOCATION", "no_country")
336
 
337
+ except:
338
+ # handle the exception if any errors occur while identifying a country/city
339
+ return (0, "LOCATION", "unknown_error")
340
 
 
341
 
342
def identify_locations2(sentence):
    """
    Identify all the possible Country and City references in the given
    sentence, using different approaches in a hybrid manner.

    Combines four extraction strategies (spaCy NER, geoparsing libraries,
    regex patterns and BERT embeddings), deduplicates the candidates
    case-insensitively, validates each candidate against the known
    country/city data and groups the survivors by type.

    Parameters
    ----------
    sentence : str
        Free-text sentence to scan for location references.

    Returns
    -------
    dict
        Mapping of 'country' / 'city' to lists of validated names.  A key
        is present only when at least one location of that type was found.
    """
    candidates = []

    # NER-based extraction
    candidates.append(identify_loc_ner(sentence))

    # geoparsing libraries (also yields the country/city reference lists
    # that the embeddings approach needs)
    geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)
    candidates.append(geoparse_list)

    # regex-based extraction
    candidates.append(identify_loc_regex(sentence))

    # embeddings-based extraction
    candidates.append(identify_loc_embeddings(sentence, countries, cities))

    # BUGFIX: the previous version re-flattened the original `locations`
    # list after appending the regex and embeddings results to intermediate
    # flat lists (`locations_flat_1` / `locations_flat_2`), silently
    # discarding both of those result sets.  Flattening once, after every
    # approach has contributed, keeps all candidates.
    flat_candidates = list(flatten(candidates))

    # Remove duplicates while taking capitalization into account (e.g.
    # 'Italy' and 'italy' are the same reference).  str.title() is used
    # instead of str.capitalize() so multi-word names keep per-word
    # capitalization ('new york' -> 'New York', not 'New york'); for
    # single-word names the two behave identically.  The previous
    # `loc.lower() in loc_unique` conditional was always True (the set was
    # built from those very lowercased values), so it has been removed.
    loc_normalized = list({loc.title() for loc in flat_candidates})

    # validate that each candidate is indeed a country or a city
    validated_locations = validate_locations(loc_normalized)

    # create a proper dictionary with country/city tags and the relevant
    # entries as a result
    locations_dict = {}
    for location, loc_type in validated_locations:
        locations_dict.setdefault(loc_type, []).append(location)

    return locations_dict