ValadisCERTH committed on
Commit
6be438e
·
1 Parent(s): 3347c60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -31
app.py CHANGED
@@ -9,13 +9,13 @@ nlp = spacy.load('en_core_web_lg')
9
 
10
 
11
  # Define earthquake-related keywords
12
- earthquake_single_keywords = ['earthquake', 'seismic', 'tremor', 'quake', 'aftershock', 'seismology', 'tectonic', 'plate', 'seismometer', 'shake', 'temblor', 'trembler', 'seism', 'shock', 'vibration', 'groundswell']
13
 
14
  # Compute embeddings for single-word keywords
15
  earthquake_single_embeddings = [nlp(keyword).vector for keyword in earthquake_single_keywords]
16
 
17
  # Define multi-word earthquake-related keywords
18
- earthquake_multi_keywords = ['seismic activity', 'earthquake risk', 'earthquake zone', 'seismic waves', 'earthquake damage', 'seismic shift', 'tectonic plates', 'fault line', 'seismic retrofitting', 'seismic hazard', 'aftershock sequence', 'earthquake drill', 'seismic reflection', 'plate tectonics', 'seismic reflection imaging', 'seismic tomography', 'seismic profiling', 'seismic energy release', 'seismicity pattern', 'earthquake swarm', 'seismic gap', 'seismic inversion', 'seismic reflection', 'seismic scattering', 'seismic attenuation', 'seismic imaging', 'seismic map', 'seismic data', 'earthquake monitoring', 'seismic data analysis', 'earth shaking']
19
 
20
  # Compute embeddings for multi-word keywords
21
  earthquake_multi_embeddings = []
@@ -25,6 +25,17 @@ for keyword in earthquake_multi_keywords:
25
  earthquake_multi_embeddings.append(combined_emb)
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
28
  # Define a function to compute the semantic similarity between a word and a set of embeddings
29
  def compute_similarity_earthquake(word, embeddings, excluded_keywords):
30
  """
@@ -41,8 +52,12 @@ def compute_similarity_earthquake(word, embeddings, excluded_keywords):
41
  # Compute the cosine similarity between the word embedding and the keyword embeddings
42
  similarity_scores = [np.dot(word_emb, emb) / (np.linalg.norm(word_emb) * np.linalg.norm(emb)) for emb in embeddings]
43
 
44
- # Return True if the maximum similarity score is above a certain threshold, else False
45
- return max(similarity_scores) > 0.65
 
 
 
 
46
 
47
 
48
  def identify_earthquake_event(input_sentence):
@@ -50,44 +65,94 @@ def identify_earthquake_event(input_sentence):
50
  Compute the semantic similarity for earthquaqe events
51
  """
52
 
53
- # Define excluded keywords to ignore (because cases like I want bars with magnituted 6 - were given as correct)
54
- excluded_keywords = ['magnitude', 'richter', 'moment', 'scale', 'intensity', 'amplitude', 'energy', 'force', 'power', 'seismicity']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- # Check for single-word earthquake-related keywords
57
- is_earthquake_related = any([compute_similarity_earthquake(word.text.lower(), earthquake_single_embeddings, excluded_keywords) for word in nlp(input_sentence)])
 
58
 
59
- # If no single-word keywords are found, check for multi-word keywords
60
- if not is_earthquake_related:
 
61
 
62
- # Check 2-grams
63
- for i in range(len(nlp(input_sentence))-1):
64
- bigram = nlp(input_sentence)[i:i+2].text.lower()
65
- if compute_similarity_earthquake(bigram, earthquake_multi_embeddings, excluded_keywords):
66
- is_earthquake_related = True
67
- break
68
-
69
- # Check 3-grams
70
- if not is_earthquake_related:
71
- for i in range(len(nlp(input_sentence))-2):
72
- trigram = nlp(input_sentence)[i:i+3].text.lower()
73
- if compute_similarity_earthquake(trigram, earthquake_multi_embeddings, excluded_keywords):
74
- is_earthquake_related = True
75
- break
76
 
77
- return {'earthquaqe_event': is_earthquake_related}
78
 
79
 
80
  from transformers import pipeline
81
  import gradio as gr
82
 
83
- title = "Natural Language module Demo for Earthquaqe events identification"
84
- description = "This is a simple demo just for demonstration purposes, so that Serco team might have the chance to validate the results of the Natural Language module concerning the earthquaqe event identification, while in progress"
85
 
86
  examples = [
87
- ["I want all earthquakes larger than 5.0 that occurred in Rome during 3/5/20"],
88
- ["I want all seism larger than 5.0 that occurred in Rome between January 2020 and February of the same year"],
89
- ["earth shakes located in Ishkoshim"],
90
- ["give me all the bars with magnitude above than 6 in the region of Athens for the month of January, 1990"]
91
  ]
92
 
93
 
 
9
 
10
 
11
# Single-word earthquake vocabulary used both for exact matching and as the
# reference set for embedding similarity. Singular forms come first, followed
# by their plural counterparts in the same order.
earthquake_single_keywords = [
    # singular forms
    'earthquake', 'seismic', 'tremor', 'quake', 'aftershock', 'seismology',
    'tectonic', 'plate', 'seismometer', 'temblor', 'trembler', 'seism',
    'shock', 'vibration', 'shake', 'groundswell',
    # plural forms
    'earthquakes', 'seismics', 'tremors', 'quakes', 'aftershocks', 'seismologies',
    'tectonics', 'plates', 'seismometers', 'temblors', 'tremblers', 'seisms',
    'shocks', 'vibrations', 'shakes', 'groundswells',
]

# One vector per single-word keyword, in the same order as the list above.
earthquake_single_embeddings = [nlp(keyword).vector for keyword in earthquake_single_keywords]

# Two-word earthquake vocabulary. Each phrase appears exactly once: the
# repeated 'seismic reflection' / 'seismic reflections' entries were dropped
# because a duplicate only adds an identical reference embedding and cannot
# change any membership test or maximum-similarity result.
earthquake_multi_keywords = [
    # singular forms
    'seismic activity', 'earthquake risk', 'earthquake zone', 'seismic wave',
    'earthquake damage', 'seismic shift', 'tectonic plate', 'fault line',
    'seismic retrofitting', 'seismic hazard', 'aftershock sequence', 'earthquake drill',
    'seismic reflection', 'plate tectonic', 'seismic tomography', 'seismic profiling',
    'seismicity pattern', 'earthquake swarm', 'seismic gap', 'seismic inversion',
    'seismic scattering', 'seismic attenuation', 'seismic imaging', 'seismic map',
    'seismic data', 'earthquake monitoring', 'earth shaking',
    # plural forms
    'seismic activities', 'earthquake risks', 'earthquake zones', 'seismic waves',
    'earthquake damages', 'seismic shifts', 'tectonic plates', 'fault lines',
    'seismic retrofittings', 'seismic hazards', 'aftershock sequences', 'earthquake drills',
    'seismic reflections', 'plate tectonics', 'seismic tomographies', 'seismic profilings',
    'seismicity patterns', 'earthquake swarms', 'seismic gaps', 'seismic inversions',
    'seismic scatterings', 'seismic attenuations', 'seismic imagings', 'seismic maps',
    'earth shakings',
]
19
 
20
  # Compute embeddings for multi-word keywords
21
  earthquake_multi_embeddings = []
 
25
  earthquake_multi_embeddings.append(combined_emb)
26
 
27
 
28
def straight_pattern_matching(ngram):
    """
    Look up an n-gram verbatim in the pre-defined earthquake keyword lists.

    The comparison is an exact string match against earthquake_single_keywords
    and earthquake_multi_keywords; no normalisation is applied here.

    Returns the n-gram itself when it is listed, otherwise False.
    """

    is_listed = ngram in earthquake_single_keywords or ngram in earthquake_multi_keywords
    return ngram if is_listed else False
37
+
38
+
39
  # Define a function to compute the semantic similarity between a word and a set of embeddings
40
  def compute_similarity_earthquake(word, embeddings, excluded_keywords):
41
  """
 
52
  # Compute the cosine similarity between the word embedding and the keyword embeddings
53
  similarity_scores = [np.dot(word_emb, emb) / (np.linalg.norm(word_emb) * np.linalg.norm(emb)) for emb in embeddings]
54
 
55
+ # Return if the maximum similarity score is above a certain threshold
56
+ if max(similarity_scores) > 0.65:
57
+ return word
58
+
59
+ else:
60
+ return False
61
 
62
 
63
def identify_earthquake_event(input_sentence):
    """
    Decide whether a sentence refers to an earthquake event.

    The checks run in this order and stop at the first hit:
      1. exact match of a single token against the keyword lists,
      2. embedding similarity of a single token,
      3. exact match of a two-token window (bigram),
      4. embedding similarity of a bigram.

    All text is lower-cased before it is compared, so exact matching and
    embedding matching see the same strings.

    Returns:
        {'earthquaqe_event': [True, matched_text]} when a reference is found;
        (0, 'EARTHQUAQE_EVENT', 'no_earthquaqe_reference') when none is found;
        (0, 'EARTHQUAQE_EVENT', 'unknown_error') when any step raises.
        The spelling of the key and codes is kept unchanged, since it is part
        of the value handed back to callers.
    """

    try:
        # Terms to ignore: requests such as "bars with magnitude 6" were
        # otherwise being reported as earthquake references.
        excluded_keywords = ['magnitude', 'richter', 'moment', 'scale', 'intensity', 'amplitude', 'energy', 'force', 'power', 'seismicity']

        parsed_sentence = nlp(input_sentence)
        tokens = [token.text.lower() for token in parsed_sentence]

        # 1. Exact matching of single tokens (excluded terms are skipped)
        for token in tokens:
            if token in excluded_keywords:
                continue
            exact_hit = straight_pattern_matching(token)
            if exact_hit:
                return {'earthquaqe_event': [True, exact_hit]}

        # 2. Embedding similarity of single tokens; the first hit wins
        for token in tokens:
            similar_hit = compute_similarity_earthquake(token, earthquake_single_embeddings, excluded_keywords)
            if similar_hit:
                return {'earthquaqe_event': [True, similar_hit]}

        # Every window of two consecutive tokens, taken as a span so the
        # original spacing between the two tokens is preserved.
        bigrams = [parsed_sentence[start:start + 2].text.lower() for start in range(len(parsed_sentence) - 1)]

        # 3. Exact matching of bigrams. The bigram itself is looked up; the
        # earlier code passed the leftover single-token loop variable here,
        # so multi-word keywords were never matched exactly.
        for bigram in bigrams:
            exact_hit = straight_pattern_matching(bigram)
            if exact_hit:
                return {'earthquaqe_event': [True, exact_hit]}

        # 4. Embedding similarity of bigrams; the first hit wins
        for bigram in bigrams:
            similar_hit = compute_similarity_earthquake(bigram, earthquake_multi_embeddings, excluded_keywords)
            if similar_hit:
                return {'earthquaqe_event': [True, similar_hit]}

        # Nothing in the sentence points to an earthquake
        return (0, 'EARTHQUAQE_EVENT', 'no_earthquaqe_reference')

    except Exception:
        # Best-effort boundary: any failure is reported as an error code
        # instead of propagating. Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit pass through.
        return (0, 'EARTHQUAQE_EVENT', 'unknown_error')
 
 
 
 
 
 
 
 
 
 
 
 
142
 
 
143
 
144
 
145
  from transformers import pipeline
146
  import gradio as gr
147
 
148
# Heading and introduction text for the demo page ("earthquake" spelled
# correctly in both, since these strings are shown to the user).
title = "Earthquake events Demo"
description = (
    "This is a simple demo just for demonstration purposes for Serco team, "
    "to validate the results of the Natural Language module concerning the "
    "identification of earthquake events, while in progress"
)

# Sample inputs, one single-field row each. The last one mentions a magnitude
# but no earthquake term.
examples = [
    ["I want all earthquakes that are located in Italy, in 01/01/23 with magnitude greater than 6.2"],
    ["I want all seisms that are located in Italy, in 01/01/23 with magnitude greater than 6.2"],
    ["I want all earth shakes that are located in Italy, in 01/01/23 with magnitude greater than 6.2"],
    ["I want all bars that are located in Italy, in 01/01/23 with magnitude greater than 6.2"],
]
157
 
158