Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ from haversine import haversine, Unit
|
|
| 16 |
dataset=None
|
| 17 |
|
| 18 |
|
|
|
|
| 19 |
def generate_human_readable(tokens,labels):
|
| 20 |
ret = []
|
| 21 |
for t,lab in zip(tokens,labels):
|
|
@@ -49,12 +50,8 @@ def getSlice(tensor):
|
|
| 49 |
|
| 50 |
def getIndex(input):
|
| 51 |
|
| 52 |
-
# Model name from Hugging Face model hub
|
| 53 |
-
model_name = "zekun-li/geolm-base-toponym-recognition"
|
| 54 |
|
| 55 |
-
|
| 56 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 57 |
-
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
| 58 |
|
| 59 |
# Tokenize input sentence
|
| 60 |
tokens = tokenizer.encode(input, return_tensors="pt")
|
|
@@ -126,11 +123,7 @@ def cutSlices(tensor, slicesList):
|
|
| 126 |
def MLearningFormInput(input):
|
| 127 |
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 132 |
-
|
| 133 |
-
model = GeoLMModel.from_pretrained(model_name)
|
| 134 |
|
| 135 |
tokens = tokenizer.encode(input, return_tensors="pt")
|
| 136 |
|
|
@@ -181,11 +174,8 @@ def generate_human_readable(tokens,labels):
|
|
| 181 |
|
| 182 |
def getLocationName(input_sentence):
|
| 183 |
# Model name from Hugging Face model hub
|
| 184 |
-
|
| 185 |
|
| 186 |
-
# Load tokenizer and model
|
| 187 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 188 |
-
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
| 189 |
|
| 190 |
# Tokenize input sentence
|
| 191 |
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
|
|
@@ -278,12 +268,14 @@ def search_geonames(toponym, df):
|
|
| 278 |
|
| 279 |
def get50Neigbors(locationID, dataset, k=50):
|
| 280 |
|
|
|
|
|
|
|
| 281 |
input_row = dataset.loc[dataset['GeonameID'] == locationID].iloc[0]
|
| 282 |
|
| 283 |
|
| 284 |
lat, lon, geohash,name = input_row['Latitude'], input_row['Longitude'], input_row['Geohash'], input_row['Name']
|
| 285 |
|
| 286 |
-
filtered_dataset = dataset.loc[dataset['Geohash'].str.startswith(geohash[:
|
| 287 |
|
| 288 |
filtered_dataset['distance'] = filtered_dataset.apply(
|
| 289 |
lambda row: haversine((lat, lon), (row['Latitude'], row['Longitude']), Unit.KILOMETERS),
|
|
@@ -291,6 +283,10 @@ def get50Neigbors(locationID, dataset, k=50):
|
|
| 291 |
).copy()
|
| 292 |
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
filtered_dataset = filtered_dataset.sort_values(by='distance')
|
| 295 |
|
| 296 |
|
|
@@ -301,9 +297,8 @@ def get50Neigbors(locationID, dataset, k=50):
|
|
| 301 |
neighbors=nearest_neighbors.values.tolist()
|
| 302 |
|
| 303 |
|
| 304 |
-
|
| 305 |
|
| 306 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 307 |
|
| 308 |
sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
|
| 309 |
cls_token_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
|
|
@@ -328,8 +323,6 @@ def get50Neigbors(locationID, dataset, k=50):
|
|
| 328 |
|
| 329 |
#--------------------------------------------
|
| 330 |
|
| 331 |
-
model = GeoLMModel.from_pretrained(model_name)
|
| 332 |
-
|
| 333 |
|
| 334 |
tokens = torch.Tensor(neighbor_token_list).unsqueeze(0).long()
|
| 335 |
|
|
@@ -351,6 +344,9 @@ def get50Neigbors(locationID, dataset, k=50):
|
|
| 351 |
res=cutSlices(outputs.last_hidden_state, [targetIndex])
|
| 352 |
|
| 353 |
|
|
|
|
|
|
|
|
|
|
| 354 |
return res
|
| 355 |
|
| 356 |
|
|
@@ -374,22 +370,55 @@ def cosine_similarity(target_feature, candidate_feature):
|
|
| 374 |
|
| 375 |
def getCSV():
    """Read ``geohash.csv`` (path relative to the working directory).

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv('geohash.csv')
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
def showing(df):
|
| 381 |
|
| 382 |
m = folium.Map(location=[df['lat'].mean(), df['lon'].mean()], zoom_start=5)
|
| 383 |
|
| 384 |
-
size_scale = 100
|
| 385 |
-
color_scale = 255
|
| 386 |
-
|
| 387 |
for i in range(len(df)):
|
| 388 |
lat, lon, prob = df.iloc[i]['lat'], df.iloc[i]['lon'], df.iloc[i]['prob']
|
| 389 |
|
| 390 |
size = int(prob**2 * size_scale )
|
| 391 |
color = int(prob**2 * color_scale)
|
| 392 |
|
|
|
|
| 393 |
folium.CircleMarker(
|
| 394 |
location=[lat, lon],
|
| 395 |
radius=size,
|
|
@@ -398,8 +427,10 @@ def showing(df):
|
|
| 398 |
fill_color=f'#{color:02X}0000'
|
| 399 |
).add_to(m)
|
| 400 |
|
|
|
|
| 401 |
m.save("map.html")
|
| 402 |
|
|
|
|
| 403 |
with open("map.html", "r", encoding="utf-8") as f:
|
| 404 |
map_html = f.read()
|
| 405 |
|
|
|
|
| 16 |
dataset=None
|
| 17 |
|
| 18 |
|
| 19 |
+
|
| 20 |
def generate_human_readable(tokens,labels):
|
| 21 |
ret = []
|
| 22 |
for t,lab in zip(tokens,labels):
|
|
|
|
| 50 |
|
| 51 |
def getIndex(input):
|
| 52 |
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
tokenizer, model= getModel1()
|
|
|
|
|
|
|
| 55 |
|
| 56 |
# Tokenize input sentence
|
| 57 |
tokens = tokenizer.encode(input, return_tensors="pt")
|
|
|
|
| 123 |
def MLearningFormInput(input):
|
| 124 |
|
| 125 |
|
| 126 |
+
tokenizer,model=getModel2()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
tokens = tokenizer.encode(input, return_tensors="pt")
|
| 129 |
|
|
|
|
| 174 |
|
| 175 |
def getLocationName(input_sentence):
|
| 176 |
# Model name from Hugging Face model hub
|
| 177 |
+
tokenizer, model= getModel1()
|
| 178 |
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
# Tokenize input sentence
|
| 181 |
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
|
|
|
|
| 268 |
|
| 269 |
def get50Neigbors(locationID, dataset, k=50):
|
| 270 |
|
| 271 |
+
print("neighbor part----------------------------------------------------------------")
|
| 272 |
+
|
| 273 |
input_row = dataset.loc[dataset['GeonameID'] == locationID].iloc[0]
|
| 274 |
|
| 275 |
|
| 276 |
lat, lon, geohash,name = input_row['Latitude'], input_row['Longitude'], input_row['Geohash'], input_row['Name']
|
| 277 |
|
| 278 |
+
filtered_dataset = dataset.loc[dataset['Geohash'].str.startswith(geohash[:7])].copy()
|
| 279 |
|
| 280 |
filtered_dataset['distance'] = filtered_dataset.apply(
|
| 281 |
lambda row: haversine((lat, lon), (row['Latitude'], row['Longitude']), Unit.KILOMETERS),
|
|
|
|
| 283 |
).copy()
|
| 284 |
|
| 285 |
|
| 286 |
+
print("neighbor end----------------------------------------------------------------")
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
|
| 290 |
filtered_dataset = filtered_dataset.sort_values(by='distance')
|
| 291 |
|
| 292 |
|
|
|
|
| 297 |
neighbors=nearest_neighbors.values.tolist()
|
| 298 |
|
| 299 |
|
| 300 |
+
tokenizer, model= getModel1_0()
|
| 301 |
|
|
|
|
| 302 |
|
| 303 |
sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
|
| 304 |
cls_token_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
|
|
|
|
| 323 |
|
| 324 |
#--------------------------------------------
|
| 325 |
|
|
|
|
|
|
|
| 326 |
|
| 327 |
tokens = torch.Tensor(neighbor_token_list).unsqueeze(0).long()
|
| 328 |
|
|
|
|
| 344 |
res=cutSlices(outputs.last_hidden_state, [targetIndex])
|
| 345 |
|
| 346 |
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
|
| 350 |
return res
|
| 351 |
|
| 352 |
|
|
|
|
| 370 |
|
| 371 |
def getCSV():
    """Read ``geohash.csv`` (path relative to the working directory).

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv('geohash.csv')
|
| 374 |
|
| 375 |
+
@st.cache_resource
def getModel1():
    """Load the toponym-recognition tokenizer and token-classification model.

    Cached with ``st.cache_resource`` rather than ``st.cache_data``:
    ``cache_data`` serializes the return value and hands back a fresh copy on
    every call, which is the wrong tool for a tokenizer/model pair.
    ``cache_resource`` keeps the loaded objects and returns the same
    instances on later calls.

    NOTE: because the cached objects are shared between callers, they should
    be treated as read-only (inference only).

    Returns:
        A ``(tokenizer, model)`` tuple loaded from the
        ``zekun-li/geolm-base-toponym-recognition`` checkpoint.
    """
    # Model name from Hugging Face model hub
    model_name = "zekun-li/geolm-base-toponym-recognition"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    return tokenizer, model
|
| 386 |
+
|
| 387 |
+
@st.cache_resource
def getModel1_0():
    """Load the toponym-recognition checkpoint as a ``GeoLMModel``.

    Uses the same checkpoint name as ``getModel1`` but instantiates
    ``GeoLMModel`` instead of ``AutoModelForTokenClassification``.

    Cached with ``st.cache_resource`` so the tokenizer and model are loaded
    once and reused, instead of being re-read on every call.

    NOTE: because the cached objects are shared between callers, they should
    be treated as read-only (inference only).

    Returns:
        A ``(tokenizer, model)`` tuple loaded from the
        ``zekun-li/geolm-base-toponym-recognition`` checkpoint.
    """
    # Model name from Hugging Face model hub
    model_name = "zekun-li/geolm-base-toponym-recognition"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = GeoLMModel.from_pretrained(model_name)

    return tokenizer, model
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
@st.cache_resource
def getModel2():
    """Load the base GeoLM tokenizer and ``GeoLMModel``.

    Cached with ``st.cache_resource`` so the tokenizer and model are loaded
    once and reused, instead of being re-read on every call.

    NOTE: because the cached objects are shared between callers, they should
    be treated as read-only (inference only).

    Returns:
        A ``(tokenizer, model)`` tuple loaded from the
        ``zekun-li/geolm-base-cased`` checkpoint.
    """
    # Model name from Hugging Face model hub
    model_name = "zekun-li/geolm-base-cased"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = GeoLMModel.from_pretrained(model_name)

    return tokenizer, model
|
| 407 |
+
|
| 408 |
+
|
| 409 |
def showing(df):
|
| 410 |
|
| 411 |
m = folium.Map(location=[df['lat'].mean(), df['lon'].mean()], zoom_start=5)
|
| 412 |
|
| 413 |
+
size_scale = 100
|
| 414 |
+
color_scale = 255
|
|
|
|
| 415 |
for i in range(len(df)):
|
| 416 |
lat, lon, prob = df.iloc[i]['lat'], df.iloc[i]['lon'], df.iloc[i]['prob']
|
| 417 |
|
| 418 |
size = int(prob**2 * size_scale )
|
| 419 |
color = int(prob**2 * color_scale)
|
| 420 |
|
| 421 |
+
# 在Folium地图上添加标记
|
| 422 |
folium.CircleMarker(
|
| 423 |
location=[lat, lon],
|
| 424 |
radius=size,
|
|
|
|
| 427 |
fill_color=f'#{color:02X}0000'
|
| 428 |
).add_to(m)
|
| 429 |
|
| 430 |
+
# 保存Folium地图为HTML文件
|
| 431 |
m.save("map.html")
|
| 432 |
|
| 433 |
+
# 在Streamlit中嵌入HTML文件
|
| 434 |
with open("map.html", "r", encoding="utf-8") as f:
|
| 435 |
map_html = f.read()
|
| 436 |
|