Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ from difflib import SequenceMatcher
|
|
| 3 |
import requests
|
| 4 |
import xml.etree.ElementTree as ET
|
| 5 |
import gradio as gr
|
|
|
|
| 6 |
|
| 7 |
areaData = {
|
| 8 |
"Hong Kong": {
|
|
@@ -80,15 +81,14 @@ areaData = {
|
|
| 80 |
}
|
| 81 |
}
|
| 82 |
|
|
|
|
| 83 |
def normalize_text(text):
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
return standardized
|
| 87 |
|
| 88 |
def normalize_address(address):
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
return address.strip().upper()
|
| 92 |
|
| 93 |
def load_and_normalize_address_pool(file_paths):
|
| 94 |
address_pool = []
|
|
@@ -106,15 +106,19 @@ def load_and_normalize_address_pool(file_paths):
|
|
| 106 |
print(f"Error reading file {file_path}: {e}")
|
| 107 |
return address_pool
|
| 108 |
|
|
|
|
| 109 |
def similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of two strings.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means the sequences are identical.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
|
| 111 |
|
|
|
|
| 112 |
def extract_relevant_part(user_input):
    """Split an address into its leading street number and the remainder.

    Only a digit run at the very start of the (whitespace-trimmed) input is
    treated as the number. Digits that appear later in the text stay in the
    address part and are not reported as the number, so the two returned
    pieces never overlap.

    Args:
        user_input: Raw address text.

    Returns:
        A ``(number_part, address_part)`` tuple of strings. ``number_part``
        is ``''`` when the input does not start with digits.
    """
    trimmed = user_input.strip()
    leading_digits = re.match(r'\d+', trimmed)
    if leading_digits is None:
        return '', trimmed
    # Everything after the leading number is the address to be matched.
    return leading_digits.group(), trimmed[leading_digits.end():].strip()
|
| 117 |
|
|
|
|
| 118 |
def match_address(user_input, address_pool):
|
| 119 |
number_part, address_part = extract_relevant_part(user_input)
|
| 120 |
normalized_input = normalize_address(address_part)
|
|
@@ -129,6 +133,7 @@ def match_address(user_input, address_pool):
|
|
| 129 |
best_match = f"{number_part} {best_match}".strip() if number_part else best_match
|
| 130 |
return best_match, highest_similarity
|
| 131 |
|
|
|
|
| 132 |
def fetch_address_from_als_api(user_input):
|
| 133 |
api_url = f"https://www.als.gov.hk/lookup?q={requests.utils.quote(user_input)}"
|
| 134 |
try:
|
|
@@ -175,27 +180,34 @@ def fetch_address_from_als_api(user_input):
|
|
| 175 |
def extract_building_from_address(user_input):
|
| 176 |
normalized_input = normalize_text(user_input)
|
| 177 |
match = re.match(r'([^,]+)', normalized_input)
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
-
|
|
|
|
|
|
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
-
|
| 190 |
-
if isinstance(als_result, dict):
|
| 191 |
-
for address_type, details in als_result.items():
|
| 192 |
-
result_str += f"\n{address_type}:\n"
|
| 193 |
-
for key, value in details.items():
|
| 194 |
-
result_str += f"{key}: {value}\n"
|
| 195 |
-
else:
|
| 196 |
-
result_str += als_result
|
| 197 |
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
|
| 201 |
def clean_area_data(area_data):
|
|
@@ -203,11 +215,13 @@ def clean_area_data(area_data):
|
|
| 203 |
for region, districts in area_data.items():
|
| 204 |
cleaned_districts = {}
|
| 205 |
for district, subdistricts in districts.items():
|
| 206 |
-
valid_subdistricts = [normalize_text(name) for name in subdistricts if
|
|
|
|
| 207 |
cleaned_districts[normalize_text(district)] = valid_subdistricts
|
| 208 |
cleaned_area_data[normalize_text(region)] = cleaned_districts
|
| 209 |
return cleaned_area_data
|
| 210 |
|
|
|
|
| 211 |
cleaned_area_data = clean_area_data(areaData)
|
| 212 |
file_paths = [
|
| 213 |
'EngBuilding.txt',
|
|
@@ -219,10 +233,10 @@ address_pool = load_and_normalize_address_pool(file_paths)
|
|
| 219 |
|
| 220 |
interface = gr.Interface(
|
| 221 |
fn=address_search,
|
| 222 |
-
inputs=gr.Textbox(label="Enter
|
| 223 |
-
outputs=gr.Textbox(label="ALS API
|
| 224 |
title="Address Lookup and Matching (English)",
|
| 225 |
-
description="Enter
|
| 226 |
)
|
| 227 |
|
| 228 |
-
interface.launch()
|
|
|
|
| 3 |
import requests
|
| 4 |
import xml.etree.ElementTree as ET
|
| 5 |
import gradio as gr
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 7 |
|
| 8 |
areaData = {
|
| 9 |
"Hong Kong": {
|
|
|
|
| 81 |
}
|
| 82 |
}
|
| 83 |
|
| 84 |
+
|
| 85 |
def normalize_text(text):
    """Lower-case ``text``, trim it, and collapse whitespace runs to one space.

    Args:
        text: String to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    trimmed = lowered.strip()
    return re.sub(r'\s+', ' ', trimmed)
|
| 87 |
+
|
|
|
|
| 88 |
|
| 89 |
def normalize_address(address):
    """Return an upper-cased address with punctuation removed and spacing collapsed.

    Punctuation (anything that is neither a word character nor whitespace)
    is deleted first and whitespace is collapsed afterwards; doing it in the
    opposite order leaves a double space wherever punctuation sat between
    two spaces (e.g. ``"A , B"``).

    Args:
        address: Address text to normalize.

    Returns:
        The normalized address string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', address)
    return re.sub(r'\s+', ' ', without_punctuation).strip().upper()
|
| 91 |
+
|
|
|
|
| 92 |
|
| 93 |
def load_and_normalize_address_pool(file_paths):
|
| 94 |
address_pool = []
|
|
|
|
| 106 |
print(f"Error reading file {file_path}: {e}")
|
| 107 |
return address_pool
|
| 108 |
|
| 109 |
+
|
| 110 |
def similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio of two strings, ignoring spaces.

    Every space character is removed from both strings before comparing, so
    differences in spacing alone do not lower the score.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means the space-stripped strings
        are identical.
    """
    compact_a = a.replace(' ', '')
    compact_b = b.replace(' ', '')
    matcher = SequenceMatcher(None, compact_a, compact_b)
    return matcher.ratio()
|
| 113 |
|
| 114 |
+
|
| 115 |
def extract_relevant_part(user_input):
    """Split an address into its leading street number and the remainder.

    Only a digit run at the very start of the (whitespace-trimmed) input is
    treated as the number. Digits that appear later in the text stay in the
    address part and are not reported as the number, so the two returned
    pieces never overlap.

    Args:
        user_input: Raw address text.

    Returns:
        A ``(number_part, address_part)`` tuple of strings. ``number_part``
        is ``''`` when the input does not start with digits.
    """
    trimmed = user_input.strip()
    leading_digits = re.match(r'\d+', trimmed)
    if leading_digits is None:
        return '', trimmed
    # Everything after the leading number is the address to be matched.
    return leading_digits.group(), trimmed[leading_digits.end():].strip()
|
| 120 |
|
| 121 |
+
|
| 122 |
def match_address(user_input, address_pool):
|
| 123 |
number_part, address_part = extract_relevant_part(user_input)
|
| 124 |
normalized_input = normalize_address(address_part)
|
|
|
|
| 133 |
best_match = f"{number_part} {best_match}".strip() if number_part else best_match
|
| 134 |
return best_match, highest_similarity
|
| 135 |
|
| 136 |
+
|
| 137 |
def fetch_address_from_als_api(user_input):
|
| 138 |
api_url = f"https://www.als.gov.hk/lookup?q={requests.utils.quote(user_input)}"
|
| 139 |
try:
|
|
|
|
| 180 |
def extract_building_from_address(user_input):
    """Return the text before the first comma of the normalized input.

    The input is first passed through ``normalize_text``. If the normalized
    text is empty or begins with a comma (so there is no leading segment),
    the whole normalized text is returned unchanged.

    Args:
        user_input: Raw address text.

    Returns:
        The stripped leading segment, or the normalized input when no such
        segment exists.
    """
    cleaned = normalize_text(user_input)
    first_segment = re.match(r'([^,]+)', cleaned)
    if first_segment is None:
        return cleaned
    return first_segment.group(1).strip()
|
| 184 |
+
|
| 185 |
|
| 186 |
+
def address_search(user_inputs):
    """Look up every address in a multi-line string and format the results.

    Each non-blank line is treated as one address: its building part is
    extracted and normalized, matched against the module-level
    ``address_pool``, and the best match is sent to the ALS API. Lines are
    processed concurrently in a thread pool; ``Executor.map`` yields results
    in input order, so the output order follows the input order.

    Args:
        user_inputs: Text with one address per line. ``None`` or text that
            contains only blank lines produces an empty result.

    Returns:
        One formatted result per address, separated by blank lines.
    """
    # Blank lines carry no address; skipping them avoids pointless matching
    # work and ALS API requests.
    user_inputs_list = [
        line for line in (user_inputs or "").splitlines() if line.strip()
    ]
    if not user_inputs_list:
        return ""

    def process_input(user_input):
        """Match one address line and render its result text."""
        building_part = extract_building_from_address(user_input)
        normalized_input = normalize_address(building_part)
        best_match, similarity_score = match_address(normalized_input, address_pool)
        als_result = fetch_address_from_als_api(best_match) if best_match else "No match found."

        parts = [f"Best match: {best_match} (Similarity: {similarity_score:.2f})\n"]
        if isinstance(als_result, dict):
            # A dict result maps an address type to a dict of detail fields.
            for address_type, details in als_result.items():
                parts.append(f"\n{address_type}:\n")
                parts.extend(f"{key}: {value}\n" for key, value in details.items())
        else:
            # Any non-dict result is appended verbatim as message text.
            parts.append(als_result)
        return "".join(parts)

    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_input, user_inputs_list))

    return "\n\n".join(results)
|
| 211 |
|
| 212 |
|
| 213 |
def clean_area_data(area_data):
|
|
|
|
| 215 |
for region, districts in area_data.items():
|
| 216 |
cleaned_districts = {}
|
| 217 |
for district, subdistricts in districts.items():
|
| 218 |
+
valid_subdistricts = [normalize_text(name) for name in subdistricts if
|
| 219 |
+
not re.search(r'Non-Building|Invalid|Other', name, re.I)]
|
| 220 |
cleaned_districts[normalize_text(district)] = valid_subdistricts
|
| 221 |
cleaned_area_data[normalize_text(region)] = cleaned_districts
|
| 222 |
return cleaned_area_data
|
| 223 |
|
| 224 |
+
|
| 225 |
cleaned_area_data = clean_area_data(areaData)
|
| 226 |
file_paths = [
|
| 227 |
'EngBuilding.txt',
|
|
|
|
| 233 |
|
| 234 |
# Gradio UI: a multi-line textbox feeds `address_search`, whose combined
# result text is shown in a single output textbox.
interface = gr.Interface(
    fn=address_search,
    inputs=gr.Textbox(
        label="Enter Addresses (one per line, allow Batch Processing)",
        lines=10,
    ),
    outputs=gr.Textbox(label="ALS API Results"),
    title="Address Lookup and Matching (English)",
    description="Enter addresses to find the closest matches and fetch details from the ALS API.",
)

# Start the web app when this module is executed.
interface.launch()
|