File size: 9,149 Bytes
18a8050
 
 
 
 
83be062
18a8050
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83be062
18a8050
83be062
 
18a8050
 
83be062
 
18a8050
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83be062
18a8050
83be062
18a8050
 
83be062
18a8050
 
 
 
 
 
83be062
18a8050
 
 
 
 
 
 
 
 
 
 
 
 
 
83be062
18a8050
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83be062
 
18a8050
83be062
 
 
18a8050
83be062
 
 
 
 
18a8050
83be062
 
 
 
 
 
 
 
18a8050
83be062
18a8050
83be062
 
 
 
18a8050
 
 
 
 
 
 
83be062
 
18a8050
 
 
 
83be062
18a8050
 
 
 
 
 
 
 
 
 
 
83be062
 
18a8050
83be062
18a8050
 
83be062
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import re
from difflib import SequenceMatcher
import requests
import xml.etree.ElementTree as ET
import gradio as gr
from concurrent.futures import ThreadPoolExecutor

# Hong Kong place names grouped as region -> district -> list of sub-district
# names. Every key and entry is a plain English string; clean_area_data()
# produces the lower-cased, whitespace-normalized copy used at import time.
#
# Each list holds a given name at most once (exact repeats inside a single
# list were redundant and have been removed).
# NOTE(review): "Kowloon Tong" appears under both "Yau Tsim Mong" and
# "Kowloon City", and "Tsz Wan Shan" under both "Wong Tai Sin" and
# "Kwun Tong". These cross-district repeats are kept as-is; confirm whether
# they are intentional.
areaData = {
    "Hong Kong": {
        "Central and Western": [
            "Sai Ying Pun", "Kennedy Town", "Shek Tong Tsui", "Sai Wan", "Sheung Wan",
            "Central", "Admiralty", "Mid-Levels West", "Mid-Levels", "The Peak"
        ],
        "Wan Chai": [
            "Wan Chai", "Causeway Bay", "Happy Valley", "Tai Hang", "Stubbs Road",
            "Jardine's Lookout"
        ],
        "Eastern": [
            "Tin Hau", "Braemar Hill", "North Point", "Quarry Bay", "Sai Wan Ho",
            "Shau Kei Wan", "Chai Wan", "Siu Sai Wan"
        ],
        "Southern": [
            "Pok Fu Lam", "Aberdeen", "Ap Lei Chau", "Wong Chuk Hang", "Shouson Hill",
            "Repulse Bay", "Chung Hom Kok", "Stanley", "Tai Tam", "Shek O", "Telegraph Bay"
        ]
    },
    "Kowloon": {
        "Yau Tsim Mong": [
            "Tsim Sha Tsui", "Yau Ma Tei", "West Kowloon", "Kowloon Tong", "Mong Kok",
            "Tai Kok Tsui", "Jordan", "Prince Edward"
        ],
        "Sham Shui Po": [
            "Mei Foo", "Lai Chi Kok", "Cheung Sha Wan", "Sham Shui Po", "Shek Kip Mei", "Tai Wo Ping", "Stonecutters Island"
        ],
        "Kowloon City": [
            "Hung Hom", "To Kwa Wan", "Ma Tau Kok", "Ma Tau Wai", "Kai Tak", "Kowloon City",
            "Ho Man Tin", "Kowloon Tong", "Beacon Hill"
        ],
        "Wong Tai Sin": [
            "San Po Kong", "Wong Tai Sin", "Tung Tau", "Wang Tau Hom", "Lok Fu", "Diamond Hill",
            "Tsz Wan Shan", "Ngau Chi Wan"
        ],
        "Kwun Tong": [
            "Ping Shek", "Kowloon Bay", "Ngau Tau Kok", "Tsz Wan Shan", "Kwun Tong",
            "Sau Mau Ping", "Lam Tin", "Yau Tong", "Lei Yue Mun"
        ]
    },
    "New Territories": {
        "Kwai Tsing": [
            "Kwai Chung", "Tsing Yi", "Kwai Fong"
        ],
        "Tsuen Wan": [
            "Tsuen Wan", "Tsing Lung Bridge", "Tsing Hung Bridge", "Shen Tsuen", "Tsing Chung Koon",
            "Ma Wan"
        ],
        "Tuen Mun": [
            "Tai Lam Chung", "Siu Lam", "Tuen Mun", "Lam Tei"
        ],
        "Yuen Long": [
            "Hung Shui Kiu", "Ha Tsuen", "Lau Fau Shan", "Tin Shui Wai", "Yuen Long", "San Tin",
            "Lok Ma Chau", "Kam Tin", "Shek Kong", "Pat Heung"
        ],
        "North": [
            "Fanling", "Luen Wo Hui", "Sheung Shui", "Shek Wu Hui", "Sha Tau Kok", "Lok Keng",
            "Wu Kau Tang"
        ],
        "Tai Po": [
            "Tai Po Market", "Tai Po", "Tai Po Kau", "Tai Mei Tuk", "Plover Cove", "Cheung Uk Tau",
            "Tai Wo"
        ],
        "Sha Tin": [
            "Tai Wai", "Sha Tin", "Fo Tan", "Ma On Shan", "Shui Chuen O"
        ],
        "Sai Kung": [
            "Clear Water Bay", "Sai Kung", "Tai Mong Tsai", "Tseung Kwan O", "Hang Hau",
            "Tiu Keng Leng", "Ma Yau Tong"
        ],
        "Islands": [
            "Cheung Chau", "Peng Chau", "Lantau Island", "Tung Chung", "Lamma Island"
        ]
    }
}


def normalize_text(text):
    """Return *text* lower-cased, trimmed, with whitespace runs squeezed to one space."""
    trimmed = text.lower().strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed


def normalize_address(address):
    """Return a canonical, upper-cased form of *address* for comparison.

    Every character that is neither a word character nor whitespace is
    removed, whitespace runs are collapsed to a single space, the result is
    trimmed and upper-cased.

    Punctuation is stripped *before* whitespace is collapsed: doing it the
    other way round leaves a double space wherever a punctuation mark sat
    between two spaces (e.g. ``"A , B"`` -> ``"A  B"``).
    """
    without_punctuation = re.sub(r'[^\w\s]', '', address)
    return re.sub(r'\s+', ' ', without_punctuation).strip().upper()


def load_and_normalize_address_pool(file_paths):
    """Read address files and return ``(original, normalized)`` pairs.

    Each file is read line by line; lines that are empty after stripping are
    skipped. Every remaining line is stored together with its
    ``normalize_address`` form.

    Args:
        file_paths: Iterable of paths to text files, one address per line.

    Returns:
        A list of ``(address, normalized_address)`` tuples in file order.
        Files that are missing or unreadable are reported on stdout and
        skipped, so the list may be empty.
    """
    address_pool = []
    for file_path in file_paths:
        try:
            # Decode explicitly instead of relying on the platform default,
            # which differs between hosts. "utf-8-sig" reads plain UTF-8 and
            # also drops a leading byte-order mark, which would otherwise be
            # glued to the first address of the file.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    address = line.strip()
                    if address:
                        address_pool.append((address, normalize_address(address)))
        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except Exception as e:
            # Kept broad so that one unreadable file never prevents the
            # remaining files from being loaded.
            print(f"Error reading file {file_path}: {e}")
    return address_pool


def similarity(a, b):
    """Return the ``SequenceMatcher`` ratio of *a* and *b*, ignoring space characters."""
    compact_a = a.replace(' ', '')
    compact_b = b.replace(' ', '')
    matcher = SequenceMatcher(None, compact_a, compact_b)
    return matcher.ratio()


def extract_relevant_part(user_input):
    """Split *user_input* into a number and the remaining address text.

    Returns:
        A ``(number_part, address_part)`` tuple of strings.
        ``number_part`` is the first run of digits found anywhere in the
        input, or ``''`` when there is none. ``address_part`` is the input
        with a leading number removed and surrounding whitespace trimmed.

    Note:
        Only a number at the very start of the input is removed from
        ``address_part``; a number found later in the text is returned as
        ``number_part`` but also stays in ``address_part``.
    """
    first_number = re.search(r'\d+', user_input)
    number_part = first_number.group() if first_number else ''
    # Allow leading whitespace before the number; otherwise " 12 Foo" would
    # keep its "12" in the address part while also reporting it as the number.
    address_part = re.sub(r'^\s*\d+', '', user_input).strip()
    return number_part, address_part


def match_address(user_input, address_pool):
    """Find the pool entry most similar to *user_input*.

    The input is split with ``extract_relevant_part``; its address part is
    normalized and scored against the normalized form of every pool entry.
    The first entry with the strictly highest score wins.

    Args:
        user_input: Address text to look up.
        address_pool: Iterable of ``(original_address, normalized_address)``.

    Returns:
        ``(best_match, score)``. ``best_match`` is the winning original
        address, prefixed with the extracted number when there is one, or
        ``None`` when no entry scores above 0 (then ``score`` is 0).
    """
    house_number, remainder = extract_relevant_part(user_input)
    query = normalize_address(remainder)

    top_candidate, top_score = None, 0
    for candidate, candidate_key in address_pool:
        score = similarity(query, candidate_key)
        if score <= top_score:
            continue
        top_candidate, top_score = candidate, score

    if top_candidate and house_number:
        top_candidate = f"{house_number} {top_candidate}".strip()
    return top_candidate, top_score


# Output label -> XML tag, for the English/Chinese premises address sections.
_ALS_ADDRESS_TAGS = {
    'Estate': 'EstateName',
    'Street': 'StreetName',
    'Building No': 'BuildingNoFrom',
    'District': 'DcDistrict',
    'Region': 'Region',
}

# Output label -> XML tag, for the geospatial section.
_ALS_GEO_TAGS = {
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
    'Northing': 'Northing',
    'Easting': 'Easting',
}

# (result key, XML section tag, label -> tag mapping), in output order.
_ALS_SECTIONS = (
    ('English Address', 'EngPremisesAddress', _ALS_ADDRESS_TAGS),
    ('Chinese Address', 'ChiPremisesAddress', _ALS_ADDRESS_TAGS),
    ('Geospatial Information', 'GeospatialInformation', _ALS_GEO_TAGS),
)


def _collect_als_fields(node, tags_by_label):
    """Return ``{label: text}`` for each mapped descendant tag of *node* ('' if absent)."""
    return {
        label: node.findtext(f".//{tag}", '')
        for label, tag in tags_by_label.items()
    }


def fetch_address_from_als_api(user_input, timeout=10):
    """Look up *user_input* via the ALS lookup endpoint and extract address details.

    The response body is parsed as XML. For the first ``EngPremisesAddress``,
    ``ChiPremisesAddress`` and ``GeospatialInformation`` elements found, a
    dict of selected fields is stored under 'English Address',
    'Chinese Address' and 'Geospatial Information' respectively. Sections
    missing from the response are omitted; missing fields are ``''``.

    Args:
        user_input: Address text; URL-quoted into the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        A dict of section name -> field dict on success (possibly empty), or
        an error message string when the request fails or the body is not
        well-formed XML.
    """
    api_url = f"https://www.als.gov.hk/lookup?q={requests.utils.quote(user_input)}"
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching data from ALS API: {e}"

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        # A malformed body is reported like any other lookup failure instead
        # of raising out of the caller's worker thread.
        return f"Error parsing ALS API response: {e}"

    result = {}
    for result_key, section_tag, tags_by_label in _ALS_SECTIONS:
        section = root.find(f".//{section_tag}")
        if section is not None:
            result[result_key] = _collect_als_fields(section, tags_by_label)
    return result


def extract_building_from_address(user_input):
    """Return the part of the normalized input that precedes the first comma.

    The input is first passed through ``normalize_text``. When the normalized
    text is empty or starts with a comma, the whole normalized text is
    returned unchanged.
    """
    cleaned = normalize_text(user_input)
    leading = re.match(r'([^,]+)', cleaned)
    if leading is None:
        return cleaned
    return leading.group(1).strip()


def address_search(user_inputs):
    """Match every address line in *user_inputs* and report the lookup details.

    Each non-blank line is reduced to its building part, matched against the
    module-level ``address_pool`` and, when a match exists, looked up through
    ``fetch_address_from_als_api``. Lines are processed concurrently in a
    thread pool; results keep the input order.

    Args:
        user_inputs: Text with one address per line (``None`` is treated as
            empty).

    Returns:
        One report per processed line, joined by a blank line. An empty
        string when there is nothing to process.
    """
    # Blank or whitespace-only lines carry no address: processing them would
    # scan the whole pool only to emit a meaningless "Best match: None" entry.
    user_inputs_list = [line for line in (user_inputs or "").splitlines() if line.strip()]
    if not user_inputs_list:
        return ""

    def process_input(user_input):
        building_part = extract_building_from_address(user_input)
        normalized_input = normalize_address(building_part)
        best_match, similarity_score = match_address(normalized_input, address_pool)
        als_result = fetch_address_from_als_api(best_match) if best_match else "No match found."

        parts = [f"Best match: {best_match} (Similarity: {similarity_score:.2f})\n"]
        if isinstance(als_result, dict):
            for address_type, details in als_result.items():
                parts.append(f"\n{address_type}:\n")
                parts.extend(f"{key}: {value}\n" for key, value in details.items())
        else:
            # Error text or the "No match found." placeholder.
            parts.append(als_result)
        return "".join(parts)

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order.
        return "\n\n".join(executor.map(process_input, user_inputs_list))


def clean_area_data(area_data):
    """Return a normalized copy of a region -> district -> names mapping.

    Region and district keys and all sub-district names are passed through
    ``normalize_text``. Sub-district names containing "Non-Building",
    "Invalid" or "Other" (case-insensitive) are dropped.
    """
    rejected = re.compile(r'Non-Building|Invalid|Other', re.I)

    def keep_valid(names):
        return [normalize_text(name) for name in names if rejected.search(name) is None]

    return {
        normalize_text(region): {
            normalize_text(district): keep_valid(subdistricts)
            for district, subdistricts in districts.items()
        }
        for region, districts in area_data.items()
    }


# Normalized copy of areaData, built once when the module is executed.
# NOTE(review): not referenced again in the visible part of this file.
cleaned_area_data = clean_area_data(areaData)
# Text files that feed the address pool. The paths are relative, so they are
# resolved against the process's current working directory.
file_paths = [
    'EngBuilding.txt',
    'EngEstate.txt',
    'EngStreet.txt',
    'EngVillage.txt'
]
# List of (original address, normalized address) tuples; address_search()
# passes it to match_address() for every input line.
address_pool = load_and_normalize_address_pool(file_paths)

# Gradio UI: a 10-line textbox for the addresses and a textbox for the report
# produced by address_search().
interface = gr.Interface(
    fn=address_search,
    inputs=gr.Textbox(label="Enter Addresses (one per line, allow Batch Processing)", lines=10),
    outputs=gr.Textbox(label="ALS API Results"),
    title="Address Lookup and Matching (English)",
    description="Enter addresses to find the closest matches and fetch details from the ALS API."
)

# Launches the app whenever this module is executed (there is no __main__ guard).
interface.launch()