Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -236,6 +236,7 @@ def is_real_estate_ad(message_content):
|
|
| 236 |
return False
|
| 237 |
|
| 238 |
return is_ad
|
|
|
|
| 239 |
def extract_ad_details(message_content):
|
| 240 |
""" Extracts details from a real estate advertisement and translates the location. """
|
| 241 |
all_extracted_ads = []
|
|
@@ -264,6 +265,12 @@ def extract_ad_details(message_content):
|
|
| 264 |
'نوع العملية': '', 'Ad Text': '', 'price_value_numeric': None
|
| 265 |
}
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
for prop_type, pattern in property_type_patterns_original.items():
|
| 268 |
if re.search(r'\b' + pattern + r'\b', block, re.IGNORECASE):
|
| 269 |
details['نوع العقار'] = prop_type
|
|
@@ -278,7 +285,6 @@ def extract_ad_details(message_content):
|
|
| 278 |
if unified_location:
|
| 279 |
details['المنطقة'] = ARABIC_TO_ENGLISH_TRANSLATION.get(unified_location, unified_location)
|
| 280 |
|
| 281 |
-
# --- START: MODIFICATION FOR AREA FILTERING ---
|
| 282 |
area_patterns = [
|
| 283 |
r'(\d+(?:[.,]\d+)?)\s*(?:متر|م|sqm|m2|م²|meter|امتار|Meter|Area|area)',
|
| 284 |
r'(?:مساحة|مساحه|area|Area)\s*(\d+(?:[.,]\d+)?)',
|
|
@@ -288,22 +294,27 @@ def extract_ad_details(message_content):
|
|
| 288 |
for pattern in area_patterns:
|
| 289 |
match = re.search(pattern, block, re.IGNORECASE)
|
| 290 |
if match:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
val_str = match.group(1) if match.group(1) else match.group(2)
|
| 292 |
if val_str:
|
| 293 |
val_numeric = float(val_str.translate(str.maketrans('٠١٢٣٤٥٦٧٨٩,', '0123456789.')))
|
| 294 |
-
unit = '
|
| 295 |
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
continue # Skip to the next ad block
|
| 299 |
|
| 300 |
details['مساحة العقار'] = f"{val_numeric} {unit}"
|
| 301 |
area_found = True
|
| 302 |
-
break
|
| 303 |
|
| 304 |
if not area_found:
|
| 305 |
-
continue
|
| 306 |
-
# --- END: MODIFICATION FOR AREA FILTERING ---
|
| 307 |
|
| 308 |
price_value = None
|
| 309 |
currency_symbol = "EGP"
|
|
@@ -376,7 +387,6 @@ def extract_ad_details(message_content):
|
|
| 376 |
all_extracted_ads.append(details)
|
| 377 |
|
| 378 |
return all_extracted_ads
|
| 379 |
-
|
| 380 |
def process_multiple_chat_files(files):
|
| 381 |
""" Processes multiple chat files and returns extracted real estate ads with duplicates removed. """
|
| 382 |
if not files:
|
|
|
|
| 236 |
return False
|
| 237 |
|
| 238 |
return is_ad
|
| 239 |
+
|
| 240 |
def extract_ad_details(message_content):
|
| 241 |
""" Extracts details from a real estate advertisement and translates the location. """
|
| 242 |
all_extracted_ads = []
|
|
|
|
| 265 |
'نوع العملية': '', 'Ad Text': '', 'price_value_numeric': None
|
| 266 |
}
|
| 267 |
|
| 268 |
+
# --- START: MODIFICATION TO EXCLUDE LAND ---
|
| 269 |
+
# أولاً، تحقق مما إذا كان الإعلان عن "أرض" وقم بتجاهله فوراً
|
| 270 |
+
if re.search(r'\b(أرض|land|plot)\b', block, re.IGNORECASE):
|
| 271 |
+
continue # تجاهل هذا الإعلان وانتقل إلى التالي
|
| 272 |
+
# --- END: MODIFICATION TO EXCLUDE LAND ---
|
| 273 |
+
|
| 274 |
for prop_type, pattern in property_type_patterns_original.items():
|
| 275 |
if re.search(r'\b' + pattern + r'\b', block, re.IGNORECASE):
|
| 276 |
details['نوع العقار'] = prop_type
|
|
|
|
| 285 |
if unified_location:
|
| 286 |
details['المنطقة'] = ARABIC_TO_ENGLISH_TRANSLATION.get(unified_location, unified_location)
|
| 287 |
|
|
|
|
| 288 |
area_patterns = [
|
| 289 |
r'(\d+(?:[.,]\d+)?)\s*(?:متر|م|sqm|m2|م²|meter|امتار|Meter|Area|area)',
|
| 290 |
r'(?:مساحة|مساحه|area|Area)\s*(\d+(?:[.,]\d+)?)',
|
|
|
|
| 294 |
for pattern in area_patterns:
|
| 295 |
match = re.search(pattern, block, re.IGNORECASE)
|
| 296 |
if match:
|
| 297 |
+
# --- START: MODIFICATION TO EXCLUDE FEDDAN ---
|
| 298 |
+
# تحقق من وجود كلمة "فدان" وتجاهل الإعلان إذا وجدت
|
| 299 |
+
if 'فدان' in match.group(0).lower():
|
| 300 |
+
area_found = False # تأكد من أن هذا الإعلان سيتم تجاهله
|
| 301 |
+
break # اخرج من حلقة البحث عن المساحة
|
| 302 |
+
# --- END: MODIFICATION TO EXCLUDE FEDDAN ---
|
| 303 |
+
|
| 304 |
val_str = match.group(1) if match.group(1) else match.group(2)
|
| 305 |
if val_str:
|
| 306 |
val_numeric = float(val_str.translate(str.maketrans('٠١٢٣٤٥٦٧٨٩,', '0123456789.')))
|
| 307 |
+
unit = 'm'
|
| 308 |
|
| 309 |
+
if val_numeric < 21:
|
| 310 |
+
continue
|
|
|
|
| 311 |
|
| 312 |
details['مساحة العقار'] = f"{val_numeric} {unit}"
|
| 313 |
area_found = True
|
| 314 |
+
break
|
| 315 |
|
| 316 |
if not area_found:
|
| 317 |
+
continue
|
|
|
|
| 318 |
|
| 319 |
price_value = None
|
| 320 |
currency_symbol = "EGP"
|
|
|
|
| 387 |
all_extracted_ads.append(details)
|
| 388 |
|
| 389 |
return all_extracted_ads
|
|
|
|
| 390 |
def process_multiple_chat_files(files):
|
| 391 |
""" Processes multiple chat files and returns extracted real estate ads with duplicates removed. """
|
| 392 |
if not files:
|