luvici1111100 committed on
Commit
b523967
·
verified ·
1 Parent(s): 6496853

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -9
app.py CHANGED
@@ -236,6 +236,7 @@ def is_real_estate_ad(message_content):
236
  return False
237
 
238
  return is_ad
 
239
  def extract_ad_details(message_content):
240
  """ Extracts details from a real estate advertisement and translates the location. """
241
  all_extracted_ads = []
@@ -264,6 +265,12 @@ def extract_ad_details(message_content):
264
  'نوع العملية': '', 'Ad Text': '', 'price_value_numeric': None
265
  }
266
 
 
 
 
 
 
 
267
  for prop_type, pattern in property_type_patterns_original.items():
268
  if re.search(r'\b' + pattern + r'\b', block, re.IGNORECASE):
269
  details['نوع العقار'] = prop_type
@@ -278,7 +285,6 @@ def extract_ad_details(message_content):
278
  if unified_location:
279
  details['المنطقة'] = ARABIC_TO_ENGLISH_TRANSLATION.get(unified_location, unified_location)
280
 
281
- # --- START: MODIFICATION FOR AREA FILTERING ---
282
  area_patterns = [
283
  r'(\d+(?:[.,]\d+)?)\s*(?:متر|م|sqm|m2|م²|meter|امتار|Meter|Area|area)',
284
  r'(?:مساحة|مساحه|area|Area)\s*(\d+(?:[.,]\d+)?)',
@@ -288,22 +294,27 @@ def extract_ad_details(message_content):
288
  for pattern in area_patterns:
289
  match = re.search(pattern, block, re.IGNORECASE)
290
  if match:
 
 
 
 
 
 
 
291
  val_str = match.group(1) if match.group(1) else match.group(2)
292
  if val_str:
293
  val_numeric = float(val_str.translate(str.maketrans('٠١٢٣٤٥٦٧٨٩,', '0123456789.')))
294
- unit = 'F' if 'فدان' in match.group(0).lower() else 'm'
295
 
296
- # Filter out ads with area less than 21m (unless it's in Feddan)
297
- if unit == 'm' and val_numeric < 21:
298
- continue # Skip to the next ad block
299
 
300
  details['مساحة العقار'] = f"{val_numeric} {unit}"
301
  area_found = True
302
- break # Area found and passes filter, no need to check other patterns
303
 
304
  if not area_found:
305
- continue # Skip ad if no valid area is found
306
- # --- END: MODIFICATION FOR AREA FILTERING ---
307
 
308
  price_value = None
309
  currency_symbol = "EGP"
@@ -376,7 +387,6 @@ def extract_ad_details(message_content):
376
  all_extracted_ads.append(details)
377
 
378
  return all_extracted_ads
379
-
380
  def process_multiple_chat_files(files):
381
  """ Processes multiple chat files and returns extracted real estate ads with duplicates removed. """
382
  if not files:
 
236
  return False
237
 
238
  return is_ad
239
+
240
  def extract_ad_details(message_content):
241
  """ Extracts details from a real estate advertisement and translates the location. """
242
  all_extracted_ads = []
 
265
  'نوع العملية': '', 'Ad Text': '', 'price_value_numeric': None
266
  }
267
 
268
+ # --- START: MODIFICATION TO EXCLUDE LAND ---
269
+ # أولاً، تحقق مما إذا كان الإعلان عن "أرض" وقم بتجاهله فوراً
270
+ if re.search(r'\b(أرض|land|plot)\b', block, re.IGNORECASE):
271
+ continue # تجاهل هذا الإعلان وانتقل إلى التالي
272
+ # --- END: MODIFICATION TO EXCLUDE LAND ---
273
+
274
  for prop_type, pattern in property_type_patterns_original.items():
275
  if re.search(r'\b' + pattern + r'\b', block, re.IGNORECASE):
276
  details['نوع العقار'] = prop_type
 
285
  if unified_location:
286
  details['المنطقة'] = ARABIC_TO_ENGLISH_TRANSLATION.get(unified_location, unified_location)
287
 
 
288
  area_patterns = [
289
  r'(\d+(?:[.,]\d+)?)\s*(?:متر|م|sqm|m2|م²|meter|امتار|Meter|Area|area)',
290
  r'(?:مساحة|مساحه|area|Area)\s*(\d+(?:[.,]\d+)?)',
 
294
  for pattern in area_patterns:
295
  match = re.search(pattern, block, re.IGNORECASE)
296
  if match:
297
+ # --- START: MODIFICATION TO EXCLUDE FEDDAN ---
298
+ # تحقق من وجود كلمة "فدان" وتجاهل الإعلان إذا وجدت
299
+ if 'فدان' in match.group(0).lower():
300
+ area_found = False # تأكد من أن هذا الإعلان سيتم تجاهله
301
+ break # اخرج من حلقة البحث عن المساحة
302
+ # --- END: MODIFICATION TO EXCLUDE FEDDAN ---
303
+
304
  val_str = match.group(1) if match.group(1) else match.group(2)
305
  if val_str:
306
  val_numeric = float(val_str.translate(str.maketrans('٠١٢٣٤٥٦٧٨٩,', '0123456789.')))
307
+ unit = 'm'
308
 
309
+ if val_numeric < 21:
310
+ continue
 
311
 
312
  details['مساحة العقار'] = f"{val_numeric} {unit}"
313
  area_found = True
314
+ break
315
 
316
  if not area_found:
317
+ continue
 
318
 
319
  price_value = None
320
  currency_symbol = "EGP"
 
387
  all_extracted_ads.append(details)
388
 
389
  return all_extracted_ads
 
390
  def process_multiple_chat_files(files):
391
  """ Processes multiple chat files and returns extracted real estate ads with duplicates removed. """
392
  if not files: