Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
src/scrapers/google_maps_scraper.py
CHANGED
|
@@ -45,10 +45,11 @@ class GoogleMapsScraper:
|
|
| 45 |
"//div[@role='tablist']//button[contains(., 'Review')]",
|
| 46 |
],
|
| 47 |
|
| 48 |
-
# Scrollable container -
|
| 49 |
"scroll_container": [
|
| 50 |
-
"//div[@role='feed']",
|
| 51 |
"//div[contains(@class, 'm6QErb') and contains(@class, 'DxyBCb')]",
|
|
|
|
|
|
|
| 52 |
"//div[contains(@class, 'm6QErb')][@tabindex='-1']",
|
| 53 |
],
|
| 54 |
|
|
@@ -59,10 +60,11 @@ class GoogleMapsScraper:
|
|
| 59 |
"//div[contains(@class, 'jftiEf') and contains(@class, 'fontBodyMedium')]",
|
| 60 |
],
|
| 61 |
|
| 62 |
-
# Review text -
|
| 63 |
"review_text": [
|
| 64 |
-
".//span[@class='wiI7pd']",
|
| 65 |
".//span[contains(@class, 'wiI7pd')]",
|
|
|
|
|
|
|
| 66 |
".//div[contains(@class, 'MyEned')]//span",
|
| 67 |
],
|
| 68 |
|
|
@@ -163,8 +165,21 @@ class GoogleMapsScraper:
|
|
| 163 |
# Click "More" to expand if needed
|
| 164 |
self._expand_review(card)
|
| 165 |
|
| 166 |
-
# Extract text
|
| 167 |
-
text = self._extract_text(card, self.SELECTORS["review_text"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
if text and len(text.strip()) > 10:
|
| 170 |
collected_ids.add(card_id)
|
|
@@ -351,14 +366,16 @@ class GoogleMapsScraper:
|
|
| 351 |
return 0.0
|
| 352 |
|
| 353 |
def _scroll_down(self, container):
|
| 354 |
-
"""Scroll down in the container."""
|
| 355 |
try:
|
| 356 |
if container:
|
|
|
|
| 357 |
self.driver.execute_script(
|
| 358 |
-
"arguments[0].scrollTop = arguments[0].
|
| 359 |
container
|
| 360 |
)
|
| 361 |
else:
|
|
|
|
| 362 |
ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
|
| 363 |
except:
|
| 364 |
ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
|
|
|
|
| 45 |
"//div[@role='tablist']//button[contains(., 'Review')]",
|
| 46 |
],
|
| 47 |
|
| 48 |
+
# Scrollable container - from working version
|
| 49 |
"scroll_container": [
|
|
|
|
| 50 |
"//div[contains(@class, 'm6QErb') and contains(@class, 'DxyBCb')]",
|
| 51 |
+
"//div[contains(@class, 'XiKgde')]",
|
| 52 |
+
"//div[@role='feed']",
|
| 53 |
"//div[contains(@class, 'm6QErb')][@tabindex='-1']",
|
| 54 |
],
|
| 55 |
|
|
|
|
| 60 |
"//div[contains(@class, 'jftiEf') and contains(@class, 'fontBodyMedium')]",
|
| 61 |
],
|
| 62 |
|
| 63 |
+
# Review text - from working version with expanded text support
|
| 64 |
"review_text": [
|
|
|
|
| 65 |
".//span[contains(@class, 'wiI7pd')]",
|
| 66 |
+
".//span[@jsname='fbQN7e']", # Full expanded text
|
| 67 |
+
".//span[@jsname='bN97Pc']", # Truncated text
|
| 68 |
".//div[contains(@class, 'MyEned')]//span",
|
| 69 |
],
|
| 70 |
|
|
|
|
| 165 |
# Click "More" to expand if needed
|
| 166 |
self._expand_review(card)
|
| 167 |
|
| 168 |
+
# Extract text - try multiple approaches like working version
|
| 169 |
+
text = ""
|
| 170 |
+
# First try expanded text selectors
|
| 171 |
+
for selector in [".//span[@jsname='fbQN7e']", ".//span[contains(@class, 'wiI7pd')]"]:
|
| 172 |
+
try:
|
| 173 |
+
elem = card.find_element(By.XPATH, selector)
|
| 174 |
+
t = elem.text.strip()
|
| 175 |
+
if t and len(t) > len(text):
|
| 176 |
+
text = t
|
| 177 |
+
except:
|
| 178 |
+
continue
|
| 179 |
+
|
| 180 |
+
# Fallback to general text selectors
|
| 181 |
+
if not text:
|
| 182 |
+
text = self._extract_text(card, self.SELECTORS["review_text"])
|
| 183 |
|
| 184 |
if text and len(text.strip()) > 10:
|
| 185 |
collected_ids.add(card_id)
|
|
|
|
| 366 |
return 0.0
|
| 367 |
|
| 368 |
def _scroll_down(self, container):
|
| 369 |
+
"""Scroll down in the container - matches working version."""
|
| 370 |
try:
|
| 371 |
if container:
|
| 372 |
+
# Scroll to bottom (like working version)
|
| 373 |
self.driver.execute_script(
|
| 374 |
+
"arguments[0].scrollTop = arguments[0].scrollHeight",
|
| 375 |
container
|
| 376 |
)
|
| 377 |
else:
|
| 378 |
+
# Fallback: scroll the page
|
| 379 |
ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
|
| 380 |
except:
|
| 381 |
ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()
|