Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -228,8 +228,9 @@ for key, default in [
|
|
| 228 |
# ── Utilities ──────────────────────────────────────────────────────────────────
|
| 229 |
|
| 230 |
def clean_text(text):
|
| 231 |
-
|
| 232 |
-
text = re.sub(r'[
|
|
|
|
| 233 |
return text.strip()
|
| 234 |
|
| 235 |
def is_valid_url(url):
|
|
@@ -255,56 +256,51 @@ def scrape_website(url):
|
|
| 255 |
browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
|
| 256 |
page = browser.new_page()
|
| 257 |
try:
|
| 258 |
-
# Use networkidle so JS-rendered and late-loading content is fully ready
|
| 259 |
page.goto(url, wait_until="networkidle", timeout=45000)
|
| 260 |
title = page.title()
|
| 261 |
|
| 262 |
-
#
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
".main-content",
|
| 268 |
-
"#main",
|
| 269 |
-
"article",
|
| 270 |
-
".container", # generic bootstrap layouts
|
| 271 |
-
"#wrapper",
|
| 272 |
-
".page-content",
|
| 273 |
-
".site-content",
|
| 274 |
-
]
|
| 275 |
-
el = None
|
| 276 |
-
for sel in priority_selectors:
|
| 277 |
try:
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
| 281 |
except:
|
| 282 |
continue
|
| 283 |
|
| 284 |
-
#
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
body = page.query_selector("body")
|
| 304 |
-
|
| 305 |
|
| 306 |
-
logging.info(f"Scraped {len(
|
| 307 |
-
return {"title": title, "content":
|
| 308 |
except Exception as e:
|
| 309 |
logging.error(f"Scrape error: {e}")
|
| 310 |
st.error(f"Scraping failed: {e}")
|
|
@@ -484,4 +480,4 @@ else:
|
|
| 484 |
</div>
|
| 485 |
Enter a URL above and hit <strong>Scrape</strong> to get started.
|
| 486 |
</div>
|
| 487 |
-
""", unsafe_allow_html=True)
|
|
|
|
| 228 |
# ── Utilities ──────────────────────────────────────────────────────────────────
|
| 229 |
|
| 230 |
def clean_text(text):
|
| 231 |
+
# Only collapse whitespace — preserve Rs. prices, commas, symbols
|
| 232 |
+
text = re.sub(r'[ \t]+', ' ', text)
|
| 233 |
+
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 234 |
return text.strip()
|
| 235 |
|
| 236 |
def is_valid_url(url):
|
|
|
|
| 256 |
browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
|
| 257 |
page = browser.new_page()
|
| 258 |
try:
|
|
|
|
| 259 |
page.goto(url, wait_until="networkidle", timeout=45000)
|
| 260 |
title = page.title()
|
| 261 |
|
| 262 |
+
# Strategy 1: extract structured name+price pairs from <li> elements
|
| 263 |
+
# Works well for listing/price pages like whatmobile.com.pk
|
| 264 |
+
lines = []
|
| 265 |
+
li_elements = page.query_selector_all("li")
|
| 266 |
+
for li in li_elements:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
try:
|
| 268 |
+
text = li.inner_text().strip()
|
| 269 |
+
# Keep li items that contain a heading and a price-like pattern
|
| 270 |
+
if text and len(text) > 3 and len(text) < 300:
|
| 271 |
+
lines.append(text)
|
| 272 |
except:
|
| 273 |
continue
|
| 274 |
|
| 275 |
+
# Strategy 2: grab all headings and paragraphs too
|
| 276 |
+
for tag in ["h1", "h2", "h3", "h4", "p", "td"]:
|
| 277 |
+
elements = page.query_selector_all(tag)
|
| 278 |
+
for e in elements:
|
| 279 |
+
try:
|
| 280 |
+
text = e.inner_text().strip()
|
| 281 |
+
if text and len(text) > 3 and len(text) < 500:
|
| 282 |
+
lines.append(text)
|
| 283 |
+
except:
|
| 284 |
+
continue
|
| 285 |
+
|
| 286 |
+
# Deduplicate while preserving order
|
| 287 |
+
seen = set()
|
| 288 |
+
unique_lines = []
|
| 289 |
+
for line in lines:
|
| 290 |
+
normalised = re.sub(r'\s+', ' ', line).strip()
|
| 291 |
+
if normalised not in seen:
|
| 292 |
+
seen.add(normalised)
|
| 293 |
+
unique_lines.append(normalised)
|
| 294 |
+
|
| 295 |
+
content = "\n".join(unique_lines)
|
| 296 |
+
|
| 297 |
+
# Fallback to full body if we got almost nothing
|
| 298 |
+
if len(content) < 200:
|
| 299 |
body = page.query_selector("body")
|
| 300 |
+
content = clean_text(body.inner_text()) if body else content
|
| 301 |
|
| 302 |
+
logging.info(f"Scraped {len(content)} chars from {url}")
|
| 303 |
+
return {"title": title, "content": content, "url": url}
|
| 304 |
except Exception as e:
|
| 305 |
logging.error(f"Scrape error: {e}")
|
| 306 |
st.error(f"Scraping failed: {e}")
|
|
|
|
| 480 |
</div>
|
| 481 |
Enter a URL above and hit <strong>Scrape</strong> to get started.
|
| 482 |
</div>
|
| 483 |
+
""", unsafe_allow_html=True)
|