muddasser committed on
Commit
bf68fe9
·
verified ·
1 Parent(s): b04d996

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -44
app.py CHANGED
@@ -228,8 +228,9 @@ for key, default in [
228
  # ── Utilities ──────────────────────────────────────────────────────────────────
229
 
230
  def clean_text(text):
231
- text = re.sub(r'\s+', ' ', text)
232
- text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
 
233
  return text.strip()
234
 
235
  def is_valid_url(url):
@@ -255,56 +256,51 @@ def scrape_website(url):
255
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
256
  page = browser.new_page()
257
  try:
258
- # Use networkidle so JS-rendered and late-loading content is fully ready
259
  page.goto(url, wait_until="networkidle", timeout=45000)
260
  title = page.title()
261
 
262
- # Try semantic/common content selectors first
263
- priority_selectors = [
264
- "#content",
265
- ".mw-parser-output",
266
- "main",
267
- ".main-content",
268
- "#main",
269
- "article",
270
- ".container", # generic bootstrap layouts
271
- "#wrapper",
272
- ".page-content",
273
- ".site-content",
274
- ]
275
- el = None
276
- for sel in priority_selectors:
277
  try:
278
- el = page.query_selector(sel)
279
- if el:
280
- break
 
281
  except:
282
  continue
283
 
284
- # If no semantic container found, extract structured text from
285
- # headings + paragraphs + list items directly — avoids nav/footer noise
286
- if not el:
287
- chunks = []
288
- for tag in ["h1","h2","h3","h4","p","li","td","th","span"]:
289
- elements = page.query_selector_all(tag)
290
- for e in elements:
291
- try:
292
- t = e.inner_text().strip()
293
- if t and len(t) > 2:
294
- chunks.append(t)
295
- except:
296
- continue
297
- text = clean_text(" ".join(chunks))
298
- else:
299
- text = clean_text(el.inner_text())
300
-
301
- if not text or len(text) < 50:
302
- # Last resort — full body
 
 
 
 
 
303
  body = page.query_selector("body")
304
- text = clean_text(body.inner_text())
305
 
306
- logging.info(f"Scraped {len(text)} chars from {url}")
307
- return {"title": title, "content": text, "url": url}
308
  except Exception as e:
309
  logging.error(f"Scrape error: {e}")
310
  st.error(f"Scraping failed: {e}")
@@ -484,4 +480,4 @@ else:
484
  </div>
485
  Enter a URL above and hit <strong>Scrape</strong> to get started.
486
  </div>
487
- """, unsafe_allow_html=True)
 
228
  # ── Utilities ──────────────────────────────────────────────────────────────────
229
 
230
  def clean_text(text):
231
+ # Only collapse whitespace — preserve Rs. prices, commas, symbols
232
+ text = re.sub(r'[ \t]+', ' ', text)
233
+ text = re.sub(r'\n{3,}', '\n\n', text)
234
  return text.strip()
235
 
236
  def is_valid_url(url):
 
256
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
257
  page = browser.new_page()
258
  try:
 
259
  page.goto(url, wait_until="networkidle", timeout=45000)
260
  title = page.title()
261
 
262
+ # Strategy 1: extract structured name+price pairs from <li> elements
263
+ # Works well for listing/price pages like whatmobile.com.pk
264
+ lines = []
265
+ li_elements = page.query_selector_all("li")
266
+ for li in li_elements:
 
 
 
 
 
 
 
 
 
 
267
  try:
268
+ text = li.inner_text().strip()
269
+ # Keep li items that contain a heading and a price-like pattern
270
+ if text and len(text) > 3 and len(text) < 300:
271
+ lines.append(text)
272
  except:
273
  continue
274
 
275
+ # Strategy 2: grab all headings and paragraphs too
276
+ for tag in ["h1", "h2", "h3", "h4", "p", "td"]:
277
+ elements = page.query_selector_all(tag)
278
+ for e in elements:
279
+ try:
280
+ text = e.inner_text().strip()
281
+ if text and len(text) > 3 and len(text) < 500:
282
+ lines.append(text)
283
+ except:
284
+ continue
285
+
286
+ # Deduplicate while preserving order
287
+ seen = set()
288
+ unique_lines = []
289
+ for line in lines:
290
+ normalised = re.sub(r'\s+', ' ', line).strip()
291
+ if normalised not in seen:
292
+ seen.add(normalised)
293
+ unique_lines.append(normalised)
294
+
295
+ content = "\n".join(unique_lines)
296
+
297
+ # Fallback to full body if we got almost nothing
298
+ if len(content) < 200:
299
  body = page.query_selector("body")
300
+ content = clean_text(body.inner_text()) if body else content
301
 
302
+ logging.info(f"Scraped {len(content)} chars from {url}")
303
+ return {"title": title, "content": content, "url": url}
304
  except Exception as e:
305
  logging.error(f"Scrape error: {e}")
306
  st.error(f"Scraping failed: {e}")
 
480
  </div>
481
  Enter a URL above and hit <strong>Scrape</strong> to get started.
482
  </div>
483
+ """, unsafe_allow_html=True)