subhrajit-mohanty committed
Commit a1034af · verified · 1 Parent(s): 193caeb

Create app.py

Files changed (1)
  1. app.py +562 -0
app.py ADDED
@@ -0,0 +1,562 @@
import streamlit as st
import requests
from bs4 import BeautifulSoup
import time
import csv
import random
import re
import os
import io
import base64
from urllib.parse import urlparse, urljoin

st.set_page_config(
    page_title="Web Scraper",
    page_icon="🕸️",
    layout="wide"
)

# Apply custom CSS
st.markdown("""
<style>
    .main {
        padding: 2rem;
    }
    .stButton button {
        background-color: #4CAF50;
        color: white;
        padding: 0.5rem 1rem;
        font-size: 1rem;
        border-radius: 5px;
    }
    .result-area {
        background-color: #f9f9f9;
        padding: 1.5rem;
        border-radius: 10px;
        border: 1px solid #ddd;
        margin-top: 2rem;
    }
    h1, h2, h3 {
        color: #2C3E50;
    }
</style>
""", unsafe_allow_html=True)

def scrape_website(base_url, max_pages=None, progress_bar=None):
    """
    Scrape all pages of a website with improved security bypass techniques
    """
    all_data = []
    current_page = 1
    has_next_page = True

    # Create a pool of user agents to rotate
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0'
    ]

    # Session to maintain cookies
    session = requests.Session()

    # Parse the base URL to help with navigation
    parsed_url = urlparse(base_url)
    domain = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # Store debug logs
    logs = []
    logs.append(f"Starting scrape of {base_url}")

    while has_next_page and (max_pages is None or current_page <= max_pages):
        logs.append(f"Scraping page {current_page}...")

        # Update progress bar if available
        if progress_bar is not None:
            if max_pages:
                progress_bar.progress(min(current_page / max_pages, 1.0))
            else:
                # If max_pages is None, we just show indeterminate progress
                progress_bar.progress(min(current_page / 10, 1.0))

        # Rotate user agents
        current_agent = random.choice(user_agents)
        session.headers.update({
            'User-Agent': current_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': domain,
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        # Different pagination patterns
        if current_page == 1:
            page_url = base_url
        else:
            # Try different pagination patterns
            if '?' in base_url:
                page_url = f"{base_url}&page={current_page}"
            else:
                page_url = f"{base_url}?page={current_page}"

        # Add random delay between requests (1-3 seconds)
        delay = 1 + random.random() * 2
        time.sleep(delay)

        try:
            # Make request with timeout
            response = session.get(page_url, timeout=30)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Get page title for logs
            page_title = soup.title.string if soup.title else 'No title'
            logs.append(f"Page title: {page_title}")

            # Extract data using direct DOM traversal instead of complex selectors
            page_data = extract_data_safely(soup, domain)

            if not page_data:
                logs.append(f"No data found on page {current_page}. Trying alternate extraction method...")
                # Try alternate extraction method
                page_data = extract_data_alternate(soup, domain)

            if not page_data:
                logs.append(f"Still no data found on page {current_page}.")
                if current_page > 2 and not all_data:
                    logs.append("Failed to extract data after multiple pages. Check site structure.")
                    break
            else:
                logs.append(f"Found {len(page_data)} items on page {current_page}")
                all_data.extend(page_data)

            # Check for next page without using complex selectors
            next_page_link = None

            # Simple approach: look for links with "next" in text or attributes
            for a_tag in soup.find_all('a'):
                link_text = a_tag.text.lower()
                if 'next' in link_text or 'next page' in link_text or '»' in link_text or '>' in link_text:
                    next_page_link = a_tag
                    break

                # Check attributes for hints
                for attr, value in a_tag.attrs.items():
                    if isinstance(value, str) and ('next' in value.lower() or 'pagination-next' in value.lower()):
                        next_page_link = a_tag
                        break

            # If we found a next link
            if next_page_link and 'href' in next_page_link.attrs:
                next_url = next_page_link['href']
                # Handle relative URLs
                if not next_url.startswith(('http://', 'https://')):
                    next_url = urljoin(page_url, next_url)

                logs.append(f"Found next page link: {next_url}")
                # If the next URL is the same as current, we may be at the end
                if next_url == page_url:
                    has_next_page = False
                else:
                    # For direct links, update base_url and reset counter
                    if '/page/' in next_url or 'page=' in next_url:
                        pass  # Continue with our pagination pattern
                    else:
                        base_url = next_url
                        current_page = 1  # Will be incremented to 2 below
            else:
                # No next link found
                has_next_page = False
                logs.append("No next page link found. Reached the end.")

            current_page += 1

        except requests.exceptions.RequestException as e:
            logs.append(f"Error scraping page {current_page}: {str(e)}")

            # If we got blocked (403 error), try with more delay
            if hasattr(e, 'response') and e.response is not None and e.response.status_code == 403:
                logs.append("Possible blocking detected. Waiting longer...")
                time.sleep(10)  # Wait 10 seconds in the Streamlit app to avoid freezing
                # Continue without incrementing page to retry
                continue
            else:
                break

    logs.append(f"Scraping complete. Scraped {len(all_data)} items from {current_page - 1} pages.")
    return all_data, logs

def extract_data_safely(soup, domain):
    """
    Extract data from a page using direct DOM traversal without complex selectors
    """
    items = []
    potential_containers = []

    # Step 1: Find potential container elements that might hold repeating content
    for tag in ['div', 'li', 'article', 'section']:
        elements = soup.find_all(tag)

        # Group elements by their class
        class_groups = {}
        for el in elements:
            if 'class' in el.attrs:
                class_key = ' '.join(sorted(el['class']))
                if class_key in class_groups:
                    class_groups[class_key].append(el)
                else:
                    class_groups[class_key] = [el]

        # Find groups with multiple similar elements (potential product listings)
        for class_name, elements_group in class_groups.items():
            if 3 <= len(elements_group) <= 100:  # Reasonable number for product listings
                # Check if these elements contain both text and links
                has_content = True
                for el in elements_group[:3]:  # Check first few elements
                    if not (el.find_all('a') and el.text.strip()):
                        has_content = False
                        break

                if has_content:
                    potential_containers.extend(elements_group)

    # Step 2: Process each potential container
    for container in potential_containers:
        try:
            # Look for title: prefer headings, then links with text
            title = None
            # Try to find headings
            for heading in container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                if heading.text.strip():
                    title = heading.text.strip()
                    break

            # If no heading, try links
            if not title:
                for link in container.find_all('a'):
                    if link.text.strip() and len(link.text.strip()) > 5:
                        title = link.text.strip()
                        break

            # If still no title, try image alt text
            if not title:
                for img in container.find_all('img'):
                    if 'alt' in img.attrs and img['alt'].strip():
                        title = img['alt'].strip()
                        break

            # Look for URL (any link)
            url = None
            for link in container.find_all('a'):
                if 'href' in link.attrs:
                    url = link['href']
                    if not url.startswith(('http://', 'https://')):
                        url = urljoin(domain, url)
                    break

            # Look for description
            description = ""
            for p in container.find_all('p'):
                if p.text.strip() and p.text.strip() != title:
                    description = p.text.strip()
                    break

            # Look for price (text with currency symbols or patterns)
            price = None
            price_pattern = re.compile(r'(\$|€|£|¥|USD|EUR|GBP|JPY)\s*\d+[\d\.,]*')
            for text in container.stripped_strings:
                match = price_pattern.search(text)
                if match:
                    price = text.strip()
                    break

            # Try to extract an image URL
            image_url = None
            for img in container.find_all('img'):
                if 'src' in img.attrs:
                    image_url = img['src']
                    if not image_url.startswith(('http://', 'https://')):
                        image_url = urljoin(domain, image_url)
                    break

            # Only add if we have at least title and URL
            if title and url:
                item = {
                    'title': title,
                    'url': url,
                    'description': description if description else '',
                    'price': price if price else '',
                    'image_url': image_url if image_url else ''
                }
                items.append(item)
        except Exception as e:
            pass  # Skip problematic containers

    return items

def extract_data_alternate(soup, domain):
    """
    Alternative extraction method using a simpler, link-centric approach
    """
    items = []

    # Look for any anchor tags with meaningful content
    for link in soup.find_all('a'):
        try:
            url = link.get('href')
            if not url:
                continue

            # Handle relative URLs
            if not url.startswith(('http://', 'https://')):
                url = urljoin(domain, url)

            # Get title from link text or img alt
            title = link.text.strip()
            if not title or len(title) < 5:  # If text is empty or very short
                img = link.find('img')
                if img and img.get('alt'):
                    title = img['alt'].strip()

            # Look for image
            image_url = None
            img = link.find('img')
            if img and 'src' in img.attrs:
                image_url = img['src']
                if not image_url.startswith(('http://', 'https://')):
                    image_url = urljoin(domain, image_url)

            # Look for description near the link
            description = ""
            parent = link.parent
            if parent:
                p_tag = parent.find('p')
                if p_tag and p_tag.text.strip() and p_tag.text.strip() != title:
                    description = p_tag.text.strip()

            # Look for price near the link
            price = None
            # Check for siblings with currency patterns
            price_pattern = re.compile(r'(\$|€|£|¥|USD|EUR|GBP|JPY)\s*\d+[\d\.,]*')

            # Check the parent and its children for price
            parent = link.parent
            if parent:
                for text in parent.stripped_strings:
                    match = price_pattern.search(text)
                    if match and text.strip() != title:
                        price = text.strip()
                        break

            # Only add if we have a meaningful title and URL
            if title and url and len(title) > 5 and '.' in url:
                item = {
                    'title': title,
                    'url': url,
                    'description': description,
                    'price': price if price else '',
                    'image_url': image_url if image_url else ''
                }
                # Avoid duplicates
                if not any(x['url'] == url for x in items):
                    items.append(item)
        except Exception as e:
            pass  # Skip problematic links

    return items

def generate_markdown(data, site_url):
    """
    Generate a markdown representation of the scraped data
    """
    parsed_url = urlparse(site_url)
    domain_name = parsed_url.netloc

    # Start with a header
    md = f"# Scraped Content from {domain_name}\n\n"
    md += f"*Source: [{domain_name}]({site_url})*\n\n"
    md += f"*Total items found: {len(data)}*\n\n"

    # Group by categories if we can detect them
    categories = {}

    # Try to extract categories from URLs
    for item in data:
        url_path = urlparse(item['url']).path
        path_parts = [p for p in url_path.split('/') if p]

        # Use the first path component as a category if there is one
        category = "General"
        if len(path_parts) > 0:
            potential_category = path_parts[0].replace('-', ' ').replace('_', ' ').title()
            if 2 < len(potential_category) < 30:  # Reasonable category name length
                category = potential_category

        if category not in categories:
            categories[category] = []

        categories[category].append(item)

    # If we couldn't find meaningful categories, just use "Results"
    if len(categories) <= 1:
        md += "## Results\n\n"

        # Sort by title for better organization
        sorted_data = sorted(data, key=lambda x: x['title'])

        for item in sorted_data:
            md += f"### {item['title']}\n\n"
            md += f"🔗 [View Original]({item['url']})\n\n"

            if item['image_url']:
                md += f"![Image]({item['image_url']})\n\n"

            if item['description']:
                md += f"{item['description']}\n\n"

            if item['price']:
                md += f"**Price:** {item['price']}\n\n"

            md += "---\n\n"
    else:
        # Output by categories
        for category, items in categories.items():
            md += f"## {category}\n\n"

            # Sort items by title within each category
            sorted_items = sorted(items, key=lambda x: x['title'])

            for item in sorted_items:
                md += f"### {item['title']}\n\n"
                md += f"🔗 [View Original]({item['url']})\n\n"

                if item['image_url']:
                    md += f"![Image]({item['image_url']})\n\n"

                if item['description']:
                    md += f"{item['description']}\n\n"

                if item['price']:
                    md += f"**Price:** {item['price']}\n\n"

                md += "---\n\n"

    return md

def download_markdown(markdown_text, filename="scraped_content.md"):
    """
    Create a download link for the markdown file
    """
    b64 = base64.b64encode(markdown_text.encode()).decode()
    href = f'<a href="data:file/markdown;base64,{b64}" download="{filename}">Download Markdown File</a>'
    return href

def main():
    st.title("🕸️ Web Scraper")
    st.subheader("Extract and convert web content to Markdown")

    # Input form
    with st.form("scraper_form"):
        url = st.text_input("Enter the URL to scrape:", placeholder="https://example.com/products")

        col1, col2 = st.columns(2)
        with col1:
            max_pages = st.number_input("Maximum pages to scrape (0 for unlimited):", min_value=0, value=5)
        with col2:
            delay_between_requests = st.slider("Delay between requests (seconds):", min_value=1, max_value=10, value=2)

        submit_button = st.form_submit_button("Start Scraping")

    if submit_button:
        if not url:
            st.error("Please enter a valid URL")
        else:
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            # Convert 0 to None for unlimited pages
            if max_pages == 0:
                max_pages = None

            # Show progress
            progress = st.progress(0)
            status = st.empty()

            # Add a placeholder for logs
            log_expander = st.expander("View Logs", expanded=False)
            logs_placeholder = log_expander.empty()

            try:
                status.text("Scraping in progress... Please wait.")

                # Perform scraping
                start_time = time.time()
                scraped_data, logs = scrape_website(url, max_pages, progress)
                end_time = time.time()

                # Display logs
                logs_placeholder.text('\n'.join(logs))

                if scraped_data:
                    status.text(f"Scraping completed! Found {len(scraped_data)} items in {end_time - start_time:.2f} seconds.")

                    # Generate markdown
                    markdown_content = generate_markdown(scraped_data, url)

                    # Display results
                    st.subheader("Scraped Content (Markdown)")

                    # Two-column layout for preview and raw markdown
                    col1, col2 = st.columns(2)

                    with col1:
                        st.markdown("### Preview")
                        st.markdown(markdown_content)

                    with col2:
                        st.markdown("### Raw Markdown")
                        st.code(markdown_content)

                    # Download options
                    st.markdown("### Download Options")

                    # Download as Markdown
                    st.markdown(download_markdown(markdown_content), unsafe_allow_html=True)

                    # Download as CSV option
                    if scraped_data:
                        csv_buffer = io.StringIO()
                        writer = csv.DictWriter(csv_buffer, fieldnames=scraped_data[0].keys())
                        writer.writeheader()
                        writer.writerows(scraped_data)

                        b64 = base64.b64encode(csv_buffer.getvalue().encode()).decode()
                        href = f'<a href="data:file/csv;base64,{b64}" download="scraped_data.csv">Download CSV File</a>'
                        st.markdown(href, unsafe_allow_html=True)
                else:
                    status.error("No data was found. Try adjusting the URL or increasing the page limit.")

            except Exception as e:
                status.error(f"An error occurred: {str(e)}")
                st.exception(e)

    # Footer
    st.markdown("---")
    st.markdown("### 📝 Instructions")
    st.markdown("""
    1. Enter the URL of the website you want to scrape.
    2. Specify the maximum number of pages to scrape (0 for unlimited).
    3. Adjust the delay between requests to avoid overwhelming the server.
    4. Click "Start Scraping" and wait for the results.
    5. The scraped content will be displayed as Markdown and can be downloaded.

    **Note:** Be respectful of website terms of service and robots.txt when scraping.
    """)

if __name__ == "__main__":
    main()
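
For reference, a minimal sketch of reusing the helpers above outside the Streamlit UI, assuming the file is saved as app.py on the import path; the target URL, page count, and output filename below are placeholder assumptions, not part of the commit.

# Hypothetical usage sketch: call the scraper helpers directly from a plain script.
# Assumes the file above is saved as app.py; importing it also runs its module-level
# Streamlit calls, which typically only emit warnings when no Streamlit session is active.
from app import scrape_website, generate_markdown

if __name__ == "__main__":
    target = "https://example.com/products"  # placeholder URL
    data, logs = scrape_website(target, max_pages=2)
    for line in logs:
        print(line)
    if data:
        with open("scraped_content.md", "w", encoding="utf-8") as f:
            f.write(generate_markdown(data, target))

The intended entry point remains streamlit run app.py; the sketch only illustrates that scrape_website and generate_markdown are plain functions returning Python data.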