subhrajit-mohanty committed
Commit a1034af · verified · 1 Parent(s): 193caeb

Create app.py

Files changed (1)
  1. app.py +562 -0
app.py ADDED
@@ -0,0 +1,562 @@
import streamlit as st
import requests
from bs4 import BeautifulSoup
import time
import csv
import random
import re
import os
import io
import base64
from urllib.parse import urlparse, urljoin

st.set_page_config(
    page_title="Web Scraper",
    page_icon="🕸️",
    layout="wide"
)

# Apply custom CSS
st.markdown("""
<style>
    .main {
        padding: 2rem;
    }
    .stButton button {
        background-color: #4CAF50;
        color: white;
        padding: 0.5rem 1rem;
        font-size: 1rem;
        border-radius: 5px;
    }
    .result-area {
        background-color: #f9f9f9;
        padding: 1.5rem;
        border-radius: 10px;
        border: 1px solid #ddd;
        margin-top: 2rem;
    }
    h1, h2, h3 {
        color: #2C3E50;
    }
</style>
""", unsafe_allow_html=True)

def scrape_website(base_url, max_pages=None, progress_bar=None):
    """
    Scrape all pages of a website with improved security bypass techniques
    """
    all_data = []
    current_page = 1
    has_next_page = True

    # Create a pool of user agents to rotate
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0'
    ]

    # Session to maintain cookies
    session = requests.Session()

    # Parse the base URL to help with navigation
    parsed_url = urlparse(base_url)
    domain = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # Store debug logs
    logs = []
    logs.append(f"Starting scrape of {base_url}")

    while has_next_page and (max_pages is None or current_page <= max_pages):
        logs.append(f"Scraping page {current_page}...")

        # Update progress bar if available
        if progress_bar is not None:
            if max_pages:
                progress_bar.progress(min(current_page / max_pages, 1.0))
            else:
                # If max_pages is None, we just show indeterminate progress
                progress_bar.progress(min(current_page / 10, 1.0))

        # Rotate user agents
        current_agent = random.choice(user_agents)
        session.headers.update({
            'User-Agent': current_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': domain,
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        # Different pagination patterns
        if current_page == 1:
            page_url = base_url
        else:
            # Try different pagination patterns
            if '?' in base_url:
                page_url = f"{base_url}&page={current_page}"
            else:
                page_url = f"{base_url}?page={current_page}"

        # Add random delay between requests (1-3 seconds)
        delay = 1 + random.random() * 2
        time.sleep(delay)

        try:
            # Make request with timeout
            response = session.get(page_url, timeout=30)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Get page title for logs
            page_title = soup.title.string if soup.title else 'No title'
            logs.append(f"Page title: {page_title}")

            # Extract data using direct DOM traversal instead of complex selectors
            page_data = extract_data_safely(soup, domain)

            if not page_data:
                logs.append(f"No data found on page {current_page}. Trying alternate extraction method...")
                # Try alternate extraction method
                page_data = extract_data_alternate(soup, domain)

            if not page_data:
                logs.append(f"Still no data found on page {current_page}.")
                if current_page > 2 and not all_data:
                    logs.append("Failed to extract data after multiple pages. Check site structure.")
                    break
            else:
                logs.append(f"Found {len(page_data)} items on page {current_page}")
                all_data.extend(page_data)

            # Check for next page without using complex selectors
            next_page_link = None

            # Simple approach: look for links with "next" in text or attributes
            for a_tag in soup.find_all('a'):
                link_text = a_tag.text.lower()
                if 'next' in link_text or 'next page' in link_text or '»' in link_text or '>' in link_text:
                    next_page_link = a_tag
                    break

                # Check attributes for hints
                for attr, value in a_tag.attrs.items():
                    if isinstance(value, str) and ('next' in value.lower() or 'pagination-next' in value.lower()):
                        next_page_link = a_tag
                        break

            # If we found a next link
            if next_page_link and 'href' in next_page_link.attrs:
                next_url = next_page_link['href']
                # Handle relative URLs
                if not next_url.startswith(('http://', 'https://')):
                    next_url = urljoin(page_url, next_url)

                logs.append(f"Found next page link: {next_url}")
                # If the next URL is the same as current, we may be at the end
                if next_url == page_url:
                    has_next_page = False
                else:
                    # For direct links, update base_url and reset counter
                    if '/page/' in next_url or 'page=' in next_url:
                        pass  # Continue with our pagination pattern
                    else:
                        base_url = next_url
                        current_page = 1  # Will be incremented to 2 below
            else:
                # No next link found
                has_next_page = False
                logs.append("No next page link found. Reached the end.")

            current_page += 1

        except requests.exceptions.RequestException as e:
            logs.append(f"Error scraping page {current_page}: {str(e)}")

            # If we got blocked (403 error), try with more delay
            if hasattr(e, 'response') and e.response is not None and e.response.status_code == 403:
                logs.append("Possible blocking detected. Waiting longer...")
                time.sleep(10)  # Wait 10 seconds in the Streamlit app to avoid freezing
                # Continue without incrementing page to retry
                continue
            else:
                break

    logs.append(f"Scraping complete. Scraped {len(all_data)} items from {current_page - 1} pages.")
    return all_data, logs

def extract_data_safely(soup, domain):
    """
    Extract data from a page using direct DOM traversal without complex selectors
    """
    items = []
    potential_containers = []

    # Step 1: Find potential container elements that might hold repeating content
    for tag in ['div', 'li', 'article', 'section']:
        elements = soup.find_all(tag)

        # Group elements by their class
        class_groups = {}
        for el in elements:
            if 'class' in el.attrs:
                class_key = ' '.join(sorted(el['class']))
                if class_key in class_groups:
                    class_groups[class_key].append(el)
                else:
                    class_groups[class_key] = [el]

        # Find groups with multiple similar elements (potential product listings)
        for class_name, elements_group in class_groups.items():
            if 3 <= len(elements_group) <= 100:  # Reasonable number for product listings
                # Check if these elements contain both text and links
                has_content = True
                for el in elements_group[:3]:  # Check first few elements
                    if not (el.find_all('a') and el.text.strip()):
                        has_content = False
                        break

                if has_content:
                    potential_containers.extend(elements_group)

    # Step 2: Process each potential container
    for container in potential_containers:
        try:
            # Look for title: prefer headings, then links with text
            title = None
            # Try to find headings
            for heading in container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                if heading.text.strip():
                    title = heading.text.strip()
                    break

            # If no heading, try links
            if not title:
                for link in container.find_all('a'):
                    if link.text.strip() and len(link.text.strip()) > 5:
                        title = link.text.strip()
                        break

            # If still no title, try image alt text
            if not title:
                for img in container.find_all('img'):
                    if 'alt' in img.attrs and img['alt'].strip():
                        title = img['alt'].strip()
                        break

            # Look for URL (any link)
            url = None
            for link in container.find_all('a'):
                if 'href' in link.attrs:
                    url = link['href']
                    if not url.startswith(('http://', 'https://')):
                        url = urljoin(domain, url)
                    break

            # Look for description
            description = ""
            for p in container.find_all('p'):
                if p.text.strip() and p.text.strip() != title:
                    description = p.text.strip()
                    break

            # Look for price (text with currency symbols or patterns)
            price = None
            price_pattern = re.compile(r'(\$|€|£|¥|USD|EUR|GBP|JPY)\s*\d+[\d\.,]*')
            for text in container.stripped_strings:
                match = price_pattern.search(text)
                if match:
                    price = text.strip()
                    break

            # Try to extract an image URL
            image_url = None
            for img in container.find_all('img'):
                if 'src' in img.attrs:
                    image_url = img['src']
                    if not image_url.startswith(('http://', 'https://')):
                        image_url = urljoin(domain, image_url)
                    break

            # Only add if we have at least title and URL
            if title and url:
                item = {
                    'title': title,
                    'url': url,
                    'description': description if description else '',
                    'price': price if price else '',
                    'image_url': image_url if image_url else ''
                }
                items.append(item)
        except Exception as e:
            pass  # Skip problematic containers

    return items

def extract_data_alternate(soup, domain):
    """
    Alternative extraction method using a simpler, link-centric approach
    """
    items = []

    # Look for any anchor tags with meaningful content
    for link in soup.find_all('a'):
        try:
            url = link.get('href')
            if not url:
                continue

            # Handle relative URLs
            if not url.startswith(('http://', 'https://')):
                url = urljoin(domain, url)

            # Get title from link text or img alt
            title = link.text.strip()
            if not title or len(title) < 5:  # If text is empty or very short
                img = link.find('img')
                if img and img.get('alt'):
                    title = img['alt'].strip()

            # Look for image
            image_url = None
            img = link.find('img')
            if img and 'src' in img.attrs:
                image_url = img['src']
                if not image_url.startswith(('http://', 'https://')):
                    image_url = urljoin(domain, image_url)

            # Look for description near the link
            description = ""
            parent = link.parent
            if parent:
                p_tag = parent.find('p')
                if p_tag and p_tag.text.strip() and p_tag.text.strip() != title:
                    description = p_tag.text.strip()

            # Look for price near the link
            price = None
            # Check for siblings with currency patterns
            price_pattern = re.compile(r'(\$|€|£|¥|USD|EUR|GBP|JPY)\s*\d+[\d\.,]*')

            # Check the parent and its children for price
            parent = link.parent
            if parent:
                for text in parent.stripped_strings:
                    match = price_pattern.search(text)
                    if match and text.strip() != title:
                        price = text.strip()
                        break

            # Only add if we have a meaningful title and URL
            if title and url and len(title) > 5 and '.' in url:
                item = {
                    'title': title,
                    'url': url,
                    'description': description,
                    'price': price if price else '',
                    'image_url': image_url if image_url else ''
                }
                # Avoid duplicates
                if not any(x['url'] == url for x in items):
                    items.append(item)
        except Exception as e:
            pass  # Skip problematic links

    return items

def generate_markdown(data, site_url):
    """
    Generate a markdown representation of the scraped data
    """
    parsed_url = urlparse(site_url)
    domain_name = parsed_url.netloc

    # Start with a header
    md = f"# Scraped Content from {domain_name}\n\n"
    md += f"*Source: [{domain_name}]({site_url})*\n\n"
    md += f"*Total items found: {len(data)}*\n\n"

    # Group by categories if we can detect them
    categories = {}

    # Try to extract categories from URLs
    for item in data:
        url_path = urlparse(item['url']).path
        path_parts = [p for p in url_path.split('/') if p]

        # Use the first path component as a category if there is one
        category = "General"
        if len(path_parts) > 0:
            potential_category = path_parts[0].replace('-', ' ').replace('_', ' ').title()
            if 2 < len(potential_category) < 30:  # Reasonable category name length
                category = potential_category

        if category not in categories:
            categories[category] = []

        categories[category].append(item)

    # If we couldn't find meaningful categories, just use "Results"
    if len(categories) <= 1:
        md += "## Results\n\n"

        # Sort by title for better organization
        sorted_data = sorted(data, key=lambda x: x['title'])

        for item in sorted_data:
            md += f"### {item['title']}\n\n"
            md += f"🔗 [View Original]({item['url']})\n\n"

            if item['image_url']:
                md += f"![Image]({item['image_url']})\n\n"

            if item['description']:
                md += f"{item['description']}\n\n"

            if item['price']:
                md += f"**Price:** {item['price']}\n\n"

            md += "---\n\n"
    else:
        # Output by categories
        for category, items in categories.items():
            md += f"## {category}\n\n"

            # Sort items by title within each category
            sorted_items = sorted(items, key=lambda x: x['title'])

            for item in sorted_items:
                md += f"### {item['title']}\n\n"
                md += f"🔗 [View Original]({item['url']})\n\n"

                if item['image_url']:
                    md += f"![Image]({item['image_url']})\n\n"

                if item['description']:
                    md += f"{item['description']}\n\n"

                if item['price']:
                    md += f"**Price:** {item['price']}\n\n"

                md += "---\n\n"

    return md

def download_markdown(markdown_text, filename="scraped_content.md"):
    """
    Create a download link for the markdown file
    """
    b64 = base64.b64encode(markdown_text.encode()).decode()
    href = f'<a href="data:file/markdown;base64,{b64}" download="{filename}">Download Markdown File</a>'
    return href

def main():
    st.title("🕸️ Web Scraper")
    st.subheader("Extract and convert web content to Markdown")

    # Input form
    with st.form("scraper_form"):
        url = st.text_input("Enter the URL to scrape:", placeholder="https://example.com/products")

        col1, col2 = st.columns(2)
        with col1:
            max_pages = st.number_input("Maximum pages to scrape (0 for unlimited):", min_value=0, value=5)
        with col2:
            delay_between_requests = st.slider("Delay between requests (seconds):", min_value=1, max_value=10, value=2)

        submit_button = st.form_submit_button("Start Scraping")

    if submit_button:
        if not url:
            st.error("Please enter a valid URL")
        else:
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            # Convert 0 to None for unlimited pages
            if max_pages == 0:
                max_pages = None

            # Show progress
            progress = st.progress(0)
            status = st.empty()

            # Add a placeholder for logs
            log_expander = st.expander("View Logs", expanded=False)
            logs_placeholder = log_expander.empty()

            try:
                status.text("Scraping in progress... Please wait.")

                # Perform scraping
                start_time = time.time()
                scraped_data, logs = scrape_website(url, max_pages, progress)
                end_time = time.time()

                # Display logs
                logs_placeholder.text('\n'.join(logs))

                if scraped_data:
                    status.text(f"Scraping completed! Found {len(scraped_data)} items in {end_time - start_time:.2f} seconds.")

                    # Generate markdown
                    markdown_content = generate_markdown(scraped_data, url)

                    # Display results
                    st.subheader("Scraped Content (Markdown)")

                    # Two-column layout for preview and raw markdown
                    col1, col2 = st.columns(2)

                    with col1:
                        st.markdown("### Preview")
                        st.markdown(markdown_content)

                    with col2:
                        st.markdown("### Raw Markdown")
                        st.code(markdown_content)

                    # Download options
                    st.markdown("### Download Options")

                    # Download as Markdown
                    st.markdown(download_markdown(markdown_content), unsafe_allow_html=True)

                    # Download as CSV option
                    if scraped_data:
                        csv_buffer = io.StringIO()
                        writer = csv.DictWriter(csv_buffer, fieldnames=scraped_data[0].keys())
                        writer.writeheader()
                        writer.writerows(scraped_data)

                        b64 = base64.b64encode(csv_buffer.getvalue().encode()).decode()
                        href = f'<a href="data:file/csv;base64,{b64}" download="scraped_data.csv">Download CSV File</a>'
                        st.markdown(href, unsafe_allow_html=True)
                else:
                    status.error("No data was found. Try adjusting the URL or increasing the page limit.")

            except Exception as e:
                status.error(f"An error occurred: {str(e)}")
                st.exception(e)

    # Footer
    st.markdown("---")
    st.markdown("### 📝 Instructions")
    st.markdown("""
    1. Enter the URL of the website you want to scrape.
    2. Specify the maximum number of pages to scrape (0 for unlimited).
    3. Adjust the delay between requests to avoid overwhelming the server.
    4. Click "Start Scraping" and wait for the results.
    5. The scraped content will be displayed as Markdown and can be downloaded.

    **Note:** Be respectful of website terms of service and robots.txt when scraping.
    """)

if __name__ == "__main__":
    main()
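
For reference, a minimal sketch of reusing the helpers above outside the Streamlit UI, assuming the file is saved as app.py on the import path; the target URL, page count, and output filename below are placeholder assumptions, not part of the commit.

# Hypothetical usage sketch: call the scraper helpers directly from a plain script.
# Assumes the file above is saved as app.py; importing it also runs its module-level
# Streamlit calls, which typically only emit warnings when no Streamlit session is active.
from app import scrape_website, generate_markdown

if __name__ == "__main__":
    target = "https://example.com/products"  # placeholder URL
    data, logs = scrape_website(target, max_pages=2)
    for line in logs:
        print(line)
    if data:
        with open("scraped_content.md", "w", encoding="utf-8") as f:
            f.write(generate_markdown(data, target))

The intended entry point remains streamlit run app.py; the sketch only illustrates that scrape_website and generate_markdown are plain functions returning Python data.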