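"""Instagram scraper (v2).

Fetches public Instagram pages through a requests.Session with basic
anti-detection measures (rotating User-Agent, randomized delays, retries),
then extracts images, profile info, and recent post shortcodes via regular
expressions. Warnings and errors are surfaced through Streamlit.
"""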
import streamlit as st
import requests
import re
import time
import random
from datetime import datetime

class InstagramScraperV2:
    def __init__(self):
        self.session = requests.Session()
        self.setup_session()
    
    def setup_session(self):
        """Setup session with better anti-detection measures"""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        
        self.session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })
    
    def get_page_with_retry(self, url, max_retries=3):
        """Get page with retry mechanism"""
        for attempt in range(max_retries):
            try:
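                # Randomized delay between attempts to avoid hammering the server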
                time.sleep(random.uniform(2, 4))
                response = self.session.get(url, timeout=20)
                response.raise_for_status()
                return response.text
            except Exception as e:
                st.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    raise
        return None
    
    def extract_instagram_data(self, url):
        """Extract data from Instagram with improved error handling"""
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }
        
        try:
            page_text = self.get_page_with_retry(url)
            if not page_text:
                scraped_data["errors"].append("Failed to load Instagram page")
                return scraped_data
            
            # Extract images
            scraped_data["images"] = self.extract_images_from_page(page_text)
            
            # Extract profile info
            scraped_data["profile_info"] = self.extract_profile_info(page_text)
            
            # Extract posts
            scraped_data["posts"] = self.extract_recent_posts(page_text)
            
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")
        
        return scraped_data
    
    def extract_images_from_page(self, page_text):
        """Extract images with improved patterns"""
        images = []
        
        try:
            # Enhanced patterns for Instagram images
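            # Note: re.findall returns the full match for groupless patterns and
            # only the captured group for patterns with a group; both cases are
            # plain URL strings, so the loop below handles them uniformly.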
            patterns = [
                r'https://scontent[^"]*\.jpg[^"]*',
                r'https://scontent[^"]*\.jpeg[^"]*',
                r'https://scontent[^"]*\.png[^"]*',
                r'"display_url":"([^"]+)"',
                r'"display_src":"([^"]+)"'
            ]
            
            found_images = set()
            for pattern in patterns:
                matches = re.findall(pattern, page_text)
                for match in matches:
                    if match and ('scontent' in match.lower() or 'instagram' in match.lower()):
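                        # Undo JSON string escaping: "\u0026" -> "&", "\/" -> "/"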
                        clean_url = match.replace('\\u0026', '&').replace('\\/', '/')
                        found_images.add(clean_url)
            
            for i, img_url in enumerate(sorted(found_images)):  # sorted for deterministic output
                images.append({
                    "src": img_url,
                    "alt": f"Instagram image {i+1}",
                    "title": f"Instagram image {i+1}",
                    "width": "",
                    "height": ""
                })
            
        except Exception as e:
            st.error(f"Failed to extract images: {str(e)}")
        
        return images
    
    def extract_profile_info(self, page_text):
        """Extract profile information"""
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }
        
        try:
            # Extract username from title
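            # Profile page titles typically look like
            # "Name (@username) • Instagram photos and videos".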
            title_match = re.search(r'<title>([^<]+)</title>', page_text)
            if title_match:
                title = title_match.group(1)
                if '(' in title and ')' in title:
                    username = title.split('(')[1].split(')')[0]
                    profile_info["username"] = username.lstrip('@')  # drop the leading '@' from the handle
            
            # Look for JSON data
            json_patterns = [
                r'"username":"([^"]+)"',
                r'"full_name":"([^"]+)"',
                r'"biography":"([^"]+)"'
            ]
            
            for pattern in json_patterns:
                matches = re.findall(pattern, page_text)
                if matches:
                    if "username" in pattern:
                        profile_info["username"] = matches[0]
                    elif "full_name" in pattern:
                        profile_info["display_name"] = matches[0]
                    elif "biography" in pattern:
                        profile_info["bio"] = matches[0]
            
        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"
        
        return profile_info
    
    def extract_recent_posts(self, page_text):
        """Extract recent posts"""
        posts = []
        
        try:
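            # Shortcodes identify individual posts; they appear both in embedded
            # JSON ("shortcode":"...") and in /p/<shortcode>/ permalinks.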
            post_patterns = [
                r'"shortcode":"([^"]+)"',
                r'/p/([^/"]+)'
            ]
            
            found_posts = set()
            for pattern in post_patterns:
                matches = re.findall(pattern, page_text)
                for match in matches:
                    if match:
                        found_posts.add(match)
            
            for i, post_code in enumerate(sorted(found_posts)[:10]):  # cap at 10 posts, sorted for determinism
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": i + 1
                })
            
        except Exception as e:
            st.error(f"Failed to extract posts: {str(e)}")
        
        return posts

# Global instance
instagram_scraper_v2 = InstagramScraperV2()
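
# Minimal usage sketch: wiring the scraper into a Streamlit page. The widget
# labels and default URL below are illustrative assumptions, not part of this
# module; run with `streamlit run <this file>`.
if __name__ == "__main__":
    st.title("Instagram Scraper v2")
    profile_url = st.text_input("Profile URL", "https://www.instagram.com/instagram/")
    if st.button("Scrape"):
        with st.spinner("Scraping..."):
            data = instagram_scraper_v2.extract_instagram_data(profile_url)
        st.json(data)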