Spaces:

PHOROTHA913
/

Scrape-Anythings

Sleeping

File size: 15,032 Bytes

5c3dc0d

import streamlit as st
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse

class InstagramScraper:
    """Best-effort scraper for public Instagram profile and post pages.

    Pages are fetched with a browser-like ``requests`` session and mined with
    regular expressions. Every public method swallows its own failures and
    reports them (in an ``"errors"`` list or through Streamlit messages)
    instead of raising, so a partial result is always returned.
    """

    # Seconds to wait for any single HTTP request.
    _REQUEST_TIMEOUT = 10

    # Patterns applied to profile pages and to directly requested post pages.
    # Patterns with a capture group yield the group, the others the whole
    # match. Order matters: it decides the order of the returned images.
    _PAGE_IMAGE_PATTERNS = (
        # Instagram post images (high quality)
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        r'"src":"([^"]*\.jpg[^"]*)"',
        r'"src":"([^"]*\.jpeg[^"]*)"',
        r'"src":"([^"]*\.png[^"]*)"',
        # Direct image URLs
        r'https://[^"]*\.jpg[^"]*',
        r'https://[^"]*\.jpeg[^"]*',
        r'https://[^"]*\.png[^"]*',
        # Instagram CDN URLs
        r'https://scontent[^"]*\.jpg[^"]*',
        r'https://scontent[^"]*\.jpeg[^"]*',
        r'https://scontent[^"]*\.png[^"]*',
        # Additional Instagram patterns
        r'"url":"([^"]*\.jpg[^"]*)"',
        r'"url":"([^"]*\.jpeg[^"]*)"',
        r'"url":"([^"]*\.png[^"]*)"',
    )

    # Narrower pattern set used for post pages reached from a profile.
    _POST_IMAGE_PATTERNS = (
        # Instagram post images (high quality)
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        # Instagram CDN URLs
        r'https://scontent[^"]*\.jpg[^"]*',
        r'https://scontent[^"]*\.jpeg[^"]*',
        r'https://scontent[^"]*\.png[^"]*',
        # Additional patterns
        r'"src":"([^"]*\.jpg[^"]*)"',
        r'"src":"([^"]*\.jpeg[^"]*)"',
        r'"src":"([^"]*\.png[^"]*)"',
    )

    # Shortcodes are restricted to URL-safe characters so that query strings
    # or attribute remainders following a link are not captured with them.
    _SHORTCODE_PATTERNS = (
        r'"shortcode":"([A-Za-z0-9_-]+)"',
        r'/p/([A-Za-z0-9_-]+)',
        r'/reel/([A-Za-z0-9_-]+)',
    )

    # Each anchor ends immediately before the "{" opening an embedded JSON
    # object; the object itself is decoded with a real JSON parser.
    _PROFILE_JSON_ANCHORS = (
        r'window\._sharedData\s*=\s*(?=\{)',
        r'"profile_page":\s*(?=\{)',
        r'"user":\s*(?=\{)',
    )

    def __init__(self):
        """Create the HTTP session and give it browser-like request headers."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _collect_image_urls(page_text, patterns):
        """Return the unique image URLs found in *page_text*.

        Only matches mentioning ``instagram`` or ``scontent`` are kept. JSON
        escapes (``\\u0026`` and ``\\/``) are undone before de-duplication.
        URLs are returned in first-seen order (pattern by pattern, then by
        position in the text) so the result is stable between runs.
        """
        unique = {}
        for pattern in patterns:
            for match in re.findall(pattern, page_text):
                lowered = match.lower()
                if 'instagram' in lowered or 'scontent' in lowered:
                    cleaned = match.replace('\\u0026', '&').replace('\\/', '/')
                    # A dict keeps insertion order, unlike a set.
                    unique.setdefault(cleaned, None)
        return list(unique)

    @staticmethod
    def _build_image_records(urls):
        """Wrap each URL in the image dict shape used throughout the app."""
        return [
            {
                "src": img_url,
                "alt": f"Instagram post image {number}",
                "title": f"Instagram post image {number}",
                "width": "",
                "height": ""
            }
            for number, img_url in enumerate(urls, start=1)
        ]

    @classmethod
    def _locate_user(cls, node):
        """Return the first dict stored under a ``"user"`` key, searching depth-first."""
        if isinstance(node, dict):
            nested = node.get("user")
            if isinstance(nested, dict):
                return nested
            children = node.values()
        elif isinstance(node, list):
            children = node
        else:
            return None
        for child in children:
            hit = cls._locate_user(child)
            if hit is not None:
                return hit
        return None

    @classmethod
    def _find_embedded_user(cls, page_text):
        """Return the first embedded user dict that carries a username, or None.

        Every anchor occurrence is tried in turn. The JSON object following
        an anchor is decoded in full with ``raw_decode`` so that nested
        objects survive; a regex that stops at the first "}" cannot do that.
        """
        decoder = json.JSONDecoder()
        for anchor in cls._PROFILE_JSON_ANCHORS:
            for found in re.finditer(anchor, page_text):
                try:
                    data, _ = decoder.raw_decode(page_text, found.end())
                except (ValueError, RecursionError):
                    # Not valid JSON at this position; try the next occurrence.
                    continue
                if isinstance(data, dict) and "username" in data:
                    # The '"user": {...}' anchor decodes the user object itself.
                    user = data
                else:
                    user = cls._locate_user(data)
                if user is not None and user.get("username"):
                    return user
        return None

    @staticmethod
    def _nested_count(user, key):
        """Return ``user[key]["count"]``, or "" when that path is absent."""
        holder = user.get(key)
        return holder.get("count", "") if isinstance(holder, dict) else ""

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_instagram_data(self, url):
        """Extract data from an Instagram profile or post URL.

        Args:
            url: Address of the page. URLs containing ``/p/`` or ``/reel/``
                are treated as single posts, everything else as a profile.

        Returns:
            A dict with at least the keys ``url``, ``timestamp``,
            ``platform``, ``images``, ``posts``, ``profile_info`` and
            ``errors``, plus the keys contributed by the post or profile
            extractor. Never raises; failures are listed under ``errors``.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }

        try:
            if "/p/" in url or "/reel/" in url:
                extracted = self.extract_post_data(url)
            else:
                extracted = self.extract_profile_data(url)

            # Merge sub-errors instead of letting update() replace the list.
            sub_errors = extracted.pop("errors", [])
            scraped_data.update(extracted)
            scraped_data["errors"].extend(sub_errors)
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")

        # Explain the likely causes when nothing at all was found.
        found_nothing = (
            not scraped_data.get("images")
            and not scraped_data.get("posts")
            and not scraped_data.get("profile_info", {}).get("username")
        )
        if found_nothing:
            scraped_data["errors"].extend([
                "No Instagram data found. This might be due to:",
                "- Private or protected account",
                "- Instagram's anti-scraping measures",
                "- Network connectivity issues",
                "- URL format issues",
            ])

        return scraped_data

    def extract_post_data(self, url):
        """Extract data from a single Instagram post.

        Args:
            url: Address of the post page.

        Returns:
            A dict with ``post_type``, ``images`` and ``post_info``. When the
            request or the extraction fails, ``errors`` holds one message and
            the other fields keep their empty defaults.
        """
        post_data = {
            "post_type": "single_post",
            "images": [],
            "post_info": {}
        }

        try:
            response = self.session.get(url, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()

            # Instagram loads images dynamically, so the raw page source is
            # searched for image URLs rather than the parsed HTML.
            urls = self._collect_image_urls(response.text, self._PAGE_IMAGE_PATTERNS)
            post_data["images"] = self._build_image_records(urls)

            post_data["post_info"] = {
                "url": url,
                "images_count": len(post_data["images"]),
                "scraped_at": datetime.now().isoformat()
            }

        except Exception as e:
            post_data["errors"] = [f"Failed to extract post data: {str(e)}"]

        return post_data

    def extract_profile_data(self, url):
        """Extract data from an Instagram profile.

        Args:
            url: Address of the profile page.

        Returns:
            A dict with ``profile_type``, ``images``, ``profile_info`` and
            ``posts``. When the request or the extraction fails, ``errors``
            holds one message.
        """
        profile_data = {
            "profile_type": "account",
            "images": [],
            "profile_info": {},
            "posts": []
        }

        try:
            response = self.session.get(url, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            page_text = response.text

            profile_data["profile_info"] = self.extract_profile_info(soup, page_text)

            # Posts are needed first: their pages are visited below.
            profile_data["posts"] = self.extract_recent_posts(page_text)

            profile_data["images"] = self.extract_profile_images(page_text)

            # Individual post pages usually expose higher quality images.
            if profile_data["posts"]:
                post_images = self.extract_images_from_posts(profile_data["posts"], max_posts=3)
                if post_images:
                    profile_data["images"].extend(post_images)

        except Exception as e:
            profile_data["errors"] = [f"Failed to extract profile data: {str(e)}"]

        return profile_data

    def extract_profile_info(self, soup, page_text):
        """Extract profile information.

        Embedded JSON is tried first. If it yields no username, the text
        between the first pair of parentheses in the page ``<title>`` is used
        as the username.

        Args:
            soup: Parsed page (an object offering ``find('title')``), or
                ``None`` to skip the title fallback.
            page_text: Raw page source.

        Returns:
            A dict with ``username``, ``display_name``, ``bio``,
            ``followers``, ``following`` and ``posts_count`` (empty strings
            when unknown). An ``error`` key is added if extraction fails
            unexpectedly.
        """
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }

        try:
            user_data = self._find_embedded_user(page_text)
            if user_data is not None:
                profile_info["username"] = user_data.get("username") or ""
                profile_info["display_name"] = user_data.get("full_name") or ""
                profile_info["bio"] = user_data.get("biography") or ""
                profile_info["followers"] = self._nested_count(user_data, "followed_by")
                profile_info["following"] = self._nested_count(user_data, "follows")
                profile_info["posts_count"] = self._nested_count(user_data, "media")

            # Fallback: try to extract the username from the HTML title.
            if not profile_info["username"] and soup is not None:
                title_tag = soup.find('title')
                if title_tag:
                    title_text = title_tag.get_text()
                    if '(' in title_text and ')' in title_text:
                        username = title_text.split('(')[1].split(')')[0]
                        profile_info["username"] = username

        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"

        return profile_info

    def extract_profile_images(self, page_text):
        """Extract images from a profile page.

        Args:
            page_text: Raw page source of the profile.

        Returns:
            A list of image dicts (``src``, ``alt``, ``title``, ``width``,
            ``height``) in first-seen order. On failure a Streamlit error is
            shown and an empty list is returned.
        """
        images = []

        try:
            urls = self._collect_image_urls(page_text, self._PAGE_IMAGE_PATTERNS)
            images = self._build_image_records(urls)
        except Exception as e:
            st.error(f"Failed to extract profile images: {str(e)}")

        return images

    def extract_recent_posts(self, page_text, max_posts=10):
        """Extract post references from a profile page.

        Args:
            page_text: Raw page source of the profile.
            max_posts: Maximum number of posts to return.

        Returns:
            A list of dicts with ``shortcode``, ``url`` and a 1-based
            ``index``, ordered by first appearance. On failure a Streamlit
            error is shown and the posts gathered so far are returned.
        """
        posts = []

        try:
            # A dict gives ordered de-duplication, so the posts kept and
            # their indices are the same on every run.
            shortcodes = {}
            for pattern in self._SHORTCODE_PATTERNS:
                for code in re.findall(pattern, page_text):
                    shortcodes.setdefault(code, None)

            for index, post_code in enumerate(list(shortcodes)[:max_posts], start=1):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": index
                })

        except Exception as e:
            st.error(f"Failed to extract recent posts: {str(e)}")

        return posts

    def extract_images_from_posts(self, posts, max_posts=5, delay=1.0):
        """Extract images from individual post pages.

        Args:
            posts: Post dicts as produced by ``extract_recent_posts``.
            max_posts: Number of posts (from the front of *posts*) to visit.
            delay: Seconds to pause between consecutive requests.

        Returns:
            A list of image dicts, each extended with ``post_url`` and a
            1-based ``post_index``. A post that fails produces a Streamlit
            warning and is skipped.
        """
        all_images = []

        try:
            for i, post in enumerate(posts[:max_posts]):
                # Pause between requests only; never after the last one.
                if i and delay > 0:
                    time.sleep(delay)

                try:
                    post_url = post["url"]
                    response = self.session.get(post_url, timeout=self._REQUEST_TIMEOUT)
                    response.raise_for_status()

                    for img in self.extract_post_images(response.text):
                        img["post_url"] = post_url
                        img["post_index"] = i + 1
                        all_images.append(img)

                except Exception as e:
                    # .get() keeps a malformed post dict from aborting the loop.
                    st.warning(f"Failed to extract images from post {post.get('shortcode', '?')}: {str(e)}")
                    continue

        except Exception as e:
            st.error(f"Failed to extract images from posts: {str(e)}")

        return all_images

    def extract_post_images(self, page_text):
        """Extract images from a single post page.

        Args:
            page_text: Raw page source of the post.

        Returns:
            A list of image dicts (``src``, ``alt``, ``title``, ``width``,
            ``height``) in first-seen order. On failure a Streamlit error is
            shown and an empty list is returned.
        """
        images = []

        try:
            urls = self._collect_image_urls(page_text, self._POST_IMAGE_PATTERNS)
            images = self._build_image_records(urls)
        except Exception as e:
            st.error(f"Failed to extract post images: {str(e)}")

        return images

# Module-level scraper instance, created once at import time. Constructing it
# builds a single requests.Session (see InstagramScraper.__init__).
instagram_scraper = InstagramScraper()