import streamlit as st
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse


class InstagramScraper:
    """Best-effort scraper for public Instagram profiles and posts.

    Instagram renders almost everything client-side, so instead of walking a
    stable DOM this class regex-mines the raw page source for image URLs,
    post shortcodes and embedded JSON blobs.  All public methods swallow
    their own exceptions and report problems through an "errors" list (or a
    Streamlit warning/error widget) so the UI never crashes on a bad page.
    """

    # Patterns used on profile pages and single-post pages to locate image
    # URLs in the raw HTML/JSON payload.  Patterns with a capture group
    # yield just the URL; group-less patterns match the URL whole.
    _IMAGE_URL_PATTERNS = [
        # Instagram post images (high quality)
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        r'"src":"([^"]*\.jpg[^"]*)"',
        r'"src":"([^"]*\.jpeg[^"]*)"',
        r'"src":"([^"]*\.png[^"]*)"',
        # Direct image URLs
        r'https://[^"]*\.jpg[^"]*',
        r'https://[^"]*\.jpeg[^"]*',
        r'https://[^"]*\.png[^"]*',
        # Instagram CDN URLs (high quality)
        r'https://scontent[^"]*\.jpg[^"]*',
        r'https://scontent[^"]*\.jpeg[^"]*',
        r'https://scontent[^"]*\.png[^"]*',
        # Additional Instagram patterns
        r'"url":"([^"]*\.jpg[^"]*)"',
        r'"url":"([^"]*\.jpeg[^"]*)"',
        r'"url":"([^"]*\.png[^"]*)"',
    ]

    # Narrower pattern set for single-post pages, biased toward the
    # highest-quality CDN ("scontent") URLs.
    _POST_IMAGE_URL_PATTERNS = [
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        r'https://scontent[^"]*\.jpg[^"]*',
        r'https://scontent[^"]*\.jpeg[^"]*',
        r'https://scontent[^"]*\.png[^"]*',
        r'"src":"([^"]*\.jpg[^"]*)"',
        r'"src":"([^"]*\.jpeg[^"]*)"',
        r'"src":"([^"]*\.png[^"]*)"',
    ]

    def __init__(self):
        # Persistent session with browser-like headers; Instagram serves a
        # stripped-down page to clients that look like bots.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    @staticmethod
    def _find_image_urls(page_text, patterns):
        """Return a set of deduplicated, unescaped image URLs found in *page_text*.

        Only URLs hosted by Instagram ("instagram" or "scontent" in the URL)
        are kept; JSON escape sequences (\\u0026, \\/) are undone.
        """
        found = set()
        for pattern in patterns:
            for match in re.findall(pattern, page_text):
                if match and ('instagram' in match.lower() or 'scontent' in match.lower()):
                    # Clean up the URL
                    found.add(match.replace('\\u0026', '&').replace('\\/', '/'))
        return found

    @staticmethod
    def _as_image_objects(urls):
        """Wrap raw image URLs in the dict shape the rest of the app expects."""
        return [
            {
                "src": img_url,
                "alt": f"Instagram post image {i+1}",
                "title": f"Instagram post image {i+1}",
                "width": "",
                "height": ""
            }
            for i, img_url in enumerate(urls)
        ]

    def extract_instagram_data(self, url):
        """Extract data from Instagram profile or post.

        Dispatches on the URL shape (/p/ or /reel/ means a single post,
        anything else a profile) and returns a dict with "images", "posts",
        "profile_info" and a human-readable "errors" list.  Never raises.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }
        try:
            # Determine if it's a profile or post URL
            if "/p/" in url or "/reel/" in url:
                # Single post
                scraped_data.update(self.extract_post_data(url))
            else:
                # Profile
                scraped_data.update(self.extract_profile_data(url))
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")
        # Check if we found any data
        if not scraped_data.get("images") and not scraped_data.get("posts") and not scraped_data.get("profile_info", {}).get("username"):
            scraped_data["errors"].append("No Instagram data found. This might be due to:")
            scraped_data["errors"].append("- Private or protected account")
            scraped_data["errors"].append("- Instagram's anti-scraping measures")
            scraped_data["errors"].append("- Network connectivity issues")
            scraped_data["errors"].append("- URL format issues")
        return scraped_data

    def extract_post_data(self, url):
        """Extract image URLs and metadata from a single Instagram post page."""
        post_data = {
            "post_type": "single_post",
            "images": [],
            "post_info": {}
        }
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            # Instagram loads images dynamically, so scan the raw page
            # source for URL patterns instead of parsing the DOM.
            found_images = self._find_image_urls(response.text, self._IMAGE_URL_PATTERNS)
            post_data["images"] = self._as_image_objects(found_images)
            # Extract post information
            post_data["post_info"] = {
                "url": url,
                "images_count": len(post_data["images"]),
                "scraped_at": datetime.now().isoformat()
            }
        except Exception as e:
            post_data["errors"] = [f"Failed to extract post data: {str(e)}"]
        return post_data

    def extract_profile_data(self, url):
        """Extract profile info, recent posts and images from a profile page."""
        profile_data = {
            "profile_type": "account",
            "images": [],
            "profile_info": {},
            "posts": []
        }
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            page_text = response.text
            # Extract profile information
            profile_data["profile_info"] = self.extract_profile_info(soup, page_text)
            # Extract recent posts first
            profile_data["posts"] = self.extract_recent_posts(page_text)
            # Extract images from profile page
            profile_data["images"] = self.extract_profile_images(page_text)
            # Extract images from individual posts (higher quality)
            if profile_data["posts"]:
                post_images = self.extract_images_from_posts(profile_data["posts"], max_posts=3)
                if post_images:
                    profile_data["images"].extend(post_images)
        except Exception as e:
            profile_data["errors"] = [f"Failed to extract profile data: {str(e)}"]
        return profile_data

    def extract_profile_info(self, soup, page_text):
        """Extract username/bio/counters from embedded JSON, with an HTML fallback."""
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }
        try:
            # Instagram embeds profile data as JSON in the page source.
            # NOTE(review): {[^}]+} cannot match nested braces, so these
            # patterns only catch flat objects — kept as best-effort.
            json_patterns = [
                r'window\._sharedData\s*=\s*({[^}]+})',
                r'"profile_page":\s*({[^}]+})',
                r'"user":\s*({[^}]+})'
            ]
            for pattern in json_patterns:
                matches = re.findall(pattern, page_text)
                if matches:
                    try:
                        data = json.loads(matches[0])
                        # Extract profile info from JSON
                        if "user" in data:
                            user_data = data["user"]
                            profile_info["username"] = user_data.get("username", "")
                            profile_info["display_name"] = user_data.get("full_name", "")
                            profile_info["bio"] = user_data.get("biography", "")
                            profile_info["followers"] = user_data.get("followed_by", {}).get("count", "")
                            profile_info["following"] = user_data.get("follows", {}).get("count", "")
                            profile_info["posts_count"] = user_data.get("media", {}).get("count", "")
                    except (json.JSONDecodeError, AttributeError, TypeError, KeyError):
                        # Candidate was not valid/expected JSON; try the next pattern.
                        continue
            # Fallback: the <title> on profile pages looks like "Name (@user) ...".
            if not profile_info["username"]:
                title_tag = soup.find('title')
                if title_tag:
                    title_text = title_tag.get_text()
                    if '(' in title_text and ')' in title_text:
                        username = title_text.split('(')[1].split(')')[0]
                        profile_info["username"] = username
        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"
        return profile_info

    def extract_profile_images(self, page_text):
        """Extract image objects from a profile page's raw source."""
        images = []
        try:
            # Instagram stores post images in JSON data inside the page source.
            found_images = self._find_image_urls(page_text, self._IMAGE_URL_PATTERNS)
            images = self._as_image_objects(found_images)
        except Exception as e:
            st.error(f"Failed to extract profile images: {str(e)}")
        return images

    def extract_recent_posts(self, page_text):
        """Extract up to 10 recent post shortcodes/URLs from a profile page."""
        posts = []
        try:
            # Shortcodes appear both in embedded JSON and in /p/ or /reel/ links.
            post_patterns = [
                r'"shortcode":"([^"]+)"',
                r'/p/([^/"]+)',
                r'/reel/([^/"]+)'
            ]
            found_posts = set()
            for pattern in post_patterns:
                for match in re.findall(pattern, page_text):
                    if match:
                        found_posts.add(match)
            # Convert set to list and limit to 10 posts
            for i, post_code in enumerate(list(found_posts)[:10]):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": i + 1
                })
        except Exception as e:
            st.error(f"Failed to extract recent posts: {str(e)}")
        return posts

    def extract_images_from_posts(self, posts, max_posts=5):
        """Fetch up to *max_posts* individual post pages and collect their images.

        Each returned image dict is annotated with "post_url" and
        "post_index" so the caller can trace it back to its post.
        """
        all_images = []
        try:
            for i, post in enumerate(posts[:max_posts]):
                try:
                    # Get the post page
                    post_url = post["url"]
                    response = self.session.get(post_url, timeout=10)
                    response.raise_for_status()
                    # Extract images from this post
                    post_images = self.extract_post_images(response.text)
                    # Add post context to images
                    for img in post_images:
                        img["post_url"] = post_url
                        img["post_index"] = i + 1
                        all_images.append(img)
                    # Small delay to be respectful
                    time.sleep(1)
                except Exception as e:
                    st.warning(f"Failed to extract images from post {post['shortcode']}: {str(e)}")
                    continue
        except Exception as e:
            st.error(f"Failed to extract images from posts: {str(e)}")
        return all_images

    def extract_post_images(self, page_text):
        """Extract high-quality image objects from a single post page's source."""
        images = []
        try:
            found_images = self._find_image_urls(page_text, self._POST_IMAGE_URL_PATTERNS)
            images = self._as_image_objects(found_images)
        except Exception as e:
            st.error(f"Failed to extract post images: {str(e)}")
        return images


# Global Instagram scraper instance
instagram_scraper = InstagramScraper()