# Spaces:
# Sleeping
# Sleeping
| import streamlit as st | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import json | |
| import re | |
| import time | |
| from datetime import datetime | |
| from urllib.parse import urljoin, urlparse | |
class InstagramScraper:
    """Scrapes public Instagram profile and post pages via plain HTTP requests."""

    def __init__(self):
        """Create a requests session that presents itself as a desktop browser."""
        # Instagram serves a stripped-down page (or blocks the request) for
        # clients without browser-like headers, so mimic desktop Chrome.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session = requests.Session()
        self.session.headers.update(browser_headers)
| def extract_instagram_data(self, url): | |
| """Extract data from Instagram profile or post""" | |
| scraped_data = { | |
| "url": url, | |
| "timestamp": datetime.now().isoformat(), | |
| "platform": "instagram", | |
| "images": [], | |
| "posts": [], | |
| "profile_info": {}, | |
| "errors": [] | |
| } | |
| try: | |
| # Determine if it's a profile or post URL | |
| if "/p/" in url or "/reel/" in url: | |
| # Single post | |
| scraped_data.update(self.extract_post_data(url)) | |
| else: | |
| # Profile | |
| scraped_data.update(self.extract_profile_data(url)) | |
| except Exception as e: | |
| scraped_data["errors"].append(f"Instagram scraping error: {str(e)}") | |
| # Check if we found any data | |
| if not scraped_data.get("images") and not scraped_data.get("posts") and not scraped_data.get("profile_info", {}).get("username"): | |
| scraped_data["errors"].append("No Instagram data found. This might be due to:") | |
| scraped_data["errors"].append("- Private or protected account") | |
| scraped_data["errors"].append("- Instagram's anti-scraping measures") | |
| scraped_data["errors"].append("- Network connectivity issues") | |
| scraped_data["errors"].append("- URL format issues") | |
| return scraped_data | |
| def extract_post_data(self, url): | |
| """Extract data from a single Instagram post""" | |
| post_data = { | |
| "post_type": "single_post", | |
| "images": [], | |
| "post_info": {} | |
| } | |
| try: | |
| response = self.session.get(url, timeout=10) | |
| response.raise_for_status() | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| # Look for image URLs in the page | |
| # Instagram loads images dynamically, so we need to look for patterns | |
| page_text = response.text | |
| # Find image URLs in the page source | |
| image_patterns = [ | |
| # Instagram post images (high quality) | |
| r'"display_url":"([^"]+)"', | |
| r'"display_src":"([^"]+)"', | |
| r'"src":"([^"]*\.jpg[^"]*)"', | |
| r'"src":"([^"]*\.jpeg[^"]*)"', | |
| r'"src":"([^"]*\.png[^"]*)"', | |
| # Direct image URLs | |
| r'https://[^"]*\.jpg[^"]*', | |
| r'https://[^"]*\.jpeg[^"]*', | |
| r'https://[^"]*\.png[^"]*', | |
| # Instagram CDN URLs (high quality) | |
| r'https://scontent[^"]*\.jpg[^"]*', | |
| r'https://scontent[^"]*\.jpeg[^"]*', | |
| r'https://scontent[^"]*\.png[^"]*', | |
| # Additional Instagram patterns | |
| r'"url":"([^"]*\.jpg[^"]*)"', | |
| r'"url":"([^"]*\.jpeg[^"]*)"', | |
| r'"url":"([^"]*\.png[^"]*)"' | |
| ] | |
| found_images = set() | |
| for pattern in image_patterns: | |
| matches = re.findall(pattern, page_text) | |
| for match in matches: | |
| if match and ('instagram' in match.lower() or 'scontent' in match.lower()): | |
| # Clean up the URL | |
| clean_url = match.replace('\\u0026', '&').replace('\\/', '/') | |
| found_images.add(clean_url) | |
| # Convert to image objects | |
| for i, img_url in enumerate(list(found_images)): | |
| post_data["images"].append({ | |
| "src": img_url, | |
| "alt": f"Instagram post image {i+1}", | |
| "title": f"Instagram post image {i+1}", | |
| "width": "", | |
| "height": "" | |
| }) | |
| # Extract post information | |
| post_data["post_info"] = { | |
| "url": url, | |
| "images_count": len(post_data["images"]), | |
| "scraped_at": datetime.now().isoformat() | |
| } | |
| except Exception as e: | |
| post_data["errors"] = [f"Failed to extract post data: {str(e)}"] | |
| return post_data | |
| def extract_profile_data(self, url): | |
| """Extract data from Instagram profile""" | |
| profile_data = { | |
| "profile_type": "account", | |
| "images": [], | |
| "profile_info": {}, | |
| "posts": [] | |
| } | |
| try: | |
| response = self.session.get(url, timeout=10) | |
| response.raise_for_status() | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| page_text = response.text | |
| # Extract profile information | |
| profile_data["profile_info"] = self.extract_profile_info(soup, page_text) | |
| # Extract recent posts first | |
| profile_data["posts"] = self.extract_recent_posts(page_text) | |
| # Extract images from profile page | |
| profile_data["images"] = self.extract_profile_images(page_text) | |
| # Extract images from individual posts (higher quality) | |
| if profile_data["posts"]: | |
| post_images = self.extract_images_from_posts(profile_data["posts"], max_posts=3) | |
| if post_images: | |
| profile_data["images"].extend(post_images) | |
| except Exception as e: | |
| profile_data["errors"] = [f"Failed to extract profile data: {str(e)}"] | |
| return profile_data | |
| def extract_profile_info(self, soup, page_text): | |
| """Extract profile information""" | |
| profile_info = { | |
| "username": "", | |
| "display_name": "", | |
| "bio": "", | |
| "followers": "", | |
| "following": "", | |
| "posts_count": "" | |
| } | |
| try: | |
| # Look for profile information in the page source | |
| # Instagram loads this data dynamically, so we need to parse JSON | |
| # Find JSON data in the page | |
| json_patterns = [ | |
| r'window\._sharedData\s*=\s*({[^}]+})', | |
| r'"profile_page":\s*({[^}]+})', | |
| r'"user":\s*({[^}]+})' | |
| ] | |
| for pattern in json_patterns: | |
| matches = re.findall(pattern, page_text) | |
| if matches: | |
| try: | |
| data = json.loads(matches[0]) | |
| # Extract profile info from JSON | |
| if "user" in data: | |
| user_data = data["user"] | |
| profile_info["username"] = user_data.get("username", "") | |
| profile_info["display_name"] = user_data.get("full_name", "") | |
| profile_info["bio"] = user_data.get("biography", "") | |
| profile_info["followers"] = user_data.get("followed_by", {}).get("count", "") | |
| profile_info["following"] = user_data.get("follows", {}).get("count", "") | |
| profile_info["posts_count"] = user_data.get("media", {}).get("count", "") | |
| except: | |
| continue | |
| # Fallback: try to extract from HTML | |
| if not profile_info["username"]: | |
| title_tag = soup.find('title') | |
| if title_tag: | |
| title_text = title_tag.get_text() | |
| if '(' in title_text and ')' in title_text: | |
| username = title_text.split('(')[1].split(')')[0] | |
| profile_info["username"] = username | |
| except Exception as e: | |
| profile_info["error"] = f"Failed to extract profile info: {str(e)}" | |
| return profile_info | |
| def extract_profile_images(self, page_text): | |
| """Extract images from profile page""" | |
| images = [] | |
| try: | |
| # Look for Instagram post images in the page source | |
| # Instagram stores post images in JSON data | |
| image_patterns = [ | |
| # Instagram post images (high quality) | |
| r'"display_url":"([^"]+)"', | |
| r'"display_src":"([^"]+)"', | |
| r'"src":"([^"]*\.jpg[^"]*)"', | |
| r'"src":"([^"]*\.jpeg[^"]*)"', | |
| r'"src":"([^"]*\.png[^"]*)"', | |
| # Direct image URLs | |
| r'https://[^"]*\.jpg[^"]*', | |
| r'https://[^"]*\.jpeg[^"]*', | |
| r'https://[^"]*\.png[^"]*', | |
| # Instagram CDN URLs | |
| r'https://scontent[^"]*\.jpg[^"]*', | |
| r'https://scontent[^"]*\.jpeg[^"]*', | |
| r'https://scontent[^"]*\.png[^"]*', | |
| # Additional Instagram patterns | |
| r'"url":"([^"]*\.jpg[^"]*)"', | |
| r'"url":"([^"]*\.jpeg[^"]*)"', | |
| r'"url":"([^"]*\.png[^"]*)"' | |
| ] | |
| found_images = set() | |
| for pattern in image_patterns: | |
| matches = re.findall(pattern, page_text) | |
| for match in matches: | |
| if match and ('instagram' in match.lower() or 'scontent' in match.lower()): | |
| # Clean up the URL | |
| clean_url = match.replace('\\u0026', '&').replace('\\/', '/') | |
| found_images.add(clean_url) | |
| # Convert to image objects | |
| for i, img_url in enumerate(list(found_images)): | |
| images.append({ | |
| "src": img_url, | |
| "alt": f"Instagram post image {i+1}", | |
| "title": f"Instagram post image {i+1}", | |
| "width": "", | |
| "height": "" | |
| }) | |
| except Exception as e: | |
| st.error(f"Failed to extract profile images: {str(e)}") | |
| return images | |
| def extract_recent_posts(self, page_text): | |
| """Extract recent posts from profile""" | |
| posts = [] | |
| try: | |
| # Look for post URLs in the page source | |
| post_patterns = [ | |
| r'"shortcode":"([^"]+)"', | |
| r'/p/([^/"]+)', | |
| r'/reel/([^/"]+)' | |
| ] | |
| found_posts = set() | |
| for pattern in post_patterns: | |
| matches = re.findall(pattern, page_text) | |
| for match in matches: | |
| if match: | |
| found_posts.add(match) | |
| # Convert to post objects | |
| for i, post_code in enumerate(list(found_posts)[:10]): # Convert set to list and limit to 10 posts | |
| posts.append({ | |
| "shortcode": post_code, | |
| "url": f"https://www.instagram.com/p/{post_code}/", | |
| "index": i + 1 | |
| }) | |
| except Exception as e: | |
| st.error(f"Failed to extract recent posts: {str(e)}") | |
| return posts | |
| def extract_images_from_posts(self, posts, max_posts=5): | |
| """Extract images from individual posts""" | |
| all_images = [] | |
| try: | |
| for i, post in enumerate(posts[:max_posts]): | |
| try: | |
| # Get the post page | |
| post_url = post["url"] | |
| response = self.session.get(post_url, timeout=10) | |
| response.raise_for_status() | |
| # Extract images from this post | |
| post_images = self.extract_post_images(response.text) | |
| # Add post context to images | |
| for img in post_images: | |
| img["post_url"] = post_url | |
| img["post_index"] = i + 1 | |
| all_images.append(img) | |
| # Small delay to be respectful | |
| time.sleep(1) | |
| except Exception as e: | |
| st.warning(f"Failed to extract images from post {post['shortcode']}: {str(e)}") | |
| continue | |
| except Exception as e: | |
| st.error(f"Failed to extract images from posts: {str(e)}") | |
| return all_images | |
| def extract_post_images(self, page_text): | |
| """Extract images from a single post page""" | |
| images = [] | |
| try: | |
| # Look for high-quality Instagram post images | |
| image_patterns = [ | |
| # Instagram post images (high quality) | |
| r'"display_url":"([^"]+)"', | |
| r'"display_src":"([^"]+)"', | |
| # Instagram CDN URLs (highest quality) | |
| r'https://scontent[^"]*\.jpg[^"]*', | |
| r'https://scontent[^"]*\.jpeg[^"]*', | |
| r'https://scontent[^"]*\.png[^"]*', | |
| # Additional patterns | |
| r'"src":"([^"]*\.jpg[^"]*)"', | |
| r'"src":"([^"]*\.jpeg[^"]*)"', | |
| r'"src":"([^"]*\.png[^"]*)"' | |
| ] | |
| found_images = set() | |
| for pattern in image_patterns: | |
| matches = re.findall(pattern, page_text) | |
| for match in matches: | |
| if match and ('scontent' in match.lower() or 'instagram' in match.lower()): | |
| # Clean up the URL | |
| clean_url = match.replace('\\u0026', '&').replace('\\/', '/') | |
| found_images.add(clean_url) | |
| # Convert to image objects | |
| for i, img_url in enumerate(list(found_images)): | |
| images.append({ | |
| "src": img_url, | |
| "alt": f"Instagram post image {i+1}", | |
| "title": f"Instagram post image {i+1}", | |
| "width": "", | |
| "height": "" | |
| }) | |
| except Exception as e: | |
| st.error(f"Failed to extract post images: {str(e)}") | |
| return images | |
# Global Instagram scraper instance shared by the app. Constructed at import
# time; __init__ only builds the requests session and sets headers, so no
# network I/O happens here.
instagram_scraper = InstagramScraper()