# Scrape-Anythings / instagram_scraper.py
# PHOROTHA913's picture
# Upload 9 files
# 5c3dc0d verified
import streamlit as st
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse
class InstagramScraper:
    """Best-effort scraper for Instagram profile and post pages.

    Pages are fetched with a shared ``requests`` session and mined with
    regular expressions and embedded-JSON parsing. Every public method
    reports failures (in the returned dict or via Streamlit) instead of
    raising, so callers always get a result structure back.
    """

    # Default per-request timeout in seconds; ``__init__`` may override it.
    timeout = 10

    # Image patterns for a full page (profile page or post fetched directly).
    # Patterns with a group yield the quoted JSON value; the bare pattern
    # yields the whole URL and stops at quotes, whitespace and angle brackets
    # so a match cannot run across unrelated markup.
    _PAGE_IMAGE_PATTERNS = tuple(re.compile(p) for p in (
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        r'"src":"([^"]*\.(?:jpe?g|png)[^"]*)"',
        r'https://[^"\s<>]*\.(?:jpe?g|png)[^"\s<>]*',
        r'"url":"([^"]*\.(?:jpe?g|png)[^"]*)"',
    ))

    # Narrower pattern set used for pages of individual posts reached from a
    # profile: only CDN ("scontent") bare URLs, no generic "url" values.
    _POST_IMAGE_PATTERNS = tuple(re.compile(p) for p in (
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        r'https://scontent[^"\s<>]*\.(?:jpe?g|png)[^"\s<>]*',
        r'"src":"([^"]*\.(?:jpe?g|png)[^"]*)"',
    ))

    # Post shortcode patterns. The URL forms only accept URL-safe shortcode
    # characters so a match cannot spill past a single-quoted attribute.
    _POST_PATTERNS = tuple(re.compile(p) for p in (
        r'"shortcode":"([^"]+)"',
        r'/p/([A-Za-z0-9_-]+)',
        r'/reel/([A-Za-z0-9_-]+)',
    ))

    # Anchors that end right before an embedded JSON object. The object is
    # then decoded with ``raw_decode`` so nested braces are handled correctly
    # (a ``{[^}]+}`` regex would cut the object at its first ``}``).
    _JSON_ANCHORS = tuple(re.compile(p) for p in (
        r'window\._sharedData\s*=\s*(?=\{)',
        r'"profile_page":\s*(?=\{)',
        r'"user":\s*(?=\{)',
    ))

    def __init__(self, timeout=10):
        """Create the HTTP session with browser-like default headers.

        Args:
            timeout: Seconds to wait for each HTTP request.
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _clean_url(raw):
        """Undo the JSON escaping (``\\u0026`` and ``\\/``) found in page source."""
        return raw.replace('\\u0026', '&').replace('\\/', '/')

    @classmethod
    def _find_image_urls(cls, page_text, patterns):
        """Return unique Instagram/CDN image URLs in first-seen order.

        A dict is used as an ordered set so the numbering of the resulting
        images is stable between runs (iterating a ``set`` is not).
        """
        ordered = {}
        for pattern in patterns:
            for match in pattern.findall(page_text):
                lowered = match.lower()
                if 'instagram' in lowered or 'scontent' in lowered:
                    ordered.setdefault(cls._clean_url(match), None)
        return list(ordered)

    @staticmethod
    def _to_image_objects(urls):
        """Wrap URLs in the image-dict shape used throughout the result."""
        return [
            {
                "src": img_url,
                "alt": f"Instagram post image {number}",
                "title": f"Instagram post image {number}",
                "width": "",
                "height": "",
            }
            for number, img_url in enumerate(urls, start=1)
        ]

    @staticmethod
    def _locate_user(data):
        """Return the user dict held by *data*, or ``None``.

        *data* is either a wrapper with a ``"user"`` key or the user object
        itself (which is what the ``"user":`` anchor decodes to).
        """
        if not isinstance(data, dict):
            return None
        nested = data.get("user")
        if isinstance(nested, dict):
            return nested
        if "username" in data:
            return data
        return None

    @classmethod
    def _find_user_json(cls, page_text):
        """Return the first embedded user object that has a username."""
        decoder = json.JSONDecoder()
        for anchor in cls._JSON_ANCHORS:
            for found in anchor.finditer(page_text):
                try:
                    data, _ = decoder.raw_decode(page_text, found.end())
                except json.JSONDecodeError:
                    # Not valid JSON at this anchor; try the next candidate.
                    continue
                user = cls._locate_user(data)
                if user is not None and user.get("username"):
                    return user
        return None

    @staticmethod
    def _count_of(user, key):
        """Read ``user[key]["count"]``, tolerating missing or non-dict nodes."""
        node = user.get(key)
        if isinstance(node, dict):
            return node.get("count", "")
        return ""

    @staticmethod
    def _username_from_title(title_text):
        """Extract a handle from a page title such as ``Name (@handle) ...``.

        Prefers a parenthesised ``@handle`` (returned without the ``@``);
        otherwise falls back to the first parenthesised text. Returns ``""``
        when the title has no parenthesised part.
        """
        handle = re.search(r'\(@([^)\s]+)\)', title_text)
        if handle:
            return handle.group(1)
        generic = re.search(r'\(([^)]*)\)', title_text)
        return generic.group(1).strip() if generic else ""

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_instagram_data(self, url):
        """Extract data from an Instagram profile or post URL.

        URLs containing ``/p/`` or ``/reel/`` are treated as single posts,
        everything else as a profile.

        Returns:
            A dict with at least ``url``, ``timestamp``, ``platform``,
            ``images``, ``posts``, ``profile_info`` and ``errors``, plus
            whatever keys the post/profile extractor adds. Never raises;
            problems are appended to ``errors``.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }
        try:
            if "/p/" in url or "/reel/" in url:
                extracted = self.extract_post_data(url)
            else:
                extracted = self.extract_profile_data(url)
            # Merge errors instead of letting update() replace the list.
            scraped_data["errors"].extend(extracted.pop("errors", []))
            scraped_data.update(extracted)
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")

        found_something = (
            scraped_data.get("images")
            or scraped_data.get("posts")
            or scraped_data.get("profile_info", {}).get("username")
        )
        if not found_something:
            scraped_data["errors"].extend([
                "No Instagram data found. This might be due to:",
                "- Private or protected account",
                "- Instagram's anti-scraping measures",
                "- Network connectivity issues",
                "- URL format issues",
            ])
        return scraped_data

    def extract_post_data(self, url):
        """Fetch a single post page and collect its image URLs.

        Returns:
            A dict with ``post_type``, ``images`` and ``post_info``; an
            ``errors`` list is added only when fetching or parsing failed.
        """
        post_data = {
            "post_type": "single_post",
            "images": [],
            "post_info": {}
        }
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            urls = self._find_image_urls(response.text, self._PAGE_IMAGE_PATTERNS)
            post_data["images"] = self._to_image_objects(urls)
            post_data["post_info"] = {
                "url": url,
                "images_count": len(post_data["images"]),
                "scraped_at": datetime.now().isoformat()
            }
        except Exception as e:
            post_data["errors"] = [f"Failed to extract post data: {str(e)}"]
        return post_data

    def extract_profile_data(self, url):
        """Fetch a profile page and collect profile info, posts and images.

        Images found on the profile page come first, followed by images
        from up to three of the discovered posts (each fetched separately).

        Returns:
            A dict with ``profile_type``, ``images``, ``profile_info`` and
            ``posts``; an ``errors`` list is added only on failure.
        """
        profile_data = {
            "profile_type": "account",
            "images": [],
            "profile_info": {},
            "posts": []
        }
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            page_text = response.text
            soup = BeautifulSoup(page_text, 'html.parser')

            profile_data["profile_info"] = self.extract_profile_info(soup, page_text)
            profile_data["posts"] = self.extract_recent_posts(page_text)
            profile_data["images"] = self.extract_profile_images(page_text)

            if profile_data["posts"]:
                post_images = self.extract_images_from_posts(profile_data["posts"], max_posts=3)
                if post_images:
                    profile_data["images"].extend(post_images)
        except Exception as e:
            profile_data["errors"] = [f"Failed to extract profile data: {str(e)}"]
        return profile_data

    def extract_profile_info(self, soup, page_text):
        """Extract profile fields from embedded JSON, else from the page title.

        Args:
            soup: Parsed page; only consulted (``soup.find('title')``) when
                no username was found in the embedded JSON.
            page_text: Raw page source searched for embedded JSON.

        Returns:
            A dict with ``username``, ``display_name``, ``bio``,
            ``followers``, ``following`` and ``posts_count`` (empty strings
            when unknown); an ``error`` key is added on unexpected failure.
        """
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }
        try:
            user = self._find_user_json(page_text)
            if user is not None:
                profile_info["username"] = user.get("username") or ""
                profile_info["display_name"] = user.get("full_name") or ""
                profile_info["bio"] = user.get("biography") or ""
                profile_info["followers"] = self._count_of(user, "followed_by")
                profile_info["following"] = self._count_of(user, "follows")
                profile_info["posts_count"] = self._count_of(user, "media")

            # Fallback: try to extract the handle from the HTML <title>.
            if not profile_info["username"]:
                title_tag = soup.find('title')
                if title_tag:
                    profile_info["username"] = self._username_from_title(title_tag.get_text())
        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"
        return profile_info

    def extract_profile_images(self, page_text):
        """Extract image objects from a profile page's source.

        Returns a list of image dicts in first-seen order; on failure the
        error is shown via ``st.error`` and an empty list is returned.
        """
        images = []
        try:
            urls = self._find_image_urls(page_text, self._PAGE_IMAGE_PATTERNS)
            images = self._to_image_objects(urls)
        except Exception as e:
            st.error(f"Failed to extract profile images: {str(e)}")
        return images

    def extract_recent_posts(self, page_text, limit=10):
        """Extract post shortcodes from a profile page's source.

        Shortcodes are de-duplicated while keeping the order in which they
        were found, so the ``limit`` cut keeps the first ones found rather
        than an arbitrary subset.

        Args:
            page_text: Raw page source.
            limit: Maximum number of posts to return.

        Returns:
            A list of ``{"shortcode", "url", "index"}`` dicts; on failure
            the error is shown via ``st.error`` and the list may be empty.
        """
        posts = []
        try:
            ordered = {}
            for pattern in self._POST_PATTERNS:
                for code in pattern.findall(page_text):
                    if code:
                        ordered.setdefault(code, None)
            for position, post_code in enumerate(list(ordered)[:limit], start=1):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": position
                })
        except Exception as e:
            st.error(f"Failed to extract recent posts: {str(e)}")
        return posts

    def extract_images_from_posts(self, posts, max_posts=5, delay=1.0):
        """Fetch individual post pages and collect their images.

        Args:
            posts: Post dicts as produced by ``extract_recent_posts``; each
                needs a ``"url"`` key.
            max_posts: Only the first ``max_posts`` entries are fetched.
            delay: Seconds to pause between consecutive requests.

        Returns:
            Image dicts annotated with ``post_url`` and ``post_index``.
            A failing post is reported via ``st.warning`` and skipped.
        """
        all_images = []
        try:
            for position, post in enumerate(posts[:max_posts], start=1):
                # Pause between requests only, never before the first one or
                # after the last one.
                if position > 1 and delay and delay > 0:
                    time.sleep(delay)
                try:
                    post_url = post["url"]
                    response = self.session.get(post_url, timeout=self.timeout)
                    response.raise_for_status()
                    for img in self.extract_post_images(response.text):
                        img["post_url"] = post_url
                        img["post_index"] = position
                        all_images.append(img)
                except Exception as e:
                    # Build the label defensively: raising here would abort
                    # the remaining posts instead of skipping just this one.
                    label = post.get("shortcode", "?") if isinstance(post, dict) else "?"
                    st.warning(f"Failed to extract images from post {label}: {str(e)}")
                    continue
        except Exception as e:
            st.error(f"Failed to extract images from posts: {str(e)}")
        return all_images

    def extract_post_images(self, page_text):
        """Extract image objects from the source of a single post page.

        Returns a list of image dicts in first-seen order; on failure the
        error is shown via ``st.error`` and an empty list is returned.
        """
        images = []
        try:
            urls = self._find_image_urls(page_text, self._POST_IMAGE_PATTERNS)
            images = self._to_image_objects(urls)
        except Exception as e:
            st.error(f"Failed to extract post images: {str(e)}")
        return images
# Module-level InstagramScraper instance, constructed once at import time
# with the default constructor arguments.
instagram_scraper = InstagramScraper()