Spaces:

PHOROTHA913
/

Scrape-Anythings

Sleeping

App Files Files Community

Scrape-Anythings / instagram_scraper.py

PHOROTHA913

Upload 9 files

5c3dc0d verified 5 months ago

raw

history blame contribute delete

15 kB

	import streamlit as st
	import requests
	from bs4 import BeautifulSoup
	import json
	import re
	import time
	from datetime import datetime
	from urllib.parse import urljoin, urlparse

	class InstagramScraper:
	    """Best-effort scraper for public Instagram profile and post pages.

	    Pages are fetched through a ``requests.Session`` that sends browser-like
	    headers, and the raw page source is scanned with regular expressions for
	    image URLs, post shortcodes and embedded JSON describing the profile.
	    The public ``extract_*`` methods trap exceptions instead of raising: the
	    dict-returning ones record the failure in the returned dict, the
	    list-returning ones report it through Streamlit and return what they have.
	    """

	    # Seconds to wait for each HTTP response.
	    REQUEST_TIMEOUT = 10
	    # Seconds to pause between consecutive post-page requests.
	    POST_FETCH_DELAY = 1
	    # Maximum number of posts listed for a profile page.
	    MAX_LISTED_POSTS = 10

	    # Patterns applied to profile pages and to directly requested post pages.
	    # A pattern with a capture group yields the group; one without yields the
	    # whole match (``re.findall`` semantics).
	    _PAGE_IMAGE_PATTERNS = (
	        re.compile(r'"display_url":"([^"]+)"'),
	        re.compile(r'"display_src":"([^"]+)"'),
	        re.compile(r'"src":"([^"]*\.(?:jpe?g|png)[^"]*)"'),
	        # Bare URLs: stop at quotes, whitespace and angle brackets so that a
	        # srcset list or surrounding markup is not swallowed into one match.
	        re.compile(r'https://[^"\s<>]*\.(?:jpe?g|png)[^"\s<>]*'),
	        re.compile(r'"url":"([^"]*\.(?:jpe?g|png)[^"]*)"'),
	    )

	    # Narrower pattern set used for post pages reached from a profile.
	    _POST_IMAGE_PATTERNS = (
	        re.compile(r'"display_url":"([^"]+)"'),
	        re.compile(r'"display_src":"([^"]+)"'),
	        re.compile(r'https://scontent[^"\s<>]*\.(?:jpe?g|png)[^"\s<>]*'),
	        re.compile(r'"src":"([^"]*\.(?:jpe?g|png)[^"]*)"'),
	    )

	    # Shortcodes are restricted to URL-safe characters so that a trailing
	    # query string (``/p/ABC?x=1``) is not captured as part of the code.
	    _SHORTCODE_PATTERNS = (
	        re.compile(r'"shortcode":"([^"]+)"'),
	        re.compile(r'/p/([A-Za-z0-9_-]+)'),
	        re.compile(r'/reel/([A-Za-z0-9_-]+)'),
	    )

	    # Each anchor ends immediately before the opening brace of a JSON object;
	    # the object itself is decoded with ``JSONDecoder.raw_decode`` because a
	    # regex cannot delimit nested braces.
	    _JSON_ANCHORS = (
	        re.compile(r'window\._sharedData\s*=\s*(?=\{)'),
	        re.compile(r'"profile_page":\s*(?=\{)'),
	        re.compile(r'"user":\s*(?=\{)'),
	    )

	    def __init__(self):
	        """Create the HTTP session and install browser-like request headers."""
	        self.session = requests.Session()
	        self.session.headers.update({
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
	            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	            'Accept-Language': 'en-US,en;q=0.5',
	            'Accept-Encoding': 'gzip, deflate',
	            'Connection': 'keep-alive',
	            'Upgrade-Insecure-Requests': '1',
	        })

	    # ------------------------------------------------------------------
	    # Private helpers
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _clean_url(raw_url):
	        """Undo the JSON escaping (``\\u0026`` and ``\\/``) found in page source."""
	        return raw_url.replace('\\u0026', '&').replace('\\/', '/')

	    def _find_image_urls(self, page_text, patterns):
	        """Return unique cleaned image URLs in first-seen order.

	        Only candidates containing ``instagram`` or ``scontent`` are kept.
	        A dict is used for de-duplication so the result order is stable.
	        """
	        unique_urls = {}
	        for pattern in patterns:
	            for candidate in pattern.findall(page_text):
	                lowered = candidate.lower()
	                if 'instagram' in lowered or 'scontent' in lowered:
	                    unique_urls.setdefault(self._clean_url(candidate), None)
	        return list(unique_urls)

	    @staticmethod
	    def _build_image_records(urls):
	        """Wrap each URL in the image dict shape used throughout the results."""
	        return [
	            {
	                "src": img_url,
	                "alt": f"Instagram post image {number}",
	                "title": f"Instagram post image {number}",
	                "width": "",
	                "height": "",
	            }
	            for number, img_url in enumerate(urls, start=1)
	        ]

	    def _find_user_object(self, page_text):
	        """Return the first embedded JSON user dict with a username, or None.

	        The decoded object is used directly when it carries a ``username``;
	        if it has a nested ``"user"`` dict, that nested dict is used instead.
	        """
	        decoder = json.JSONDecoder()
	        for anchor in self._JSON_ANCHORS:
	            for hit in anchor.finditer(page_text):
	                try:
	                    data, _ = decoder.raw_decode(page_text, hit.end())
	                except ValueError:
	                    # Not valid JSON at this position; try the next candidate.
	                    continue
	                if not isinstance(data, dict):
	                    continue
	                user = data.get("user")
	                if not isinstance(user, dict):
	                    user = data
	                if user.get("username"):
	                    return user
	        return None

	    @staticmethod
	    def _edge_count(user, *keys):
	        """Return ``user[key]["count"]`` for the first usable key, else ``""``."""
	        for key in keys:
	            edge = user.get(key)
	            if isinstance(edge, dict) and "count" in edge:
	                return edge["count"]
	        return ""

	    # ------------------------------------------------------------------
	    # Public API
	    # ------------------------------------------------------------------

	    def extract_instagram_data(self, url):
	        """Extract data from Instagram profile or post.

	        Args:
	            url: Page URL. URLs containing ``/p/`` or ``/reel/`` are handled
	                as single posts, everything else as a profile.

	        Returns:
	            A dict with ``url``, ``timestamp``, ``platform``, ``images``,
	            ``posts``, ``profile_info`` and ``errors``, plus whatever keys the
	            post/profile extractor adds. Failures are reported in ``errors``;
	            no exception is raised.
	        """
	        scraped_data = {
	            "url": url,
	            "timestamp": datetime.now().isoformat(),
	            "platform": "instagram",
	            "images": [],
	            "posts": [],
	            "profile_info": {},
	            "errors": []
	        }

	        try:
	            if "/p/" in url or "/reel/" in url:
	                result = self.extract_post_data(url)
	            else:
	                result = self.extract_profile_data(url)

	            # Merge extractor errors into the shared list rather than letting
	            # dict.update replace it.
	            nested_errors = result.pop("errors", [])
	            scraped_data.update(result)
	            scraped_data["errors"].extend(nested_errors)

	        except Exception as e:
	            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")

	        # Explain the most likely causes when nothing at all was found.
	        found_username = scraped_data.get("profile_info", {}).get("username")
	        if not (scraped_data.get("images") or scraped_data.get("posts") or found_username):
	            scraped_data["errors"].extend([
	                "No Instagram data found. This might be due to:",
	                "- Private or protected account",
	                "- Instagram's anti-scraping measures",
	                "- Network connectivity issues",
	                "- URL format issues",
	            ])

	        return scraped_data

	    def extract_post_data(self, url):
	        """Extract data from a single Instagram post.

	        Returns:
	            A dict with ``post_type``, ``images`` and ``post_info``; on failure
	            an ``errors`` list describing the problem is added.
	        """
	        post_data = {
	            "post_type": "single_post",
	            "images": [],
	            "post_info": {}
	        }

	        try:
	            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
	            response.raise_for_status()

	            # Image URLs are embedded in the page source, so scan the raw text.
	            image_urls = self._find_image_urls(response.text, self._PAGE_IMAGE_PATTERNS)
	            post_data["images"] = self._build_image_records(image_urls)

	            post_data["post_info"] = {
	                "url": url,
	                "images_count": len(post_data["images"]),
	                "scraped_at": datetime.now().isoformat()
	            }

	        except Exception as e:
	            post_data["errors"] = [f"Failed to extract post data: {str(e)}"]

	        return post_data

	    def extract_profile_data(self, url):
	        """Extract data from Instagram profile.

	        Fetches the profile page, then reads profile info, the post list and
	        page-level images from it, and finally fetches up to three of the
	        listed posts to collect their images as well.

	        Returns:
	            A dict with ``profile_type``, ``images``, ``profile_info`` and
	            ``posts``; on failure an ``errors`` list is added.
	        """
	        profile_data = {
	            "profile_type": "account",
	            "images": [],
	            "profile_info": {},
	            "posts": []
	        }

	        try:
	            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
	            response.raise_for_status()

	            page_text = response.text
	            soup = BeautifulSoup(page_text, 'html.parser')

	            profile_data["profile_info"] = self.extract_profile_info(soup, page_text)
	            profile_data["posts"] = self.extract_recent_posts(page_text)
	            profile_data["images"] = self.extract_profile_images(page_text)

	            # Individual post pages are scanned for additional images.
	            if profile_data["posts"]:
	                post_images = self.extract_images_from_posts(profile_data["posts"], max_posts=3)
	                if post_images:
	                    profile_data["images"].extend(post_images)

	        except Exception as e:
	            profile_data["errors"] = [f"Failed to extract profile data: {str(e)}"]

	        return profile_data

	    def extract_profile_info(self, soup, page_text):
	        """Extract profile information.

	        Args:
	            soup: Parsed page exposing ``find('title')``; used only as a
	                fallback for the username. May be ``None`` to skip the fallback.
	            page_text: Raw page source searched for embedded JSON.

	        Returns:
	            A dict with ``username``, ``display_name``, ``bio``, ``followers``,
	            ``following`` and ``posts_count`` (empty strings when unknown), and
	            an ``error`` entry if extraction failed unexpectedly.
	        """
	        profile_info = {
	            "username": "",
	            "display_name": "",
	            "bio": "",
	            "followers": "",
	            "following": "",
	            "posts_count": ""
	        }

	        try:
	            user = self._find_user_object(page_text)
	            if user is not None:
	                profile_info["username"] = user.get("username") or ""
	                profile_info["display_name"] = user.get("full_name") or ""
	                profile_info["bio"] = user.get("biography") or ""
	                # Both the legacy and the "edge_*" count keys are accepted.
	                profile_info["followers"] = self._edge_count(
	                    user, "followed_by", "edge_followed_by")
	                profile_info["following"] = self._edge_count(
	                    user, "follows", "edge_follow")
	                profile_info["posts_count"] = self._edge_count(
	                    user, "media", "edge_owner_to_timeline_media")

	            # Fallback: take the parenthesised handle from the <title> text,
	            # dropping a leading "@" so it matches the JSON username format.
	            if not profile_info["username"] and soup is not None:
	                title_tag = soup.find('title')
	                if title_tag:
	                    handle = re.search(r'\(([^)]*)\)', title_tag.get_text())
	                    if handle:
	                        profile_info["username"] = handle.group(1).strip().lstrip('@')

	        except Exception as e:
	            profile_info["error"] = f"Failed to extract profile info: {str(e)}"

	        return profile_info

	    def extract_profile_images(self, page_text):
	        """Extract images from profile page.

	        Returns:
	            A list of image dicts (``src``, ``alt``, ``title``, ``width``,
	            ``height``) in first-seen order, without duplicates. On an
	            unexpected failure the error is shown via Streamlit and an empty
	            list is returned.
	        """
	        images = []

	        try:
	            image_urls = self._find_image_urls(page_text, self._PAGE_IMAGE_PATTERNS)
	            images = self._build_image_records(image_urls)
	        except Exception as e:
	            st.error(f"Failed to extract profile images: {str(e)}")

	        return images

	    def extract_recent_posts(self, page_text):
	        """Extract recent posts from profile.

	        Returns:
	            Up to ``MAX_LISTED_POSTS`` dicts with ``shortcode``, ``url`` and a
	            1-based ``index``, de-duplicated in first-seen order so that
	            repeated runs over the same page give the same list.
	        """
	        posts = []

	        try:
	            shortcodes = {}
	            for pattern in self._SHORTCODE_PATTERNS:
	                for code in pattern.findall(page_text):
	                    if code:
	                        shortcodes.setdefault(code, None)

	            selected = list(shortcodes)[:self.MAX_LISTED_POSTS]
	            posts = [
	                {
	                    "shortcode": code,
	                    "url": f"https://www.instagram.com/p/{code}/",
	                    "index": position,
	                }
	                for position, code in enumerate(selected, start=1)
	            ]

	        except Exception as e:
	            st.error(f"Failed to extract recent posts: {str(e)}")

	        return posts

	    def extract_images_from_posts(self, posts, max_posts=5):
	        """Extract images from individual posts.

	        Args:
	            posts: Post dicts as produced by ``extract_recent_posts``.
	            max_posts: Maximum number of posts to fetch.

	        Returns:
	            Image dicts from the fetched posts, each extended with
	            ``post_url`` and ``post_index``. A post that fails is reported via
	            a Streamlit warning and skipped.
	        """
	        all_images = []

	        try:
	            for index, post in enumerate(posts[:max_posts], start=1):
	                # Pause between requests (not after the last one) so the
	                # delay also applies after a failed post.
	                if index > 1:
	                    time.sleep(self.POST_FETCH_DELAY)

	                try:
	                    post_url = post["url"]
	                    response = self.session.get(post_url, timeout=self.REQUEST_TIMEOUT)
	                    response.raise_for_status()

	                    for img in self.extract_post_images(response.text):
	                        img["post_url"] = post_url
	                        img["post_index"] = index
	                        all_images.append(img)

	                except Exception as e:
	                    shortcode = post.get("shortcode", "?") if isinstance(post, dict) else "?"
	                    st.warning(f"Failed to extract images from post {shortcode}: {str(e)}")
	                    continue

	        except Exception as e:
	            st.error(f"Failed to extract images from posts: {str(e)}")

	        return all_images

	    def extract_post_images(self, page_text):
	        """Extract images from a single post page.

	        Returns:
	            A list of image dicts (``src``, ``alt``, ``title``, ``width``,
	            ``height``) in first-seen order, without duplicates. On an
	            unexpected failure the error is shown via Streamlit and an empty
	            list is returned.
	        """
	        images = []

	        try:
	            image_urls = self._find_image_urls(page_text, self._POST_IMAGE_PATTERNS)
	            images = self._build_image_records(image_urls)
	        except Exception as e:
	            st.error(f"Failed to extract post images: {str(e)}")

	        return images

	# Global Instagram scraper instance, constructed once when this module is
	# imported (which runs InstagramScraper.__init__ at import time).
	instagram_scraper = InstagramScraper()