# Hugging Face Spaces page header (scrape artifact) — Space status: Sleeping
| import streamlit as st | |
| import requests | |
| import json | |
| import re | |
| import time | |
| import random | |
| from datetime import datetime | |
class InstagramScraperV2:
    """Best-effort scraper for public Instagram pages.

    Fetches a page with browser-like headers and retry/jitter, then uses
    regex patterns over the raw HTML to pull out image URLs, basic profile
    fields and recent post shortcodes. Extraction failures are collected
    (or surfaced via Streamlit warnings) rather than raised where possible.
    """

    def __init__(self):
        # One persistent session so cookies and headers carry across requests.
        self.session = requests.Session()
        self.setup_session()

    def setup_session(self):
        """Install browser-like headers with a randomly chosen desktop UA."""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        self.session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

    def get_page_with_retry(self, url, max_retries=3):
        """Fetch *url*, retrying up to *max_retries* times.

        Sleeps a random 2-4 s before every attempt to look less bot-like.
        Returns the response body as text. Re-raises the last error after
        the final failed attempt; returns None only when max_retries <= 0.
        """
        for attempt in range(max_retries):
            try:
                # Random delay between attempts to reduce rate-limit/block risk.
                time.sleep(random.uniform(2, 4))
                response = self.session.get(url, timeout=20)
                response.raise_for_status()
                return response.text
            except Exception as e:
                st.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    raise
        return None

    @staticmethod
    def _clean_escaped(value):
        """Undo the JSON escaping Instagram embeds in its HTML payloads.

        Instagram inlines JSON where '/' appears as '\\/' and '&' as
        '\\u0026'; this normalizes such values back to plain text/URLs.
        """
        return value.replace('\\u0026', '&').replace('\\/', '/')

    def extract_instagram_data(self, url):
        """Scrape *url* and return a result dict.

        The dict always contains: url, timestamp, platform, images, posts,
        profile_info and errors. Failures are appended to "errors" instead
        of raising, so callers always get a well-formed result.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }
        try:
            page_text = self.get_page_with_retry(url)
            if not page_text:
                scraped_data["errors"].append("Failed to load Instagram page")
                return scraped_data
            scraped_data["images"] = self.extract_images_from_page(page_text)
            scraped_data["profile_info"] = self.extract_profile_info(page_text)
            scraped_data["posts"] = self.extract_recent_posts(page_text)
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")
        return scraped_data

    def extract_images_from_page(self, page_text):
        """Return a list of image dicts (src/alt/title/width/height) found in the HTML.

        URLs are deduplicated and sorted so the output order is deterministic
        (iterating a bare set would vary between runs).
        """
        images = []
        try:
            # Both raw scontent CDN URLs and JSON-embedded display URLs.
            patterns = [
                r'https://scontent[^"]*\.jpg[^"]*',
                r'https://scontent[^"]*\.jpeg[^"]*',
                r'https://scontent[^"]*\.png[^"]*',
                r'"display_url":"([^"]+)"',
                r'"display_src":"([^"]+)"'
            ]
            found_images = set()
            for pattern in patterns:
                for match in re.findall(pattern, page_text):
                    # Keep only URLs that plausibly belong to Instagram's CDN.
                    if match and ('scontent' in match.lower() or 'instagram' in match.lower()):
                        found_images.add(self._clean_escaped(match))
            for i, img_url in enumerate(sorted(found_images)):
                images.append({
                    "src": img_url,
                    "alt": f"Instagram image {i+1}",
                    "title": f"Instagram image {i+1}",
                    "width": "",
                    "height": ""
                })
        except Exception as e:
            st.error(f"Failed to extract images: {str(e)}")
        return images

    def extract_profile_info(self, page_text):
        """Extract username, display name, bio and (when present) counts.

        Falls back from the <title> tag to inlined JSON fields; on failure
        an "error" key is added instead of raising.
        """
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }
        try:
            # Instagram titles look like "Name (@user) • Instagram".
            title_match = re.search(r'<title>([^<]+)</title>', page_text)
            if title_match:
                title = title_match.group(1)
                if '(' in title and ')' in title:
                    username = title.split('(')[1].split(')')[0]
                    profile_info["username"] = username
            # Inlined JSON fields override the title-derived value when found.
            json_patterns = [
                r'"username":"([^"]+)"',
                r'"full_name":"([^"]+)"',
                r'"biography":"([^"]+)"'
            ]
            for pattern in json_patterns:
                matches = re.findall(pattern, page_text)
                if matches:
                    # Normalize JSON escapes, same as the image URLs.
                    value = self._clean_escaped(matches[0])
                    if "username" in pattern:
                        profile_info["username"] = value
                    elif "full_name" in pattern:
                        profile_info["display_name"] = value
                    elif "biography" in pattern:
                        profile_info["bio"] = value
        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"
        return profile_info

    def extract_recent_posts(self, page_text):
        """Return up to 10 recent post dicts (shortcode, url, 1-based index).

        Shortcodes are deduplicated and sorted for deterministic output.
        """
        posts = []
        try:
            # Shortcodes appear both as JSON fields and inside /p/<code>/ links.
            post_patterns = [
                r'"shortcode":"([^"]+)"',
                r'/p/([^/"]+)'
            ]
            found_posts = set()
            for pattern in post_patterns:
                for match in re.findall(pattern, page_text):
                    if match:
                        found_posts.add(match)
            for i, post_code in enumerate(sorted(found_posts)[:10]):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": i + 1
                })
        except Exception as e:
            st.error(f"Failed to extract posts: {str(e)}")
        return posts
# Module-level singleton used by the rest of the app.
# NOTE(review): constructed at import time, so a requests.Session is created
# (and headers randomized) as soon as this module is imported — confirm that
# eager initialization is intended for the Streamlit app lifecycle.
instagram_scraper_v2 = InstagramScraperV2()