# Scrape-Anythings / instagram_scraper_v2.py
# Uploaded by PHOROTHA913 ("Upload 9 files", commit 5c3dc0d, verified).
import streamlit as st
import requests
import json
import re
import time
import random
from datetime import datetime
class InstagramScraperV2:
    """Best-effort, regex-based scraper for public Instagram pages.

    Fetches page HTML through a ``requests.Session`` configured with
    browser-like headers, then extracts image URLs, profile metadata and
    recent post shortcodes with regular expressions (no official API).
    Results may break whenever Instagram changes its markup; extraction
    errors are collected/reported rather than raised where possible.
    """

    def __init__(self):
        # One shared session so headers and cookies persist across requests.
        self.session = requests.Session()
        self.setup_session()

    def setup_session(self):
        """Configure the session with browser-like headers (anti-detection)."""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        self.session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

    def get_page_with_retry(self, url, max_retries=3):
        """Fetch *url* and return the response text, retrying on failure.

        Sleeps a random 2-4 seconds before every attempt (rate limiting /
        anti-bot). Each failed attempt is surfaced as a Streamlit warning;
        if all ``max_retries`` attempts fail, the last exception is raised.
        """
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                time.sleep(random.uniform(2, 4))
                response = self.session.get(url, timeout=20)
                response.raise_for_status()
                return response.text
            except Exception as e:
                last_error = e
                st.warning(f"Attempt {attempt} failed: {str(e)}")
        # Every attempt failed; propagate the final error to the caller.
        # (The original ended with an unreachable `return None` here.)
        raise last_error

    def extract_instagram_data(self, url):
        """Scrape *url* and return a result dict.

        The dict always contains ``url``, ``timestamp``, ``platform``,
        ``images``, ``posts``, ``profile_info`` and ``errors``; failures
        are appended to ``errors`` instead of being raised.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }
        try:
            page_text = self.get_page_with_retry(url)
            if not page_text:
                scraped_data["errors"].append("Failed to load Instagram page")
                return scraped_data
            scraped_data["images"] = self.extract_images_from_page(page_text)
            scraped_data["profile_info"] = self.extract_profile_info(page_text)
            scraped_data["posts"] = self.extract_recent_posts(page_text)
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")
        return scraped_data

    def extract_images_from_page(self, page_text):
        """Return a list of image dicts found in the page HTML.

        Matches Instagram CDN (`scontent`) URLs directly as well as the
        ``display_url``/``display_src`` JSON fields. URLs are de-duplicated
        while preserving first-seen order (the original used a ``set``,
        which made the output order nondeterministic between runs).
        """
        images = []
        try:
            patterns = [
                r'https://scontent[^"]*\.jpg[^"]*',
                r'https://scontent[^"]*\.jpeg[^"]*',
                r'https://scontent[^"]*\.png[^"]*',
                r'"display_url":"([^"]+)"',
                r'"display_src":"([^"]+)"'
            ]
            # dict used as an insertion-ordered set for deterministic output.
            found_images = {}
            for pattern in patterns:
                for match in re.findall(pattern, page_text):
                    if match and ('scontent' in match.lower() or 'instagram' in match.lower()):
                        # Undo JSON string escaping: \u0026 -> &, \/ -> /.
                        clean_url = match.replace('\\u0026', '&').replace('\\/', '/')
                        found_images[clean_url] = None
            for i, img_url in enumerate(found_images):
                images.append({
                    "src": img_url,
                    "alt": f"Instagram image {i+1}",
                    "title": f"Instagram image {i+1}",
                    "width": "",
                    "height": ""
                })
        except Exception as e:
            st.error(f"Failed to extract images: {str(e)}")
        return images

    def extract_profile_info(self, page_text):
        """Extract username, display name and bio from the page HTML.

        Falls back to parsing the ``<title>`` tag for the username; the
        embedded JSON fields (``username``/``full_name``/``biography``)
        take precedence when present. ``followers``/``following``/
        ``posts_count`` are initialized but not populated here.
        """
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }
        try:
            # The page title often looks like "Name (@user) ..."; pull the
            # parenthesized part as a first guess at the username.
            title_match = re.search(r'<title>([^<]+)</title>', page_text)
            if title_match:
                title = title_match.group(1)
                if '(' in title and ')' in title:
                    profile_info["username"] = title.split('(')[1].split(')')[0]
            # Explicit field -> regex map (the original dispatched on
            # substring tests against the pattern text itself).
            field_patterns = {
                "username": r'"username":"([^"]+)"',
                "display_name": r'"full_name":"([^"]+)"',
                "bio": r'"biography":"([^"]+)"'
            }
            for field, pattern in field_patterns.items():
                matches = re.findall(pattern, page_text)
                if matches:
                    profile_info[field] = matches[0]
        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"
        return profile_info

    def extract_recent_posts(self, page_text):
        """Return up to 10 post dicts (shortcode, canonical URL, 1-based index).

        Shortcodes are taken from embedded JSON (``"shortcode":...``) and
        from ``/p/<code>`` links, de-duplicated in first-seen order (the
        original used a ``set``, which made order nondeterministic).
        """
        posts = []
        try:
            post_patterns = [
                r'"shortcode":"([^"]+)"',
                r'/p/([^/"]+)'
            ]
            # dict used as an insertion-ordered set for deterministic output.
            found_posts = {}
            for pattern in post_patterns:
                for match in re.findall(pattern, page_text):
                    if match:
                        found_posts[match] = None
            for i, post_code in enumerate(list(found_posts)[:10]):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": i + 1
                })
        except Exception as e:
            st.error(f"Failed to extract posts: {str(e)}")
        return posts
# Module-level singleton shared by importers of this module. Constructing it
# at import time creates a requests.Session and sets its headers (no network
# request is made until a scrape method is called).
instagram_scraper_v2 = InstagramScraperV2()