import streamlit as st
import requests
import json
import re
import time
import random
from datetime import datetime


class InstagramScraperV2:
    """Best-effort Instagram page scraper with basic anti-detection measures.

    All extraction is regex-based over the raw HTML/JSON blob Instagram
    serves, so every helper is defensive: failures are recorded in the
    result dict or surfaced via Streamlit widgets rather than raised to
    the caller.
    """

    def __init__(self):
        self.session = requests.Session()
        self.setup_session()

    def setup_session(self):
        """Setup session with better anti-detection measures"""
        # Rotate between a few common desktop user agents so repeated runs
        # do not present an identical fingerprint.
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        ]
        self.session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_page_with_retry(self, url, max_retries=3):
        """Get page with retry mechanism.

        Sleeps a random 2-4 s before each attempt to look less bot-like.
        Returns the response body as text on success; re-raises the last
        network error after the final failed attempt.
        """
        for attempt in range(max_retries):
            try:
                time.sleep(random.uniform(2, 4))
                response = self.session.get(url, timeout=20)
                response.raise_for_status()
                return response.text
            # Narrowed from bare Exception: requests.RequestException covers
            # connection errors, timeouts, and raise_for_status' HTTPError.
            except requests.RequestException as e:
                st.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    raise
        return None  # unreachable (last attempt re-raises); kept defensively

    def extract_instagram_data(self, url):
        """Extract data from Instagram with improved error handling.

        Never raises: any error is appended to the returned dict's
        ``errors`` list so callers always receive a well-formed result.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": [],
        }
        try:
            page_text = self.get_page_with_retry(url)
            if not page_text:
                scraped_data["errors"].append("Failed to load Instagram page")
                return scraped_data

            # Extract images
            scraped_data["images"] = self.extract_images_from_page(page_text)
            # Extract profile info
            scraped_data["profile_info"] = self.extract_profile_info(page_text)
            # Extract posts
            scraped_data["posts"] = self.extract_recent_posts(page_text)
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")
        return scraped_data

    def extract_images_from_page(self, page_text):
        """Extract images with improved patterns.

        Returns a list of dicts with ``src``/``alt``/``title`` keys;
        failures are reported via st.error and yield an empty list.
        """
        images = []
        try:
            # CDN URLs appear both as bare links and inside JSON fields.
            patterns = [
                r'https://scontent[^"]*\.jpg[^"]*',
                r'https://scontent[^"]*\.jpeg[^"]*',
                r'https://scontent[^"]*\.png[^"]*',
                r'"display_url":"([^"]+)"',
                r'"display_src":"([^"]+)"',
            ]
            found_images = set()
            for pattern in patterns:
                for match in re.findall(pattern, page_text):
                    if match and ('scontent' in match.lower() or 'instagram' in match.lower()):
                        # Undo JSON escaping of '&' and '/'.
                        clean_url = match.replace('\\u0026', '&').replace('\\/', '/')
                        found_images.add(clean_url)

            # sorted() for deterministic output — set iteration order varies.
            for i, img_url in enumerate(sorted(found_images)):
                images.append({
                    "src": img_url,
                    "alt": f"Instagram image {i+1}",
                    "title": f"Instagram image {i+1}",
                    "width": "",
                    "height": "",
                })
        except Exception as e:
            st.error(f"Failed to extract images: {str(e)}")
        return images

    def extract_profile_info(self, page_text):
        """Extract profile information (username, display name, bio)."""
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": "",
        }
        try:
            # Extract username from title.
            # BUG FIX: the previous pattern r'([^<]+)' matched the first run
            # of non-'<' characters anywhere in the page; anchor it to the
            # actual <title> element instead.
            title_match = re.search(r'<title>([^<]+)</title>', page_text)
            if title_match:
                title = title_match.group(1)
                # Instagram titles look like "Name (@username) • Instagram".
                if '(' in title and ')' in title:
                    username = title.split('(')[1].split(')')[0]
                    profile_info["username"] = username

            # Look for embedded JSON data; these values override the title.
            json_patterns = [
                r'"username":"([^"]+)"',
                r'"full_name":"([^"]+)"',
                r'"biography":"([^"]+)"',
            ]
            for pattern in json_patterns:
                matches = re.findall(pattern, page_text)
                if matches:
                    if "username" in pattern:
                        profile_info["username"] = matches[0]
                    elif "full_name" in pattern:
                        profile_info["display_name"] = matches[0]
                    elif "biography" in pattern:
                        profile_info["bio"] = matches[0]
        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"
        return profile_info

    def extract_recent_posts(self, page_text):
        """Extract recent posts: up to 10 shortcodes with canonical URLs."""
        posts = []
        try:
            post_patterns = [
                r'"shortcode":"([^"]+)"',
                r'/p/([^/"]+)',
            ]
            found_posts = set()
            for pattern in post_patterns:
                for match in re.findall(pattern, page_text):
                    if match:
                        found_posts.add(match)

            # sorted() for deterministic output, then cap at 10 posts.
            for i, post_code in enumerate(sorted(found_posts)[:10]):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": i + 1,
                })
        except Exception as e:
            st.error(f"Failed to extract posts: {str(e)}")
        return posts


# Global instance
instagram_scraper_v2 = InstagramScraperV2()