# Hugging Face Spaces page header (scrape artifact) — Space status: Sleeping
| import streamlit as st | |
| import requests | |
| import json | |
| import re | |
| import time | |
| import random | |
| from datetime import datetime | |
class InstagramScraperV2:
    """Best-effort scraper for public Instagram pages.

    Fetches a page with browser-like headers and retry/jitter, then uses
    regex patterns over the raw HTML to pull out image URLs, basic profile
    fields and recent post shortcodes. Extraction failures are collected
    (or surfaced via Streamlit warnings) rather than raised where possible.
    """

    def __init__(self):
        # One persistent session so cookies and headers carry across requests.
        self.session = requests.Session()
        self.setup_session()

    def setup_session(self):
        """Install browser-like headers with a randomly chosen desktop UA."""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        self.session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

    def get_page_with_retry(self, url, max_retries=3):
        """Fetch *url*, retrying up to *max_retries* times.

        Sleeps a random 2-4 s before every attempt to look less bot-like.
        Returns the response body as text. Re-raises the last error after
        the final failed attempt; returns None only when max_retries <= 0.
        """
        for attempt in range(max_retries):
            try:
                # Random delay between attempts to reduce rate-limit/block risk.
                time.sleep(random.uniform(2, 4))
                response = self.session.get(url, timeout=20)
                response.raise_for_status()
                return response.text
            except Exception as e:
                st.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    raise
        return None

    @staticmethod
    def _clean_escaped(value):
        """Undo the JSON escaping Instagram embeds in its HTML payloads.

        Instagram inlines JSON where '/' appears as '\\/' and '&' as
        '\\u0026'; this normalizes such values back to plain text/URLs.
        """
        return value.replace('\\u0026', '&').replace('\\/', '/')

    def extract_instagram_data(self, url):
        """Scrape *url* and return a result dict.

        The dict always contains: url, timestamp, platform, images, posts,
        profile_info and errors. Failures are appended to "errors" instead
        of raising, so callers always get a well-formed result.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }
        try:
            page_text = self.get_page_with_retry(url)
            if not page_text:
                scraped_data["errors"].append("Failed to load Instagram page")
                return scraped_data
            scraped_data["images"] = self.extract_images_from_page(page_text)
            scraped_data["profile_info"] = self.extract_profile_info(page_text)
            scraped_data["posts"] = self.extract_recent_posts(page_text)
        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")
        return scraped_data

    def extract_images_from_page(self, page_text):
        """Return a list of image dicts (src/alt/title/width/height) found in the HTML.

        URLs are deduplicated and sorted so the output order is deterministic
        (iterating a bare set would vary between runs).
        """
        images = []
        try:
            # Both raw scontent CDN URLs and JSON-embedded display URLs.
            patterns = [
                r'https://scontent[^"]*\.jpg[^"]*',
                r'https://scontent[^"]*\.jpeg[^"]*',
                r'https://scontent[^"]*\.png[^"]*',
                r'"display_url":"([^"]+)"',
                r'"display_src":"([^"]+)"'
            ]
            found_images = set()
            for pattern in patterns:
                for match in re.findall(pattern, page_text):
                    # Keep only URLs that plausibly belong to Instagram's CDN.
                    if match and ('scontent' in match.lower() or 'instagram' in match.lower()):
                        found_images.add(self._clean_escaped(match))
            for i, img_url in enumerate(sorted(found_images)):
                images.append({
                    "src": img_url,
                    "alt": f"Instagram image {i+1}",
                    "title": f"Instagram image {i+1}",
                    "width": "",
                    "height": ""
                })
        except Exception as e:
            st.error(f"Failed to extract images: {str(e)}")
        return images

    def extract_profile_info(self, page_text):
        """Extract username, display name, bio and (when present) counts.

        Falls back from the <title> tag to inlined JSON fields; on failure
        an "error" key is added instead of raising.
        """
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }
        try:
            # Instagram titles look like "Name (@user) • Instagram".
            title_match = re.search(r'<title>([^<]+)</title>', page_text)
            if title_match:
                title = title_match.group(1)
                if '(' in title and ')' in title:
                    username = title.split('(')[1].split(')')[0]
                    profile_info["username"] = username
            # Inlined JSON fields override the title-derived value when found.
            json_patterns = [
                r'"username":"([^"]+)"',
                r'"full_name":"([^"]+)"',
                r'"biography":"([^"]+)"'
            ]
            for pattern in json_patterns:
                matches = re.findall(pattern, page_text)
                if matches:
                    # Normalize JSON escapes, same as the image URLs.
                    value = self._clean_escaped(matches[0])
                    if "username" in pattern:
                        profile_info["username"] = value
                    elif "full_name" in pattern:
                        profile_info["display_name"] = value
                    elif "biography" in pattern:
                        profile_info["bio"] = value
        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"
        return profile_info

    def extract_recent_posts(self, page_text):
        """Return up to 10 recent post dicts (shortcode, url, 1-based index).

        Shortcodes are deduplicated and sorted for deterministic output.
        """
        posts = []
        try:
            # Shortcodes appear both as JSON fields and inside /p/<code>/ links.
            post_patterns = [
                r'"shortcode":"([^"]+)"',
                r'/p/([^/"]+)'
            ]
            found_posts = set()
            for pattern in post_patterns:
                for match in re.findall(pattern, page_text):
                    if match:
                        found_posts.add(match)
            for i, post_code in enumerate(sorted(found_posts)[:10]):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": i + 1
                })
        except Exception as e:
            st.error(f"Failed to extract posts: {str(e)}")
        return posts
# Module-level singleton used by the rest of the app.
# NOTE(review): constructed at import time, so a requests.Session is created
# (and headers randomized) as soon as this module is imported — confirm that
# eager initialization is intended for the Streamlit app lifecycle.
instagram_scraper_v2 = InstagramScraperV2()