# Hugging Face Spaces app — Instagram brand-following scraper.
# (The "Spaces: Sleeping" status banner captured by the scrape was noise
# from the Space's status page, not part of the program.)
| import gradio as gr | |
| import pandas as pd | |
| from datetime import datetime | |
| import time | |
| import requests | |
| import json | |
| import re | |
# ---------------- Fixed IG_LOGIN ----------------
def IG_LOGIN(username, password):
    """Log in to Instagram through the web AJAX endpoint.

    Always performs a fresh login (a persisted cookie.json cannot be relied
    on in Spaces).

    Args:
        username: Instagram account username.
        password: Instagram account password (sent in the browser's
            "#PWD_INSTAGRAM_BROWSER" envelope, version 0 = plaintext+timestamp).

    Returns:
        dict of session cookies on successful authentication, or None on
        failure (missing CSRF token, non-JSON challenge response, or
        credentials rejected).
    """
    link = 'https://www.instagram.com/accounts/login/'
    login_url = 'https://www.instagram.com/accounts/login/ajax/'
    time_stamp = int(datetime.now().timestamp())
    session = requests.Session()
    response = session.get(link)
    csrf = response.cookies.get('csrftoken')
    if not csrf:
        # Without a CSRF token the login POST is guaranteed to be rejected;
        # bail out instead of sending a doomed request.
        return None
    payload = {
        'username': username,
        'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{time_stamp}:{password}',
        # Send an explicit JSON string rather than relying on requests'
        # str() coercion of a dict when form-encoding.
        'queryParams': json.dumps({}),
        'optIntoOneTap': 'false'
    }
    login_header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "Referer": "https://www.instagram.com/accounts/login/",
        "x-csrftoken": csrf
    }
    login_response = session.post(login_url, data=payload, headers=login_header)
    try:
        json_data = login_response.json()
    except ValueError:
        # Instagram sometimes answers with an HTML challenge page instead of
        # JSON — treat that as a failed login rather than crashing.
        return None
    if json_data.get("authenticated"):
        return session.cookies.get_dict()
    else:
        return None
def get_user_id(username, cookie_jar):
    """Get user ID using the web graphql endpoint"""
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', '')
    }
    # Method 1: ask the profile-info endpoint for this username directly.
    try:
        profile_url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
        resp = requests.get(profile_url, headers=request_headers, cookies=cookie_jar)
        if resp.status_code == 200:
            return resp.json()['data']['user']['id']
    except Exception as e:
        print(f"Method 1 failed: {e}")
    # Method 2: fall back to top-search and pick the exact username match.
    try:
        search_url = f"https://www.instagram.com/web/search/topsearch/?query={username}"
        resp = requests.get(search_url, headers=request_headers, cookies=cookie_jar)
        if resp.status_code == 200:
            for entry in resp.json().get('users', []):
                if entry['user']['username'].lower() == username.lower():
                    return entry['user']['pk']
    except Exception as e:
        print(f"Method 2 failed: {e}")
    # Neither lookup produced an ID.
    return None
def get_user_details(username, cookie_jar):
    """Get detailed user information including bio and follower count"""
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', '')
    }
    try:
        profile_url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
        resp = requests.get(profile_url, headers=request_headers, cookies=cookie_jar)
        if resp.status_code == 200:
            info = resp.json()['data']['user']
            # Flatten the nested edge_* counters into a simple dict.
            return {
                'bio': info.get('biography', ''),
                'follower_count': info.get('edge_followed_by', {}).get('count', 0),
                'following_count': info.get('edge_follow', {}).get('count', 0),
                'post_count': info.get('edge_owner_to_timeline_media', {}).get('count', 0),
                'is_business': info.get('is_business_account', False),
                'category': info.get('category_name', ''),
                'external_url': info.get('external_url', '')
            }
    except Exception as e:
        print(f"Error fetching details for {username}: {e}")
    # Non-200 response or any failure above falls through to None.
    return None
def is_brand_page(bio, follower_count, following_count, post_count, min_followers=1000, max_following_ratio=2.0):
    """
    Determine if an account is a brand page based on bio content and metrics.

    Args:
        bio: Account biography text (may be None — treated as empty).
        follower_count: Number of followers
        following_count: Number of accounts following
        post_count: Number of posts (currently unused; kept for interface stability)
        min_followers: Minimum follower count to be considered a brand
        max_following_ratio: Maximum ratio of following/followers

    Returns:
        tuple: (is_brand, reason)
    """
    # Brand keywords in bio
    brand_keywords = [
        # Business indicators
        r'\b(shop|store|brand|official|business|company|boutique)\b',
        r'\b(buy|purchase|order|sale|discount|promo|offer)\b',
        # Contact/shipping
        r'\b(shipping|delivery|worldwide|dm\s+to\s+order|whatsapp)\b',
        r'\b(contact|inquiries|orders|email)\b',
        # Product types
        r'\b(cosmetics|beauty|skincare|makeup|fashion|clothing|jewelry|accessories)\b',
        r'\b(organic|natural|handmade|luxury|premium)\b',
        # Business patterns
        r'📧|📩|📞|☎️|🛒|🛍️|💳',  # Business emojis
        r'(www\.|\.com|\.co|\.pk)',  # Website URLs
        r'(est\.|established|since)\s+\d{4}',  # Establishment year
        r'(dm|message)\s+(us|for)',  # DM for orders
    ]
    # Influencer indicators (to exclude)
    influencer_keywords = [
        r'\b(influencer|blogger|vlogger|content\s+creator|lifestyle)\b',
        r'\b(personal|my\s+life|journey|traveler)\b',
        r'\b(fitness\s+journey|transformation|inspiration)\b',
    ]
    # Instagram's `biography` field can come back as None — coerce to '' so
    # .lower() cannot raise AttributeError.
    bio_lower = (bio or '').lower()
    # Check for influencer indicators first — they veto the brand label.
    for pattern in influencer_keywords:
        if re.search(pattern, bio_lower):
            return False, "Appears to be influencer (bio keywords)"
    # Check follower count
    if follower_count < min_followers:
        return False, f"Follower count too low ({follower_count} < {min_followers})"
    # Check following ratio (brands typically don't follow many accounts)
    if follower_count > 0:
        following_ratio = following_count / follower_count
        if following_ratio > max_following_ratio:
            return False, f"Following ratio too high ({following_ratio:.2f} > {max_following_ratio})"
    # Check for brand indicators in bio; each matching pattern adds one point.
    brand_score = 0
    matched_patterns = []
    for pattern in brand_keywords:
        if re.search(pattern, bio_lower):
            brand_score += 1
            matched_patterns.append(pattern)
    # Need at least 2 brand indicators
    if brand_score >= 2:
        return True, f"Brand indicators found: {brand_score}"
    # Additional check: very low following relative to followers (typical for brands)
    if follower_count > 5000 and following_count < 500:
        return True, "High follower/low following ratio (brand pattern)"
    return False, "Insufficient brand indicators"
def scrap_brand_following(username, password, target, min_followers=1000, max_following_ratio=2.0,
                          max_accounts_to_check=50, delay=2):
    """
    Scrape following accounts and filter for brand pages.

    Args:
        username: Your Instagram username
        password: Your Instagram password
        target: Target account to scrape following from
        min_followers: Minimum followers for brand consideration
        max_following_ratio: Maximum following/followers ratio
        max_accounts_to_check: Maximum accounts to check (to avoid rate limits)
        delay: Delay between requests in seconds

    Returns:
        pandas.DataFrame of matched brand pages (also saved as a CSV),
        None when no account matched, or an error-message string on failure.
    """
    # Login
    cookie_jar = IG_LOGIN(username, password)
    if not cookie_jar:
        return "Login failed"
    # Get target user ID
    user_id = get_user_id(target, cookie_jar)
    if not user_id:
        return f"Could not find user ID for {target}"
    print(f"Found user ID: {user_id}")
    print("Fetching following list...\n")
    # Fetch the (first page of the) following list in one request.
    base_url = f"https://www.instagram.com/api/v1/friendships/{user_id}/following/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', ''),
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = {
        'count': max_accounts_to_check,
        'search_surface': 'following_list_page'
    }
    try:
        response = requests.get(base_url, headers=headers, cookies=cookie_jar, params=params)
        if response.status_code != 200:
            print(f"Error: Status code {response.status_code}")
            print(f"Response: {response.text[:500]}")
            return f"Failed to fetch following: Status {response.status_code}"
        user_info = response.json()
        users = user_info.get('users', [])
        if not users:
            return "No following found or account is private"
        print(f"Found {len(users)} accounts. Analyzing for brand pages...\n")
        # Filter for brand pages
        brand_accounts = []
        for idx, user in enumerate(users, 1):
            username_to_check = user.get('username', '')
            print(f"[{idx}/{len(users)}] Checking: {username_to_check}...", end=" ")
            # Get detailed user info (second request per account)
            details = get_user_details(username_to_check, cookie_jar)
            if details:
                is_brand, reason = is_brand_page(
                    details['bio'],
                    details['follower_count'],
                    details['following_count'],
                    details['post_count'],
                    min_followers,
                    max_following_ratio
                )
                if is_brand:
                    print(f"✓ BRAND ({reason})")
                    brand_accounts.append({
                        'User ID': user.get('pk', ''),
                        'Username': username_to_check,
                        'Full Name': user.get('full_name', ''),
                        # Truncate long bios so the table stays readable.
                        'Bio': details['bio'][:100] + '...' if len(details['bio']) > 100 else details['bio'],
                        'Followers': details['follower_count'],
                        'Following': details['following_count'],
                        'Posts': details['post_count'],
                        'Category': details['category'],
                        'External URL': details['external_url'],
                        'Is Business': details['is_business'],
                        'Is Private': user.get('is_private', False),
                        'Is Verified': user.get('is_verified', False),
                        'Profile Pic': user.get('profile_pic_url', ''),
                        'Brand Reason': reason
                    })
                else:
                    print(f"✗ Not brand ({reason})")
            else:
                print("✗ Failed to fetch details")
            # Rate limiting between per-account detail requests.
            time.sleep(delay)
        if not brand_accounts:
            print("\nNo brand pages found matching criteria.")
            return None
        # Create DataFrame
        df = pd.DataFrame(brand_accounts)
        # Save to CSV
        filename = f"{target}_brand_following_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        # BUG FIX: previously printed the literal "(unknown)" instead of the
        # CSV filename the results were saved to.
        print(f"\n✓ Saved {len(df)} brand pages to {filename}")
        return df
    except Exception as e:
        print(f"\nError scraping following: {e}")
        return f"Failed: {str(e)}"
# ---------------- Gradio wrapper ----------------
def gradio_scrap_brand_following(username, password, target, min_followers, max_following_ratio, max_accounts_to_check, delay):
    """Adapter between the Gradio form and scrap_brand_following.

    Coerces the numeric widget values to the scraper's expected types and
    always returns a DataFrame so the gr.Dataframe output can render it:
    error strings and exceptions become a one-row table with an "Error" column.
    """
    try:
        scraped = scrap_brand_following(
            username=username,
            password=password,
            target=target,
            min_followers=int(min_followers),
            max_following_ratio=float(max_following_ratio),
            max_accounts_to_check=int(max_accounts_to_check),
            delay=float(delay)
        )
        if isinstance(scraped, pd.DataFrame):
            return scraped
        # Non-DataFrame results are status/error strings from the scraper.
        return pd.DataFrame([{"Error": scraped}])
    except Exception as exc:
        return pd.DataFrame([{"Error": str(exc)}])
# ---------------- Gradio UI ----------------
# Top-level script block: builds the Blocks layout and launches the app.
with gr.Blocks() as demo:
    gr.Markdown("## Instagram Brand Following Scraper")
    # Row 1: credentials and the account whose following list is scraped.
    with gr.Row():
        username = gr.Textbox(label="Your Instagram Username")
        password = gr.Textbox(label="Your Instagram Password", type="password")
        target = gr.Textbox(label="Target Account Username")
    # Row 2: filter thresholds, mirroring scrap_brand_following's parameters.
    with gr.Row():
        min_followers = gr.Number(value=1000, label="Minimum Followers")
        max_following_ratio = gr.Number(value=2.0, label="Max Following/Followers Ratio")
        max_accounts_to_check = gr.Number(value=30, label="Max Accounts to Check")
        delay = gr.Number(value=2, label="Delay between requests (seconds)")
    # Result table; headers match the dict keys built in scrap_brand_following.
    output = gr.Dataframe(headers=["User ID","Username","Full Name","Bio","Followers","Following","Posts","Category",
                                   "External URL","Is Business","Is Private","Is Verified","Profile Pic","Brand Reason"],
                          type="pandas")
    submit = gr.Button("Scrape Brand Accounts")
    submit.click(fn=gradio_scrap_brand_following,
                 inputs=[username, password, target, min_followers, max_following_ratio, max_accounts_to_check, delay],
                 outputs=output)
# share=True creates a public gradio.live link in addition to the local server.
demo.launch(share=True)