# Hugging Face Spaces app — Instagram brand-following scraper.
# (The "Spaces: Sleeping" status banner captured by the scrape was noise
# from the Space's status page, not part of the program.)
| import gradio as gr | |
| import pandas as pd | |
| from datetime import datetime | |
| import time | |
| import requests | |
| import json | |
| import re | |
# ---------------- Fixed IG_LOGIN ----------------
def IG_LOGIN(username, password):
    """Log in to Instagram through the web AJAX endpoint.

    Always performs a fresh login (a persisted cookie.json cannot be relied
    on in Spaces).

    Args:
        username: Instagram account username.
        password: Instagram account password (sent in the browser's
            "#PWD_INSTAGRAM_BROWSER" envelope, version 0 = plaintext+timestamp).

    Returns:
        dict of session cookies on successful authentication, or None on
        failure (missing CSRF token, non-JSON challenge response, or
        credentials rejected).
    """
    link = 'https://www.instagram.com/accounts/login/'
    login_url = 'https://www.instagram.com/accounts/login/ajax/'
    time_stamp = int(datetime.now().timestamp())
    session = requests.Session()
    response = session.get(link)
    csrf = response.cookies.get('csrftoken')
    if not csrf:
        # Without a CSRF token the login POST is guaranteed to be rejected;
        # bail out instead of sending a doomed request.
        return None
    payload = {
        'username': username,
        'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{time_stamp}:{password}',
        # Send an explicit JSON string rather than relying on requests'
        # str() coercion of a dict when form-encoding.
        'queryParams': json.dumps({}),
        'optIntoOneTap': 'false'
    }
    login_header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "Referer": "https://www.instagram.com/accounts/login/",
        "x-csrftoken": csrf
    }
    login_response = session.post(login_url, data=payload, headers=login_header)
    try:
        json_data = login_response.json()
    except ValueError:
        # Instagram sometimes answers with an HTML challenge page instead of
        # JSON — treat that as a failed login rather than crashing.
        return None
    if json_data.get("authenticated"):
        return session.cookies.get_dict()
    else:
        return None
def get_user_id(username, cookie_jar):
    """Get user ID using the web graphql endpoint"""
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', '')
    }
    # Method 1: ask the profile-info endpoint for this username directly.
    try:
        profile_url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
        resp = requests.get(profile_url, headers=request_headers, cookies=cookie_jar)
        if resp.status_code == 200:
            return resp.json()['data']['user']['id']
    except Exception as e:
        print(f"Method 1 failed: {e}")
    # Method 2: fall back to top-search and pick the exact username match.
    try:
        search_url = f"https://www.instagram.com/web/search/topsearch/?query={username}"
        resp = requests.get(search_url, headers=request_headers, cookies=cookie_jar)
        if resp.status_code == 200:
            for entry in resp.json().get('users', []):
                if entry['user']['username'].lower() == username.lower():
                    return entry['user']['pk']
    except Exception as e:
        print(f"Method 2 failed: {e}")
    # Neither lookup produced an ID.
    return None
def get_user_details(username, cookie_jar):
    """Get detailed user information including bio and follower count"""
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', '')
    }
    try:
        profile_url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
        resp = requests.get(profile_url, headers=request_headers, cookies=cookie_jar)
        if resp.status_code == 200:
            info = resp.json()['data']['user']
            # Flatten the nested edge_* counters into a simple dict.
            return {
                'bio': info.get('biography', ''),
                'follower_count': info.get('edge_followed_by', {}).get('count', 0),
                'following_count': info.get('edge_follow', {}).get('count', 0),
                'post_count': info.get('edge_owner_to_timeline_media', {}).get('count', 0),
                'is_business': info.get('is_business_account', False),
                'category': info.get('category_name', ''),
                'external_url': info.get('external_url', '')
            }
    except Exception as e:
        print(f"Error fetching details for {username}: {e}")
    # Non-200 response or any failure above falls through to None.
    return None
def is_brand_page(bio, follower_count, following_count, post_count, min_followers=1000, max_following_ratio=2.0):
    """
    Determine if an account is a brand page based on bio content and metrics.

    Args:
        bio: Account biography text (may be None — treated as empty).
        follower_count: Number of followers
        following_count: Number of accounts following
        post_count: Number of posts (currently unused; kept for interface stability)
        min_followers: Minimum follower count to be considered a brand
        max_following_ratio: Maximum ratio of following/followers

    Returns:
        tuple: (is_brand, reason)
    """
    # Brand keywords in bio
    brand_keywords = [
        # Business indicators
        r'\b(shop|store|brand|official|business|company|boutique)\b',
        r'\b(buy|purchase|order|sale|discount|promo|offer)\b',
        # Contact/shipping
        r'\b(shipping|delivery|worldwide|dm\s+to\s+order|whatsapp)\b',
        r'\b(contact|inquiries|orders|email)\b',
        # Product types
        r'\b(cosmetics|beauty|skincare|makeup|fashion|clothing|jewelry|accessories)\b',
        r'\b(organic|natural|handmade|luxury|premium)\b',
        # Business patterns
        r'📧|📩|📞|☎️|🛒|🛍️|💳',  # Business emojis
        r'(www\.|\.com|\.co|\.pk)',  # Website URLs
        r'(est\.|established|since)\s+\d{4}',  # Establishment year
        r'(dm|message)\s+(us|for)',  # DM for orders
    ]
    # Influencer indicators (to exclude)
    influencer_keywords = [
        r'\b(influencer|blogger|vlogger|content\s+creator|lifestyle)\b',
        r'\b(personal|my\s+life|journey|traveler)\b',
        r'\b(fitness\s+journey|transformation|inspiration)\b',
    ]
    # Instagram's `biography` field can come back as None — coerce to '' so
    # .lower() cannot raise AttributeError.
    bio_lower = (bio or '').lower()
    # Check for influencer indicators first — they veto the brand label.
    for pattern in influencer_keywords:
        if re.search(pattern, bio_lower):
            return False, "Appears to be influencer (bio keywords)"
    # Check follower count
    if follower_count < min_followers:
        return False, f"Follower count too low ({follower_count} < {min_followers})"
    # Check following ratio (brands typically don't follow many accounts)
    if follower_count > 0:
        following_ratio = following_count / follower_count
        if following_ratio > max_following_ratio:
            return False, f"Following ratio too high ({following_ratio:.2f} > {max_following_ratio})"
    # Check for brand indicators in bio; each matching pattern adds one point.
    brand_score = 0
    matched_patterns = []
    for pattern in brand_keywords:
        if re.search(pattern, bio_lower):
            brand_score += 1
            matched_patterns.append(pattern)
    # Need at least 2 brand indicators
    if brand_score >= 2:
        return True, f"Brand indicators found: {brand_score}"
    # Additional check: very low following relative to followers (typical for brands)
    if follower_count > 5000 and following_count < 500:
        return True, "High follower/low following ratio (brand pattern)"
    return False, "Insufficient brand indicators"
def scrap_brand_following(username, password, target, min_followers=1000, max_following_ratio=2.0,
                          max_accounts_to_check=50, delay=2):
    """
    Scrape following accounts and filter for brand pages.

    Args:
        username: Your Instagram username
        password: Your Instagram password
        target: Target account to scrape following from
        min_followers: Minimum followers for brand consideration
        max_following_ratio: Maximum following/followers ratio
        max_accounts_to_check: Maximum accounts to check (to avoid rate limits)
        delay: Delay between requests in seconds

    Returns:
        pandas.DataFrame of matched brand pages (also saved as a CSV),
        None when no account matched, or an error-message string on failure.
    """
    # Login
    cookie_jar = IG_LOGIN(username, password)
    if not cookie_jar:
        return "Login failed"
    # Get target user ID
    user_id = get_user_id(target, cookie_jar)
    if not user_id:
        return f"Could not find user ID for {target}"
    print(f"Found user ID: {user_id}")
    print("Fetching following list...\n")
    # Fetch the (first page of the) following list in one request.
    base_url = f"https://www.instagram.com/api/v1/friendships/{user_id}/following/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', ''),
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = {
        'count': max_accounts_to_check,
        'search_surface': 'following_list_page'
    }
    try:
        response = requests.get(base_url, headers=headers, cookies=cookie_jar, params=params)
        if response.status_code != 200:
            print(f"Error: Status code {response.status_code}")
            print(f"Response: {response.text[:500]}")
            return f"Failed to fetch following: Status {response.status_code}"
        user_info = response.json()
        users = user_info.get('users', [])
        if not users:
            return "No following found or account is private"
        print(f"Found {len(users)} accounts. Analyzing for brand pages...\n")
        # Filter for brand pages
        brand_accounts = []
        for idx, user in enumerate(users, 1):
            username_to_check = user.get('username', '')
            print(f"[{idx}/{len(users)}] Checking: {username_to_check}...", end=" ")
            # Get detailed user info (second request per account)
            details = get_user_details(username_to_check, cookie_jar)
            if details:
                is_brand, reason = is_brand_page(
                    details['bio'],
                    details['follower_count'],
                    details['following_count'],
                    details['post_count'],
                    min_followers,
                    max_following_ratio
                )
                if is_brand:
                    print(f"✓ BRAND ({reason})")
                    brand_accounts.append({
                        'User ID': user.get('pk', ''),
                        'Username': username_to_check,
                        'Full Name': user.get('full_name', ''),
                        # Truncate long bios so the table stays readable.
                        'Bio': details['bio'][:100] + '...' if len(details['bio']) > 100 else details['bio'],
                        'Followers': details['follower_count'],
                        'Following': details['following_count'],
                        'Posts': details['post_count'],
                        'Category': details['category'],
                        'External URL': details['external_url'],
                        'Is Business': details['is_business'],
                        'Is Private': user.get('is_private', False),
                        'Is Verified': user.get('is_verified', False),
                        'Profile Pic': user.get('profile_pic_url', ''),
                        'Brand Reason': reason
                    })
                else:
                    print(f"✗ Not brand ({reason})")
            else:
                print("✗ Failed to fetch details")
            # Rate limiting between per-account detail requests.
            time.sleep(delay)
        if not brand_accounts:
            print("\nNo brand pages found matching criteria.")
            return None
        # Create DataFrame
        df = pd.DataFrame(brand_accounts)
        # Save to CSV
        filename = f"{target}_brand_following_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        # BUG FIX: previously printed the literal "(unknown)" instead of the
        # CSV filename the results were saved to.
        print(f"\n✓ Saved {len(df)} brand pages to {filename}")
        return df
    except Exception as e:
        print(f"\nError scraping following: {e}")
        return f"Failed: {str(e)}"
# ---------------- Gradio wrapper ----------------
def gradio_scrap_brand_following(username, password, target, min_followers, max_following_ratio, max_accounts_to_check, delay):
    """Adapter between the Gradio form and scrap_brand_following.

    Coerces the numeric widget values to the scraper's expected types and
    always returns a DataFrame so the gr.Dataframe output can render it:
    error strings and exceptions become a one-row table with an "Error" column.
    """
    try:
        scraped = scrap_brand_following(
            username=username,
            password=password,
            target=target,
            min_followers=int(min_followers),
            max_following_ratio=float(max_following_ratio),
            max_accounts_to_check=int(max_accounts_to_check),
            delay=float(delay)
        )
        if isinstance(scraped, pd.DataFrame):
            return scraped
        # Non-DataFrame results are status/error strings from the scraper.
        return pd.DataFrame([{"Error": scraped}])
    except Exception as exc:
        return pd.DataFrame([{"Error": str(exc)}])
# ---------------- Gradio UI ----------------
# Top-level script block: builds the Blocks layout and launches the app.
with gr.Blocks() as demo:
    gr.Markdown("## Instagram Brand Following Scraper")
    # Row 1: credentials and the account whose following list is scraped.
    with gr.Row():
        username = gr.Textbox(label="Your Instagram Username")
        password = gr.Textbox(label="Your Instagram Password", type="password")
        target = gr.Textbox(label="Target Account Username")
    # Row 2: filter thresholds, mirroring scrap_brand_following's parameters.
    with gr.Row():
        min_followers = gr.Number(value=1000, label="Minimum Followers")
        max_following_ratio = gr.Number(value=2.0, label="Max Following/Followers Ratio")
        max_accounts_to_check = gr.Number(value=30, label="Max Accounts to Check")
        delay = gr.Number(value=2, label="Delay between requests (seconds)")
    # Result table; headers match the dict keys built in scrap_brand_following.
    output = gr.Dataframe(headers=["User ID","Username","Full Name","Bio","Followers","Following","Posts","Category",
                                   "External URL","Is Business","Is Private","Is Verified","Profile Pic","Brand Reason"],
                          type="pandas")
    submit = gr.Button("Scrape Brand Accounts")
    submit.click(fn=gradio_scrap_brand_following,
                 inputs=[username, password, target, min_followers, max_following_ratio, max_accounts_to_check, delay],
                 outputs=output)
# share=True creates a public gradio.live link in addition to the local server.
demo.launch(share=True)