# hamna11's picture
# Update app.py
# e640e49 verified
import gradio as gr
import pandas as pd
from datetime import datetime
import time
import requests
import json
import re
# ---------------- Fixed IG_LOGIN ----------------
def IG_LOGIN(username, password):
    """Log in to Instagram through the web AJAX endpoint.

    Performs a fresh login on every call (persisted cookie files are not
    available in hosted Spaces).

    Args:
        username: Instagram account username.
        password: Instagram account password; sent in the
            #PWD_INSTAGRAM_BROWSER:0:<ts>:<pwd> client-side envelope.

    Returns:
        dict: Session cookies on successful authentication, else None.
    """
    link = 'https://www.instagram.com/accounts/login/'
    login_url = 'https://www.instagram.com/accounts/login/ajax/'
    time_stamp = int(datetime.now().timestamp())
    session = requests.Session()
    try:
        # Prime the session so Instagram issues a CSRF token cookie.
        response = session.get(link, timeout=15)
        csrf = response.cookies.get('csrftoken')
        if not csrf:
            # Without a CSRF token the login POST cannot succeed, and a
            # None header value would make requests raise anyway.
            return None
        payload = {
            'username': username,
            'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{time_stamp}:{password}',
            'queryParams': {},
            'optIntoOneTap': 'false'
        }
        login_header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "Referer": "https://www.instagram.com/accounts/login/",
            "x-csrftoken": csrf
        }
        login_response = session.post(login_url, data=payload,
                                      headers=login_header, timeout=15)
        # A checkpoint/challenge page returns HTML, not JSON; treat as failure.
        json_data = login_response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Login error: {e}")
        return None
    if json_data.get("authenticated"):
        return session.cookies.get_dict()
    return None
def get_user_id(username, cookie_jar):
    """Resolve an Instagram username to its numeric user ID.

    Tries the web_profile_info endpoint first, then falls back to the
    top-search endpoint.

    Args:
        username: Username to look up.
        cookie_jar: Cookie dict from a successful IG_LOGIN call.

    Returns:
        The user ID as returned by the API (string or int), or None if
        both methods fail.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', '')
    }
    # Method 1: profile-info endpoint returns the ID directly.
    try:
        url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
        response = requests.get(url, headers=headers, cookies=cookie_jar, timeout=15)
        if response.status_code == 200:
            # KeyError/TypeError here (unexpected JSON shape) falls through
            # to method 2 via the except below.
            return response.json()['data']['user']['id']
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        print(f"Method 1 failed: {e}")
    # Method 2: search by name and match the username case-insensitively.
    try:
        search_url = f"https://www.instagram.com/web/search/topsearch/?query={username}"
        response = requests.get(search_url, headers=headers, cookies=cookie_jar, timeout=15)
        if response.status_code == 200:
            data = response.json()
            for user in data.get('users', []):
                if user['user']['username'].lower() == username.lower():
                    return user['user']['pk']
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        print(f"Method 2 failed: {e}")
    return None
def get_user_details(username, cookie_jar):
    """Fetch profile details (bio, counts, business flags) for a username.

    Args:
        username: Username whose profile to fetch.
        cookie_jar: Cookie dict from a successful IG_LOGIN call.

    Returns:
        dict with keys 'bio', 'follower_count', 'following_count',
        'post_count', 'is_business', 'category', 'external_url', or
        None if the profile could not be fetched or parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', '')
    }
    try:
        url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
        response = requests.get(url, headers=headers, cookies=cookie_jar, timeout=15)
        if response.status_code == 200:
            user_data = response.json()['data']['user']
            return {
                # 'biography' can be JSON null; normalize to '' so callers
                # can safely call .lower()/len() on it.
                'bio': user_data.get('biography') or '',
                'follower_count': user_data.get('edge_followed_by', {}).get('count', 0),
                'following_count': user_data.get('edge_follow', {}).get('count', 0),
                'post_count': user_data.get('edge_owner_to_timeline_media', {}).get('count', 0),
                'is_business': user_data.get('is_business_account', False),
                'category': user_data.get('category_name', ''),
                'external_url': user_data.get('external_url', '')
            }
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        print(f"Error fetching details for {username}: {e}")
    return None
def is_brand_page(bio, follower_count, following_count, post_count, min_followers=1000, max_following_ratio=2.0):
    """
    Determine if an account is a brand page based on bio content and metrics.

    Heuristic order: influencer keywords reject first, then follower floor,
    then following/follower ratio, then a bio keyword score, and finally a
    high-follower/low-following structural pattern.

    Args:
        bio: Account biography text (None is treated as an empty bio)
        follower_count: Number of followers
        following_count: Number of accounts following
        post_count: Number of posts (currently unused by the heuristic,
            kept for interface compatibility)
        min_followers: Minimum follower count to be considered a brand
        max_following_ratio: Maximum ratio of following/followers
    Returns:
        tuple: (is_brand, reason)
    """
    # Brand keywords in bio
    brand_keywords = [
        # Business indicators
        r'\b(shop|store|brand|official|business|company|boutique)\b',
        r'\b(buy|purchase|order|sale|discount|promo|offer)\b',
        # Contact/shipping
        r'\b(shipping|delivery|worldwide|dm\s+to\s+order|whatsapp)\b',
        r'\b(contact|inquiries|orders|email)\b',
        # Product types
        r'\b(cosmetics|beauty|skincare|makeup|fashion|clothing|jewelry|accessories)\b',
        r'\b(organic|natural|handmade|luxury|premium)\b',
        # Business patterns
        r'📧|📩|📞|☎️|🛒|🛍️|💳',  # Business emojis
        r'(www\.|\.com|\.co|\.pk)',  # Website URLs
        r'(est\.|established|since)\s+\d{4}',  # Establishment year
        r'(dm|message)\s+(us|for)',  # DM for orders
    ]
    # Influencer indicators (to exclude)
    influencer_keywords = [
        r'\b(influencer|blogger|vlogger|content\s+creator|lifestyle)\b',
        r'\b(personal|my\s+life|journey|traveler)\b',
        r'\b(fitness\s+journey|transformation|inspiration)\b',
    ]
    # Guard against a missing bio (the API may return null).
    bio_lower = (bio or '').lower()
    # Check for influencer indicators first
    for pattern in influencer_keywords:
        if re.search(pattern, bio_lower):
            return False, "Appears to be influencer (bio keywords)"
    # Check follower count
    if follower_count < min_followers:
        return False, f"Follower count too low ({follower_count} < {min_followers})"
    # Check following ratio (brands typically don't follow many accounts)
    if follower_count > 0:
        following_ratio = following_count / follower_count
        if following_ratio > max_following_ratio:
            return False, f"Following ratio too high ({following_ratio:.2f} > {max_following_ratio})"
    # Score: one point per matching brand pattern
    brand_score = sum(1 for pattern in brand_keywords if re.search(pattern, bio_lower))
    # Need at least 2 brand indicators
    if brand_score >= 2:
        return True, f"Brand indicators found: {brand_score}"
    # Additional check: if very low following count relative to followers (typical for brands)
    if follower_count > 5000 and following_count < 500:
        return True, "High follower/low following ratio (brand pattern)"
    return False, "Insufficient brand indicators"
def scrap_brand_following(username, password, target, min_followers=1000, max_following_ratio=2.0,
                          max_accounts_to_check=50, delay=2):
    """
    Scrape the accounts a target user follows and filter for brand pages.

    Args:
        username: Your Instagram username
        password: Your Instagram password
        target: Target account to scrape following from
        min_followers: Minimum followers for brand consideration
        max_following_ratio: Maximum following/followers ratio
        max_accounts_to_check: Maximum accounts to check (to avoid rate limits)
        delay: Delay between requests in seconds

    Returns:
        pandas.DataFrame of brand accounts (also written to a timestamped
        CSV in the working directory), None when no brand pages matched,
        or an error-message string on failure.
    """
    # Login
    cookie_jar = IG_LOGIN(username, password)
    if not cookie_jar:
        return "Login failed"
    # Get target user ID
    user_id = get_user_id(target, cookie_jar)
    if not user_id:
        return f"Could not find user ID for {target}"
    print(f"Found user ID: {user_id}")
    print(f"Fetching following list...\n")
    # Fetch following list
    base_url = f"https://www.instagram.com/api/v1/friendships/{user_id}/following/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-CSRFToken': cookie_jar.get('csrftoken', ''),
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = {
        'count': max_accounts_to_check,
        'search_surface': 'following_list_page'
    }
    try:
        # Timeout so a stalled request cannot hang the whole scrape.
        response = requests.get(base_url, headers=headers, cookies=cookie_jar,
                                params=params, timeout=15)
        if response.status_code != 200:
            print(f"Error: Status code {response.status_code}")
            print(f"Response: {response.text[:500]}")
            return f"Failed to fetch following: Status {response.status_code}"
        user_info = response.json()
        users = user_info.get('users', [])
        if not users:
            return "No following found or account is private"
        print(f"Found {len(users)} accounts. Analyzing for brand pages...\n")
        # Filter for brand pages
        brand_accounts = []
        for idx, user in enumerate(users, 1):
            username_to_check = user.get('username', '')
            print(f"[{idx}/{len(users)}] Checking: {username_to_check}...", end=" ")
            # Get detailed user info
            details = get_user_details(username_to_check, cookie_jar)
            if details:
                is_brand, reason = is_brand_page(
                    details['bio'],
                    details['follower_count'],
                    details['following_count'],
                    details['post_count'],
                    min_followers,
                    max_following_ratio
                )
                if is_brand:
                    print(f"✓ BRAND ({reason})")
                    brand_accounts.append({
                        'User ID': user.get('pk', ''),
                        'Username': username_to_check,
                        'Full Name': user.get('full_name', ''),
                        'Bio': details['bio'][:100] + '...' if len(details['bio']) > 100 else details['bio'],
                        'Followers': details['follower_count'],
                        'Following': details['following_count'],
                        'Posts': details['post_count'],
                        'Category': details['category'],
                        'External URL': details['external_url'],
                        'Is Business': details['is_business'],
                        'Is Private': user.get('is_private', False),
                        'Is Verified': user.get('is_verified', False),
                        'Profile Pic': user.get('profile_pic_url', ''),
                        'Brand Reason': reason
                    })
                else:
                    print(f"✗ Not brand ({reason})")
            else:
                print("✗ Failed to fetch details")
            # Rate limiting between per-account detail requests.
            time.sleep(delay)
        if not brand_accounts:
            print("\nNo brand pages found matching criteria.")
            return None
        # Create DataFrame
        df = pd.DataFrame(brand_accounts)
        # Save to CSV with a timestamped name so runs don't overwrite.
        filename = f"{target}_brand_following_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        # Fixed: message previously printed a garbled placeholder instead
        # of the actual CSV filename.
        print(f"\n✓ Saved {len(df)} brand pages to {filename}")
        return df
    except Exception as e:
        # Top-level boundary: surface the failure to the UI as a string.
        print(f"\nError scraping following: {e}")
        return f"Failed: {str(e)}"
# ---------------- Gradio wrapper ----------------
def gradio_scrap_brand_following(username, password, target, min_followers, max_following_ratio, max_accounts_to_check, delay):
    """Gradio adapter around scrap_brand_following.

    Coerces the numeric form inputs (Gradio Numbers arrive as floats) and
    normalizes every outcome to a DataFrame so the output widget can
    always render it.
    """
    try:
        result = scrap_brand_following(
            username=username,
            password=password,
            target=target,
            min_followers=int(min_followers),
            max_following_ratio=float(max_following_ratio),
            max_accounts_to_check=int(max_accounts_to_check),
            delay=float(delay)
        )
        if isinstance(result, pd.DataFrame):
            return result
        if result is None:
            # scrap_brand_following returns None when nothing matched;
            # previously this surfaced as a confusing {"Error": None} row.
            return pd.DataFrame([{"Error": "No brand pages found matching criteria."}])
        # Any other non-DataFrame result is an error-message string.
        return pd.DataFrame([{"Error": result}])
    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])
# ---------------- Gradio UI ----------------
# Build the Gradio interface: credential/target inputs, heuristic tuning
# knobs, and a results table wired to the scraper wrapper.
with gr.Blocks() as demo:
    gr.Markdown("## Instagram Brand Following Scraper")
    # Row 1: login credentials and the account whose following list to scan.
    with gr.Row():
        username = gr.Textbox(label="Your Instagram Username")
        password = gr.Textbox(label="Your Instagram Password", type="password")
        target = gr.Textbox(label="Target Account Username")
    # Row 2: brand-detection thresholds and rate-limit delay.
    with gr.Row():
        min_followers = gr.Number(value=1000, label="Minimum Followers")
        max_following_ratio = gr.Number(value=2.0, label="Max Following/Followers Ratio")
        max_accounts_to_check = gr.Number(value=30, label="Max Accounts to Check")
        delay = gr.Number(value=2, label="Delay between requests (seconds)")
    # Headers mirror the dict keys built in scrap_brand_following.
    output = gr.Dataframe(headers=["User ID","Username","Full Name","Bio","Followers","Following","Posts","Category",
                                   "External URL","Is Business","Is Private","Is Verified","Profile Pic","Brand Reason"],
                          type="pandas")
    submit = gr.Button("Scrape Brand Accounts")
    # Wire the button to the wrapper; inputs are passed positionally in
    # the same order as the wrapper's parameters.
    submit.click(fn=gradio_scrap_brand_following,
                 inputs=[username, password, target, min_followers, max_following_ratio, max_accounts_to_check, delay],
                 outputs=output)
# share=True exposes a public tunnel URL when launched outside Spaces.
demo.launch(share=True)