Spaces:

hamna11
/

instagram-rand-scrapper

Sleeping

App Files Files Community

hamna11 committed on Oct 27, 2025

Commit

54a601a

verified ·

1 Parent(s): 48fc918

Upload 2 files

Browse files

Files changed (2) hide show

app.py +287 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,287 @@

+import time
+import os
+import re
+import io
+import pandas as pd
+import gradio as gr
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from webdriver_manager.chrome import ChromeDriverManager
+# ---------- Utility Functions ----------
def save_credentials(username, password):
    """Persist the login pair to ``credentials.txt`` in the working directory.

    The file holds the username on the first line and the password on the
    second, with no trailing newline.

    NOTE(review): the password is stored in plain text. The file is
    restricted to its owner to limit exposure, but callers should consider
    not persisting credentials at all.

    Args:
        username: Account name written on line one.
        password: Password written on line two.
    """
    path = 'credentials.txt'
    # O_BINARY (Windows only) stops the C runtime from translating newlines a
    # second time underneath Python's own text layer.
    flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, 'O_BINARY', 0)
    # Create the file as 0o600; a plain open() would apply the process umask,
    # which usually leaves the password readable by other local users.
    fd = os.open(path, flags, 0o600)
    with os.fdopen(fd, 'w', encoding='utf-8') as f:
        f.write(f"{username}\n{password}")
    try:
        # The creation mode is ignored when the file already existed, so
        # tighten it explicitly as well.
        os.chmod(path, 0o600)
    except OSError:
        # Best effort: some filesystems do not support POSIX permissions.
        pass
def load_credentials():
    """Read the login pair back from ``credentials.txt``.

    Returns:
        A ``(username, password)`` tuple taken from the first two lines of
        the file, or ``None`` when the file does not exist or contains fewer
        than two lines.
    """
    # EAFP: open directly instead of checking os.path.exists() first, which
    # leaves a window where the file can disappear between check and open.
    try:
        with open('credentials.txt', 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        return None
    if len(lines) < 2:
        return None
    username = lines[0].strip()
    # Remove only the line terminator: leading/trailing spaces are legal
    # password characters and are written out verbatim when saving.
    password = lines[1].rstrip('\r\n')
    return username, password
def login(bot, username, password, log):
    """Sign in to Instagram through the web login form.

    Args:
        bot: Selenium WebDriver that drives the browser.
        username: Account name typed into the form.
        password: Password typed into the form.
        log: List of message strings; progress and errors are appended to it.

    Returns:
        True when the form was submitted and the browser left the login
        page, False otherwise. Earlier versions returned nothing, so callers
        that ignore the result are unaffected.
    """
    log.append("[Info] - Logging in...")
    bot.get("https://www.instagram.com/accounts/login/")
    time.sleep(3)
    try:
        # The wait already returns the located element, so reuse it instead
        # of looking the field up a second time.
        username_field = WebDriverWait(bot, 15).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        username_field.send_keys(username)
        bot.find_element(By.NAME, "password").send_keys(password)
        bot.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    except Exception as e:
        log.append(f"[Error] during login: {e}")
        # Do not fall through to the success message below.
        return False
    # Give the site time to process the submission and redirect.
    time.sleep(8)
    # Heuristic: a rejected or incomplete login leaves the browser on a URL
    # under /accounts/login.
    if "/accounts/login" in bot.current_url:
        log.append("[Error] during login: still on the login page after submitting.")
        return False
    log.append("[Info] - Logged in successfully.")
    return True
def _parse_follower_text(text):
    """Convert a displayed count such as "1,234", "12.5K" or "3M followers" to an int.

    Returns 0 when the text contains no digits. May raise ValueError for a
    malformed abbreviated number such as "1.2.3k".
    """
    # Join digit groups split by "," or " " ("1,234" / "1 234" -> "1234").
    normalized = re.sub(r'(?<=\d)[ ,](?=\d)', '', text.lower())
    # The k/m suffix must end a word so trailing labels ("followers") are not
    # mistaken for part of the number.
    match = re.search(r'(\d[\d.]*)\s*([km])?\b', normalized)
    if match is None:
        return 0
    number, suffix = match.groups()
    if suffix is None:
        # Without a suffix, dots are treated as group separators.
        return int(number.replace('.', ''))
    scale = 1000 if suffix == 'k' else 1000000
    # round() rather than truncation guards against float error
    # (e.g. x.x * 1000 landing just below the intended integer).
    return int(round(float(number) * scale))


def extract_followers_count(bot, log):
    """Read the follower count from the profile page currently loaded in ``bot``.

    Several XPath candidates are tried in order; the first one that yields
    non-empty text is parsed.

    Args:
        bot: Selenium WebDriver positioned on a profile page.
        log: List of message strings; debug and error lines are appended.

    Returns:
        The follower count as an int, or 0 when it cannot be located or
        parsed. This function does not raise for lookup or parse failures.
    """
    possible_xpaths = [
        "//a[contains(@href,'followers')]//span/span",
        "//a[contains(@href,'followers')]//span",
        "//ul/li[a[contains(@href,'followers')]]//span",
        "//header//ul/li[2]//button/span",
        "//header//ul/li[2]//span/span"
    ]
    follower_text = ""
    for xp in possible_xpaths:
        try:
            element = WebDriverWait(bot, 5).until(
                EC.presence_of_element_located((By.XPATH, xp))
            )
            follower_text = element.text.strip()
        except Exception:
            # Best effort: this candidate did not match, try the next one.
            # (Exception, not a bare except, so Ctrl-C still works.)
            continue
        if follower_text:
            break
    if not follower_text:
        log.append("[Error parsing followers] Could not locate followers element.")
        return 0
    log.append(f"[Debug] Raw follower text found: {follower_text}")
    try:
        return _parse_follower_text(follower_text)
    except ValueError as e:
        log.append(f"[Error parsing followers] {e}")
        return 0
# Words whose presence suggests a business/brand account.
_BRAND_KEYWORDS = (
    'official', 'brand', 'store', 'clothing', 'shop', 'studio', 'company',
    'boutique', 'restaurant', 'cafe', 'apparel', 'cosmetics', 'products',
    'organization', 'service', 'beauty', 'salon', 'facial', 'skincare', 'clinic',
)
# Words whose presence suggests an individual creator account.
_INFLUENCER_KEYWORDS = (
    'blogger', 'creator', 'influencer', 'model', 'artist', 'makeup',
    'reviewer', 'photographer', 'personal', 'actor', 'writer', 'content',
)


def is_brand_page(description, name):
    """Guess whether a profile belongs to a brand rather than an individual.

    The bio and display name are joined, lower-cased and scanned for two
    keyword lists. Each keyword counts at most once and matches as a plain
    substring, so 'shop' also matches 'shopping' or 'photoshop'.

    Args:
        description: Profile bio text; ``None`` is treated as empty.
        name: Profile display name; ``None`` is treated as empty.

    Returns:
        True when strictly more brand keywords than influencer keywords are
        present. A tie, including no matches at all, returns False.
    """
    # "or ''" keeps a missing bio/name from raising TypeError on concatenation.
    desc = f"{description or ''} {name or ''}".lower()
    brand_score = sum(1 for w in _BRAND_KEYWORDS if w in desc)
    influencer_score = sum(1 for w in _INFLUENCER_KEYWORDS if w in desc)
    return brand_score > influencer_score
def scrape_profile(bot, username, log):
    """Open a profile page and collect the fields used for brand filtering.

    Every field is looked up on a best-effort basis: a lookup that fails
    leaves an empty string rather than aborting the scrape.

    Args:
        bot: Selenium WebDriver; it is navigated to the profile URL.
        username: Instagram handle to visit.
        log: List of message strings, passed on to the follower-count reader.

    Returns:
        A dict with the keys ``username``, ``url``, ``name``, ``bio``,
        ``followers``, ``link_in_bio`` and ``is_brand_page``.
    """
    url = f"https://www.instagram.com/{username}/"
    bot.get(url)
    # Fixed pause to let the page render before querying it.
    time.sleep(5)
    data = {"username": username, "url": url}

    # "except Exception" (not a bare except) keeps the best-effort behaviour
    # while letting KeyboardInterrupt/SystemExit propagate.
    try:
        data["name"] = bot.find_element(By.XPATH, "//header//h2 | //header//h1").text
    except Exception:
        data["name"] = ""

    try:
        # NOTE(review): 'xqui205' is a generated CSS class name and may
        # change when the site is redeployed.
        section = WebDriverWait(bot, 5).until(
            EC.presence_of_element_located((By.XPATH, "//section[contains(@class,'xqui205')]"))
        )
        data["bio"] = section.text.strip()
    except Exception:
        data["bio"] = ""

    data["followers"] = extract_followers_count(bot, log)

    try:
        link_elem = bot.find_element(
            By.XPATH, "//section[contains(@class,'xqui205')]//a[contains(@href,'http')]"
        )
        data["link_in_bio"] = link_elem.get_attribute("href")
    except Exception:
        data["link_in_bio"] = ""

    data["is_brand_page"] = is_brand_page(data["bio"], data["name"])
    return data
def _parse_following_text(text):
    """Convert a displayed count such as "1,234", "12.5K" or "3M" to an int.

    Returns 0 when the text contains no digits. May raise ValueError for a
    malformed abbreviated number such as "1.2.3k".
    """
    # Join digit groups split by "," or " " ("1,234" / "1 234" -> "1234").
    normalized = re.sub(r'(?<=\d)[ ,](?=\d)', '', text.lower())
    # The k/m suffix must end a word so a trailing label is not mistaken for
    # part of the number.
    match = re.search(r'(\d[\d.]*)\s*([km])?\b', normalized)
    if match is None:
        return 0
    number, suffix = match.groups()
    if suffix is None:
        # Without a suffix, dots are treated as group separators.
        return int(number.replace('.', ''))
    scale = 1000 if suffix == 'k' else 1000000
    # round() rather than truncation guards against float error.
    return int(round(float(number) * scale))


def get_following_list(bot, target_username, limit, log):
    """Collect up to ``limit`` usernames that ``target_username`` follows.

    Opens the target profile, reads the displayed following count, opens the
    "following" dialog and scrolls it, harvesting profile links on each pass.

    Args:
        bot: Selenium WebDriver, expected to be logged in already.
        target_username: Handle whose following list is read.
        limit: Maximum number of usernames to return.
        log: List of message strings; progress and errors are appended.

    Returns:
        A list of at most ``limit`` usernames in the order they were first
        seen, or an empty list when the dialog could not be opened.
    """
    log.append(f"[Info] - Opening {target_username}'s profile...")
    bot.get(f"https://www.instagram.com/{target_username}/")
    time.sleep(5)

    following_xpath_candidates = [
        "//a[contains(@href, '/following')]//span/span",
        "//a[contains(@href, '/following')]//span",
        "//ul/li[a[contains(@href, 'following')]]//span",
    ]
    total_following_text = ""
    for xp in following_xpath_candidates:
        try:
            el = WebDriverWait(bot, 5).until(EC.presence_of_element_located((By.XPATH, xp)))
            total_following_text = el.text.strip()
        except Exception:
            # Best effort: try the next candidate.
            continue
        if total_following_text:
            break

    total_following = 0
    if not total_following_text:
        log.append("[Warning] Could not read following count; assuming 0.")
        log.append(f"[Info] - Total followings: {total_following}")
    else:
        try:
            total_following = _parse_following_text(total_following_text)
        except ValueError as e:
            log.append(f"[Error reading following count] {e}")
        else:
            log.append(f"[Info] - Total followings: {total_following}")

    try:
        following_link = WebDriverWait(bot, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '/following')]"))
        )
        following_link.click()
    except Exception:
        log.append("[Error] Could not open following list.")
        return []
    time.sleep(4)

    try:
        # NOTE(review): these are generated CSS class names and may change
        # when the site is redeployed.
        scroll_box = WebDriverWait(bot, 10).until(
            EC.presence_of_element_located((By.XPATH,
                                            "//div[contains(@class,'x6nl9eh') and contains(@class,'x1a5l9x9') and contains(@class,'x7vuprf')]"))
        )
    except Exception:
        log.append("[Error] Could not find the scroll box.")
        return []

    # A dict keeps first-seen order, so truncating to `limit` is
    # deterministic (a set would drop arbitrary entries).
    follows = {}
    scroll_round = 0
    # When the total is unknown (0), budget scrolls from `limit` alone;
    # min(limit, 0) would otherwise cap the run at 5 scrolls.
    expected = min(limit, total_following) if total_following else limit
    max_scrolls = (expected // 5) + 5
    log.append(f"[Info] - Will scroll up to {max_scrolls} times to fetch {limit} followings...")
    excluded_parts = ("followers", "following", "explore", "reels")
    while len(follows) < limit and scroll_round < max_scrolls:
        for link in scroll_box.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href")
            if not href or "instagram.com" not in href:
                continue
            if any(part in href for part in excluded_parts):
                continue
            # The last path segment of a profile URL is the username.
            follows.setdefault(href.strip("/").split("/")[-1], None)
        # Scroll one viewport down so the dialog lazy-loads the next batch.
        bot.execute_script("arguments[0].scrollTop += arguments[0].offsetHeight;", scroll_box)
        time.sleep(3)
        scroll_round += 1
        log.append(f"[Scroll {scroll_round}] Collected so far: {len(follows)}")
        if total_following and len(follows) >= total_following:
            log.append("[Info] - Reached end of following list.")
            break
    log.append(f"[Info] - Found {len(follows)} following users (limit was {limit}).")
    return list(follows)[:limit]
+# ---------- Gradio Interface Wrapper ----------
def run_scraper(insta_username, insta_password, target_username, limit):
    """Gradio callback: log in, scan a following list and export brand pages.

    Args:
        insta_username: Instagram account used to log in.
        insta_password: Password for that account.
        target_username: Handle whose following list is scanned.
        limit: Maximum number of followed accounts to inspect; any value
            accepted by ``int()`` (the UI supplies a float).

    Returns:
        A ``(log_text, csv_path)`` tuple. ``csv_path`` is
        ``"brand_following.csv"`` when at least one account qualified
        (50,000+ followers and classified as a brand), otherwise ``None``.
    """
    log = []

    # Validate before starting a browser: the Number input yields None when
    # cleared, which int() rejects.
    try:
        limit = int(limit)
    except (TypeError, ValueError, OverflowError):
        log.append("[Error] Limit must be a whole number.")
        return "\n".join(log), None
    if limit <= 0:
        log.append("[Error] Limit must be greater than zero.")
        return "\n".join(log), None

    # NOTE(review): this writes the visitor's password to disk in plain
    # text, and nothing in this function reads it back. Consider removing it.
    save_credentials(insta_username, insta_password)

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless=new")
    bot = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

    results = []
    try:
        login(bot, insta_username, insta_password, log)
        following_users = get_following_list(bot, target_username, limit, log)
        log.append("[Info] - Checking each following for brand page criteria...")
        for user in following_users:
            try:
                data = scrape_profile(bot, user, log)
            except Exception as e:
                # One broken profile must not abort the whole run.
                log.append(f"[Error scraping {user}] {e}")
                continue
            if data["followers"] >= 50000 and data["is_brand_page"]:
                log.append(f"[✔] {user} qualifies ({data['followers']} followers)")
                results.append(data)
            else:
                log.append(f"[✖] {user} skipped (followers={data['followers']}, brand={data['is_brand_page']})")
    finally:
        # Always shut the browser down, even when a step above raises;
        # otherwise headless Chrome processes accumulate.
        bot.quit()

    if not results:
        log.append("[Info] - No qualifying brand pages found.")
        return "\n".join(log), None

    csv_path = "brand_following.csv"
    pd.DataFrame(results).to_csv(csv_path, index=False, encoding="utf-8")
    log.append(f"[Saved] {len(results)} brand pages saved to brand_following.csv")
    return "\n".join(log), csv_path
+# ---------- Gradio App ----------
# Form fields, in the positional order expected by run_scraper.
_input_components = [
    gr.Textbox(label="Instagram Username"),
    gr.Textbox(label="Instagram Password", type="password"),
    gr.Textbox(label="Target Username (whose following to scrape)"),
    gr.Number(label="Limit (e.g. 50)", value=50),
]

# run_scraper returns (log text, CSV path or None), mapped onto these two.
_output_components = [
    gr.Textbox(label="Logs (Live Progress)", lines=25),
    gr.File(label="Download CSV (if available)"),
]

iface = gr.Interface(
    fn=run_scraper,
    inputs=_input_components,
    outputs=_output_components,
    title="Instagram Brand Follower Scraper",
    description="Scrape Instagram following list, detect brand pages (with 50k+ followers).",
)

# Start the web UI as soon as the module is executed.
iface.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+selenium>=4.0.0
+webdriver-manager>=3.8.0
+pandas>=1.0.0
+gradio