hamna11 committed on
Commit
54a601a
·
verified ·
1 Parent(s): 48fc918

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +287 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import os
3
+ import re
4
+ import io
5
+ import pandas as pd
6
+ import gradio as gr
7
+ from selenium import webdriver
8
+ from selenium.webdriver.common.by import By
9
+ from selenium.webdriver.chrome.service import Service as ChromeService
10
+ from selenium.webdriver.support.ui import WebDriverWait
11
+ from selenium.webdriver.support import expected_conditions as EC
12
+ from selenium.common.exceptions import NoSuchElementException, TimeoutException
13
+ from webdriver_manager.chrome import ChromeDriverManager
14
+
15
+
16
+ # ---------- Utility Functions ----------
17
+
18
def save_credentials(username, password):
    """Persist the login pair to credentials.txt (username line, then password line).

    WARNING: the credentials are stored in plaintext in the current working
    directory — acceptable only for a local throwaway demo, never production.
    """
    # Explicit encoding so the file round-trips the same way on every
    # platform; the original relied on the locale default encoding.
    with open('credentials.txt', 'w', encoding='utf-8') as f:
        f.write(f"{username}\n{password}")
21
+
22
+
23
def load_credentials():
    """Read the saved login details from credentials.txt.

    Returns a (username, password) tuple, or None when the file is
    missing or holds fewer than two lines.
    """
    if not os.path.exists('credentials.txt'):
        return None
    with open('credentials.txt', 'r') as fh:
        entries = [line.strip() for line in fh]
    if len(entries) < 2:
        return None
    return entries[0], entries[1]
31
+
32
+
33
def login(bot, username, password, log):
    """Log the Selenium-driven browser into Instagram.

    Parameters:
        bot: a Selenium WebDriver instance.
        username, password: Instagram credentials.
        log: list of strings; progress/error messages are appended to it.

    Never raises — login failures are recorded in *log* instead.
    """
    log.append("[Info] - Logging in...")
    bot.get("https://www.instagram.com/accounts/login/")
    time.sleep(3)  # let the login page render before probing for fields

    try:
        WebDriverWait(bot, 15).until(EC.presence_of_element_located((By.NAME, "username")))
        bot.find_element(By.NAME, "username").send_keys(username)
        bot.find_element(By.NAME, "password").send_keys(password)
        bot.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    except Exception as e:
        log.append(f"[Error] during login: {e}")
    else:
        # Bug fix: the original logged "Logged in successfully" (and slept
        # 8s) even when the attempt above raised; only claim success when
        # the form was actually submitted.
        time.sleep(8)  # wait out the post-login redirect
        log.append("[Info] - Logged in successfully.")
48
+
49
+
50
def extract_followers_count(bot, log):
    """Read the follower count from the profile page currently loaded in *bot*.

    Tries several XPath candidates (Instagram's markup changes frequently),
    then converts shorthand such as "12.3k" / "1.2m" to an integer.

    Returns 0 when the count cannot be located or parsed; error details are
    appended to *log* rather than raised.
    """
    try:
        possible_xpaths = [
            "//a[contains(@href,'followers')]//span/span",
            "//a[contains(@href,'followers')]//span",
            "//ul/li[a[contains(@href,'followers')]]//span",
            "//header//ul/li[2]//button/span",
            "//header//ul/li[2]//span/span"
        ]

        follower_text = ""
        for xp in possible_xpaths:
            try:
                element = WebDriverWait(bot, 5).until(
                    EC.presence_of_element_located((By.XPATH, xp))
                )
                follower_text = element.text.strip()
                if follower_text:
                    break
            except Exception:
                # Bug fix: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit. This XPath just didn't
                # match; try the next candidate.
                continue

        if not follower_text:
            log.append("[Error parsing followers] Could not locate followers element.")
            return 0

        log.append(f"[Debug] Raw follower text found: {follower_text}")

        # Normalise "12,345" / "12.3 k" style text before parsing.
        follower_text = follower_text.lower().replace(',', '').replace(' ', '')
        if 'k' in follower_text:
            return int(float(follower_text.replace('k', '')) * 1000)
        elif 'm' in follower_text:
            return int(float(follower_text.replace('m', '')) * 1000000)
        elif follower_text.isdigit():
            return int(follower_text)
        else:
            # Fall back to stripping every non-digit character.
            num = re.sub(r'\D', '', follower_text)
            return int(num) if num else 0

    except Exception as e:
        log.append(f"[Error parsing followers] {e}")
        return 0
92
+
93
+
94
def is_brand_page(description, name):
    """Heuristically decide whether a profile looks like a brand page.

    Counts brand-flavoured vs influencer-flavoured keywords appearing (as
    substrings) in the combined bio text and display name. The profile is
    classified as a brand only when brand hits strictly outnumber
    influencer hits — ties are treated as "not a brand".
    """
    haystack = f"{description} {name}".lower()
    brand_keywords = (
        'official', 'brand', 'store', 'clothing', 'shop', 'studio', 'company',
        'boutique', 'restaurant', 'cafe', 'apparel', 'cosmetics', 'products',
        'organization', 'service', 'beauty', 'salon', 'facial', 'skincare', 'clinic',
    )
    influencer_keywords = (
        'blogger', 'creator', 'influencer', 'model', 'artist', 'makeup',
        'reviewer', 'photographer', 'personal', 'actor', 'writer', 'content',
    )
    brand_hits = sum(kw in haystack for kw in brand_keywords)
    influencer_hits = sum(kw in haystack for kw in influencer_keywords)
    return brand_hits > influencer_hits
108
+
109
+
110
def scrape_profile(bot, username, log):
    """Visit an Instagram profile and extract its key fields.

    Returns a dict with keys: username, url, name, bio, followers,
    link_in_bio, is_brand_page. Missing fields default to "" / 0 rather
    than raising, so a single broken profile does not abort a run.
    """
    url = f"https://www.instagram.com/{username}/"
    bot.get(url)
    time.sleep(5)  # give the profile page time to render
    data = {"username": username, "url": url}

    try:
        # The display name lives in an h1 or h2 depending on page variant.
        data["name"] = bot.find_element(By.XPATH, "//header//h2 | //header//h1").text
    except Exception:
        # Bug fix: was a bare `except:`; narrowed so Ctrl-C still works.
        data["name"] = ""

    try:
        # NOTE(review): 'xqui205' is an obfuscated, auto-generated class
        # name and will break whenever Instagram redeploys — verify often.
        section = WebDriverWait(bot, 5).until(
            EC.presence_of_element_located((By.XPATH, "//section[contains(@class,'xqui205')]"))
        )
        data["bio"] = section.text.strip()
    except Exception:
        data["bio"] = ""

    data["followers"] = extract_followers_count(bot, log)

    try:
        link_elem = bot.find_element(By.XPATH, "//section[contains(@class,'xqui205')]//a[contains(@href,'http')]")
        data["link_in_bio"] = link_elem.get_attribute("href")
    except Exception:
        data["link_in_bio"] = ""

    data["is_brand_page"] = is_brand_page(data["bio"], data["name"])
    return data
141
+
142
+
143
def get_following_list(bot, target_username, limit, log):
    """Collect up to *limit* usernames that *target_username* follows.

    Opens the profile, best-effort reads the total following count, opens
    the "following" dialog and scrolls it while harvesting profile links.

    Returns a list of usernames (order not guaranteed — a set is used for
    de-duplication), or [] when the dialog or its scroll box cannot be
    found. Progress and errors are appended to *log*.
    """
    log.append(f"[Info] - Opening {target_username}'s profile...")
    bot.get(f"https://www.instagram.com/{target_username}/")
    time.sleep(5)

    total_following = 0
    try:
        following_xpath_candidates = [
            "//a[contains(@href, '/following')]//span/span",
            "//a[contains(@href, '/following')]//span",
            "//ul/li[a[contains(@href, 'following')]]//span",
        ]
        total_following_text = ""
        for xp in following_xpath_candidates:
            try:
                el = WebDriverWait(bot, 5).until(EC.presence_of_element_located((By.XPATH, xp)))
                total_following_text = el.text.strip()
                if total_following_text:
                    break
            except Exception:
                # Bug fix: was a bare `except:`. Candidate didn't match;
                # try the next one.
                continue

        if not total_following_text:
            log.append("[Warning] Could not read following count; assuming 0.")
        else:
            # Convert "1,234" / "12.3k" / "1.2m" shorthand to an integer.
            txt = total_following_text.lower().replace(",", "").replace(" ", "")
            if "k" in txt:
                total_following = int(float(txt.replace("k", "")) * 1000)
            elif "m" in txt:
                total_following = int(float(txt.replace("m", "")) * 1000000)
            else:
                total_following = int(re.sub(r"\D", "", txt))
            log.append(f"[Info] - Total followings: {total_following}")
    except Exception as e:
        log.append(f"[Error reading following count] {e}")

    try:
        following_link = WebDriverWait(bot, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '/following')]"))
        )
        following_link.click()
    except Exception:
        log.append("[Error] Could not open following list.")
        return []

    time.sleep(4)

    try:
        # NOTE(review): these obfuscated class names are tied to a specific
        # Instagram deploy and will rot — confirm the selector still works.
        scroll_box = WebDriverWait(bot, 10).until(
            EC.presence_of_element_located((By.XPATH,
                "//div[contains(@class,'x6nl9eh') and contains(@class,'x1a5l9x9') and contains(@class,'x7vuprf')]"))
        )
    except Exception:
        log.append("[Error] Could not find the scroll box.")
        return []

    follows = set()
    scroll_round = 0
    # Heuristic: roughly 5 new entries per scroll, plus a few spare rounds.
    # When the count could not be read (total_following == 0) this caps the
    # loop at 5 scrolls regardless of *limit*.
    max_scrolls = (min(limit, total_following) // 5) + 5
    log.append(f"[Info] - Will scroll up to {max_scrolls} times to fetch {limit} followings...")

    while len(follows) < limit and scroll_round < max_scrolls:
        links = scroll_box.find_elements(By.TAG_NAME, "a")
        for link in links:
            href = link.get_attribute("href")
            # Keep only profile links; skip navigation/list links.
            if href and "instagram.com" in href and not any(
                x in href for x in ["followers", "following", "explore", "reels"]
            ):
                username = href.strip("/").split("/")[-1]
                follows.add(username)

        bot.execute_script("arguments[0].scrollTop += arguments[0].offsetHeight;", scroll_box)
        time.sleep(3)
        scroll_round += 1
        log.append(f"[Scroll {scroll_round}] Collected so far: {len(follows)}")

        if total_following and len(follows) >= total_following:
            log.append("[Info] - Reached end of following list.")
            break

    log.append(f"[Info] - Found {len(follows)} following users (limit was {limit}).")
    return list(follows)[:limit]
225
+
226
+
227
+ # ---------- Gradio Interface Wrapper ----------
228
+
229
def run_scraper(insta_username, insta_password, target_username, limit):
    """Drive the full scrape and return (log_text, csv_path_or_None).

    Logs into Instagram with the supplied credentials, collects up to
    *limit* accounts followed by *target_username*, visits each profile
    and keeps those classified as brand pages with >= 50,000 followers.
    Qualifying profiles are written to brand_following.csv.
    """
    log = []
    # NOTE(review): save_credentials persists the password in plaintext —
    # acceptable only for local/dev use.
    save_credentials(insta_username, insta_password)

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless=new")

    bot = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    results = []
    try:
        login(bot, insta_username, insta_password, log)

        following_users = get_following_list(bot, target_username, int(limit), log)
        log.append("[Info] - Checking each following for brand page criteria...")

        for user in following_users:
            try:
                data = scrape_profile(bot, user, log)
                if data["followers"] >= 50000 and data["is_brand_page"]:
                    log.append(f"[✔] {user} qualifies ({data['followers']} followers)")
                    results.append(data)
                else:
                    log.append(f"[✖] {user} skipped (followers={data['followers']}, brand={data['is_brand_page']})")
            except Exception as e:
                log.append(f"[Error scraping {user}] {e}")
    finally:
        # Bug fix: always release the browser. The original only reached
        # bot.quit() on the happy path, leaking the Chrome process whenever
        # login or the following-list fetch raised.
        bot.quit()

    if results:
        df = pd.DataFrame(results)
        csv_path = "brand_following.csv"
        df.to_csv(csv_path, index=False, encoding="utf-8")
        log.append(f"[Saved] {len(results)} brand pages saved to brand_following.csv")
        return "\n".join(log), csv_path
    else:
        log.append("[Info] - No qualifying brand pages found.")
        return "\n".join(log), None
267
+
268
+
269
+ # ---------- Gradio App ----------
270
+
271
# Build the web UI: four inputs feed run_scraper; it returns the log text
# and (optionally) a CSV file path for download.
scraper_inputs = [
    gr.Textbox(label="Instagram Username"),
    gr.Textbox(label="Instagram Password", type="password"),
    gr.Textbox(label="Target Username (whose following to scrape)"),
    gr.Number(label="Limit (e.g. 50)", value=50),
]
scraper_outputs = [
    gr.Textbox(label="Logs (Live Progress)", lines=25),
    gr.File(label="Download CSV (if available)"),
]

iface = gr.Interface(
    fn=run_scraper,
    inputs=scraper_inputs,
    outputs=scraper_outputs,
    title="Instagram Brand Follower Scraper",
    description="Scrape Instagram following list, detect brand pages (with 50k+ followers).",
)

iface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ selenium>=4.0.0
2
+ webdriver-manager>=3.8.0
3
+ pandas>=1.0.0
4
+ gradio