Baskar2005 committed on
Commit
27c717b
·
verified ·
1 Parent(s): 98d9557

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -52
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from flask import Flask, render_template, request, jsonify
2
  from playwright.sync_api import sync_playwright
3
- from playwright_stealth import stealth_sync
4
  from concurrent.futures import ThreadPoolExecutor
5
  import time
6
  import os
@@ -10,7 +10,7 @@ app = Flask(__name__)
10
 
11
  # ---------------- CONFIGURATION ---------------- #
12
  SESSION_FILE = "instagram_session.json"
13
- MAX_WORKERS = 3 # Keep low for free tier servers
14
  # ----------------------------------------------- #
15
 
16
  def identify_url_type(url):
@@ -21,7 +21,38 @@ def identify_url_type(url):
21
  if "instagram.com/" in url: return "PROFILE"
22
  return "UNKNOWN"
23
 
24
- # --- HELPER: RECURSIVE SEARCH (Deep Search for Author) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def find_username_in_json(obj):
26
  if isinstance(obj, dict):
27
  if "owner" in obj and isinstance(obj["owner"], dict):
@@ -41,7 +72,7 @@ def scrape_single_url(url):
41
  if not url or not url.strip(): return None
42
 
43
  with sync_playwright() as p:
44
- # 1. LAUNCH BROWSER (Headless + Anti-Detect Args)
45
  browser = p.chromium.launch(
46
  headless=True,
47
  args=[
@@ -51,8 +82,7 @@ def scrape_single_url(url):
51
  ]
52
  )
53
 
54
- # 2. CONFIGURE CONTEXT (Windows 10 Fingerprint)
55
- # We try to load session, but if it fails/blocks, we continue as Guest
56
  context_args = {
57
  "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
58
  "viewport": {"width": 1920, "height": 1080},
@@ -60,19 +90,20 @@ def scrape_single_url(url):
60
  "timezone_id": "America/New_York"
61
  }
62
 
 
63
  if os.path.exists(SESSION_FILE):
64
  try:
65
  context = browser.new_context(storage_state=SESSION_FILE, **context_args)
66
  except:
67
- print("⚠️ Session file corrupt or incompatible. Starting as Guest.")
68
  context = browser.new_context(**context_args)
69
  else:
70
  context = browser.new_context(**context_args)
71
 
72
  page = context.new_page()
73
 
74
- # 3. APPLY STEALTH (Crucial for Hugging Face)
75
- stealth_sync(page)
76
 
77
  print(f"⚡ Processing: {url}")
78
 
@@ -93,21 +124,14 @@ def scrape_single_url(url):
93
 
94
  try:
95
  # === NAVIGATION ===
96
- # Long timeout + "commit" wait ensuring page load
97
  page.goto(url, wait_until="commit", timeout=60000)
98
 
99
- # 4. CHECK FOR LOGIN WALL
100
  time.sleep(4)
101
- page_title = page.title()
102
-
103
- if "Login" in page_title or "Instagram" == page_title:
104
- # Sometimes just "Instagram" means it loaded the login screen, not content
105
- # We do a quick check for content
106
- if page.locator("input[name='username']").count() > 0:
107
- data["status"] = "Failed (Login Block)"
108
- print(" ⚠️ Blocked by Login Wall")
109
- browser.close()
110
- return data
111
 
112
  # === PATH A: PROFILE ===
113
  if data["type"] == "PROFILE":
@@ -145,11 +169,10 @@ def scrape_single_url(url):
145
  except: pass
146
 
147
  page.on("response", handle_response)
148
- # Reload to trigger network requests if needed, or just wait
149
  time.sleep(3)
150
  page.remove_listener("response", handle_response)
151
 
152
- # Get Likes
153
  try:
154
  meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
155
  if meta_desc:
@@ -157,7 +180,7 @@ def scrape_single_url(url):
157
  if likes_match: data["likes"] = likes_match.group(1)
158
  except: pass
159
 
160
- # Get Author
161
  if captured_info["username"]:
162
  data["author"] = captured_info["username"]
163
 
@@ -175,36 +198,25 @@ def scrape_single_url(url):
175
  href = link.get_attribute("href")
176
  if href and "/reels/" in href:
177
  parts = href.strip("/").split("/")
178
- if len(parts) >= 2:
179
- candidate = parts[-2]
180
- if candidate not in ["reels", "instagram"]:
181
- data["author"] = candidate
182
- break
183
  except: pass
184
 
185
- # Get Views (Video Only)
186
  if data["author"]:
187
- is_video = False
188
- if data["type"] == "REEL": is_video = True
189
  try:
190
- og_type = page.locator('meta[property="og:type"]').get_attribute("content")
191
- if og_type and "video" in og_type: is_video = True
192
  except: pass
193
 
194
  if is_video:
195
- # Hop to Reels Tab
196
- profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
197
- page.goto(profile_reels_url, wait_until="domcontentloaded")
198
  time.sleep(3)
199
 
200
- if "/reels/" not in page.url:
201
- data["views"] = "Hidden (Main Grid)"
202
- else:
203
  try:
204
- target_selector = f"a[href*='{shortcode}']"
205
- # Wait a bit for grid to load
206
- page.wait_for_selector(target_selector, timeout=5000)
207
- target_card = page.locator(target_selector).first
208
  card_text = target_card.inner_text()
209
  for line in card_text.split('\n'):
210
  if any(char.isdigit() for char in line):
@@ -215,13 +227,12 @@ def scrape_single_url(url):
215
  else:
216
  data["views"] = "N/A (Photo)"
217
 
218
- # Bonus: Get Followers
219
  try:
220
  fol_link = page.locator("a[href*='/followers/']").first
221
  if fol_link.count() > 0:
222
- title = fol_link.locator("span[title]").first
223
- if title.count() > 0:
224
- data["followers"] = title.get_attribute("title")
225
  except: pass
226
 
227
  data["status"] = "Success"
@@ -244,8 +255,8 @@ def home():
244
  def scrape_api():
245
  data = request.json
246
  raw_urls = data.get('urls', [])
247
-
248
  final_urls = []
 
249
  if isinstance(raw_urls, list):
250
  raw_string = ",".join(raw_urls)
251
  else:
@@ -260,8 +271,7 @@ def scrape_api():
260
  if not final_urls:
261
  return jsonify({"error": "No valid URLs provided"}), 400
262
 
263
- print(f"🔥 API Request: Processing {len(final_urls)} links...")
264
-
265
  results = []
266
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
267
  results_iterator = executor.map(scrape_single_url, final_urls)
@@ -271,5 +281,4 @@ def scrape_api():
271
  return jsonify(results)
272
 
273
  if __name__ == '__main__':
274
- # HUGGING FACE PORT
275
  app.run(host='0.0.0.0', port=7860)
 
1
  from flask import Flask, render_template, request, jsonify
2
  from playwright.sync_api import sync_playwright
3
+ # REMOVED: from playwright_stealth import stealth_sync (Use manual function below instead)
4
  from concurrent.futures import ThreadPoolExecutor
5
  import time
6
  import os
 
10
 
11
  # ---------------- CONFIGURATION ---------------- #
12
  SESSION_FILE = "instagram_session.json"
13
+ MAX_WORKERS = 3
14
  # ----------------------------------------------- #
15
 
16
  def identify_url_type(url):
 
21
  if "instagram.com/" in url: return "PROFILE"
22
  return "UNKNOWN"
23
 
24
+ # --- HELPER: MANUAL STEALTH MODE (Fixes ImportError) ---
25
+ def apply_stealth(page):
26
+ """
27
+ Manually hides 'navigator.webdriver' and other bot flags
28
+ so Instagram thinks this is a real browser.
29
+ """
30
+ # 1. Hide the WebDriver flag
31
+ page.add_init_script("""
32
+ Object.defineProperty(navigator, 'webdriver', {
33
+ get: () => undefined
34
+ });
35
+ """)
36
+ # 2. Mock Chrome runtime
37
+ page.add_init_script("""
38
+ window.navigator.chrome = {
39
+ runtime: {}
40
+ };
41
+ """)
42
+ # 3. Mock Plugins (Bots usually have 0)
43
+ page.add_init_script("""
44
+ Object.defineProperty(navigator, 'plugins', {
45
+ get: () => [1, 2, 3, 4, 5]
46
+ });
47
+ """)
48
+ # 4. Mock Languages
49
+ page.add_init_script("""
50
+ Object.defineProperty(navigator, 'languages', {
51
+ get: () => ['en-US', 'en']
52
+ });
53
+ """)
54
+
55
+ # --- HELPER: RECURSIVE SEARCH ---
56
  def find_username_in_json(obj):
57
  if isinstance(obj, dict):
58
  if "owner" in obj and isinstance(obj["owner"], dict):
 
72
  if not url or not url.strip(): return None
73
 
74
  with sync_playwright() as p:
75
+ # 1. LAUNCH BROWSER
76
  browser = p.chromium.launch(
77
  headless=True,
78
  args=[
 
82
  ]
83
  )
84
 
85
+ # 2. CONFIGURE CONTEXT
 
86
  context_args = {
87
  "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
88
  "viewport": {"width": 1920, "height": 1080},
 
90
  "timezone_id": "America/New_York"
91
  }
92
 
93
+ # Try to load session (if exists)
94
  if os.path.exists(SESSION_FILE):
95
  try:
96
  context = browser.new_context(storage_state=SESSION_FILE, **context_args)
97
  except:
98
+ print("⚠️ Session corrupted. Guest mode.")
99
  context = browser.new_context(**context_args)
100
  else:
101
  context = browser.new_context(**context_args)
102
 
103
  page = context.new_page()
104
 
105
+ # 3. APPLY MANUAL STEALTH (Replaces the library)
106
+ apply_stealth(page)
107
 
108
  print(f"⚡ Processing: {url}")
109
 
 
124
 
125
  try:
126
  # === NAVIGATION ===
 
127
  page.goto(url, wait_until="commit", timeout=60000)
128
 
129
+ # Check Login Wall
130
  time.sleep(4)
131
+ if "Login" in page.title() or page.locator("input[name='username']").count() > 0:
132
+ data["status"] = "Failed (Login Block)"
133
+ browser.close()
134
+ return data
 
 
 
 
 
 
135
 
136
  # === PATH A: PROFILE ===
137
  if data["type"] == "PROFILE":
 
169
  except: pass
170
 
171
  page.on("response", handle_response)
 
172
  time.sleep(3)
173
  page.remove_listener("response", handle_response)
174
 
175
+ # Likes
176
  try:
177
  meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
178
  if meta_desc:
 
180
  if likes_match: data["likes"] = likes_match.group(1)
181
  except: pass
182
 
183
+ # Author
184
  if captured_info["username"]:
185
  data["author"] = captured_info["username"]
186
 
 
198
  href = link.get_attribute("href")
199
  if href and "/reels/" in href:
200
  parts = href.strip("/").split("/")
201
+ if len(parts) >= 2 and parts[-1] == "reels":
202
+ data["author"] = parts[-2]
203
+ break
 
 
204
  except: pass
205
 
206
+ # Views
207
  if data["author"]:
208
+ is_video = (data["type"] == "REEL")
 
209
  try:
210
+ if "video" in page.locator('meta[property="og:type"]').get_attribute("content"): is_video = True
 
211
  except: pass
212
 
213
  if is_video:
214
+ page.goto(f"https://www.instagram.com/{data['author']}/reels/", wait_until="domcontentloaded")
 
 
215
  time.sleep(3)
216
 
217
+ if "/reels/" in page.url:
 
 
218
  try:
219
+ target_card = page.locator(f"a[href*='{shortcode}']").first
 
 
 
220
  card_text = target_card.inner_text()
221
  for line in card_text.split('\n'):
222
  if any(char.isdigit() for char in line):
 
227
  else:
228
  data["views"] = "N/A (Photo)"
229
 
230
+ # Followers (Bonus)
231
  try:
232
  fol_link = page.locator("a[href*='/followers/']").first
233
  if fol_link.count() > 0:
234
+ t = fol_link.locator("span[title]").first
235
+ data["followers"] = t.get_attribute("title")
 
236
  except: pass
237
 
238
  data["status"] = "Success"
 
255
  def scrape_api():
256
  data = request.json
257
  raw_urls = data.get('urls', [])
 
258
  final_urls = []
259
+
260
  if isinstance(raw_urls, list):
261
  raw_string = ",".join(raw_urls)
262
  else:
 
271
  if not final_urls:
272
  return jsonify({"error": "No valid URLs provided"}), 400
273
 
274
+ print(f"🔥 Processing {len(final_urls)} links...")
 
275
  results = []
276
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
277
  results_iterator = executor.map(scrape_single_url, final_urls)
 
281
  return jsonify(results)
282
 
283
  if __name__ == '__main__':
 
284
  app.run(host='0.0.0.0', port=7860)