Baskar2005 committed on
Commit
ab568f2
·
verified ·
1 Parent(s): 13bc625

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -36
app.py CHANGED
@@ -1,5 +1,6 @@
1
  from flask import Flask, render_template, request, jsonify
2
  from playwright.sync_api import sync_playwright
 
3
  from concurrent.futures import ThreadPoolExecutor
4
  import time
5
  import os
@@ -9,7 +10,7 @@ app = Flask(__name__)
9
 
10
  # ---------------- CONFIGURATION ---------------- #
11
  SESSION_FILE = "instagram_session.json"
12
- MAX_WORKERS = 3
13
  # ----------------------------------------------- #
14
 
15
  def identify_url_type(url):
@@ -20,7 +21,7 @@ def identify_url_type(url):
20
  if "instagram.com/" in url: return "PROFILE"
21
  return "UNKNOWN"
22
 
23
- # --- HELPER: RECURSIVE SEARCH ---
24
  def find_username_in_json(obj):
25
  if isinstance(obj, dict):
26
  if "owner" in obj and isinstance(obj["owner"], dict):
@@ -40,34 +41,39 @@ def scrape_single_url(url):
40
  if not url or not url.strip(): return None
41
 
42
  with sync_playwright() as p:
43
- # 🔥 STEALTH CONFIGURATION 🔥
44
- # 1. Hide the "Automation" flag
45
- # 2. Force Headless=True (Required for Server Stability)
46
  browser = p.chromium.launch(
47
- headless=True,
48
- args=["--disable-blink-features=AutomationControlled"]
 
 
 
 
49
  )
50
 
51
- context = browser.new_context(
52
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
53
- viewport={"width": 1280, "height": 720},
54
- storage_state=SESSION_FILE if os.path.exists(SESSION_FILE) else None
55
- )
56
-
57
- # 3. Inject "Real Human" User-Agent (Windows Chrome)
58
- # This prevents the "N/A" error by tricking Instagram
59
  context_args = {
60
- "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
61
- "viewport": {"width": 1280, "height": 720},
62
- "locale": "en-US"
 
63
  }
64
-
65
  if os.path.exists(SESSION_FILE):
66
- context = browser.new_context(storage_state=SESSION_FILE, **context_args)
 
 
 
 
67
  else:
68
  context = browser.new_context(**context_args)
69
 
70
  page = context.new_page()
 
 
 
 
71
  print(f"⚡ Processing: {url}")
72
 
73
  data = {
@@ -86,9 +92,26 @@ def scrape_single_url(url):
86
  return data
87
 
88
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  if data["type"] == "PROFILE":
90
- page.goto(url, wait_until="domcontentloaded", timeout=60000)
91
- time.sleep(3)
92
  try:
93
  followers_link = page.locator("a[href*='/followers/']").first
94
  if followers_link.count() > 0:
@@ -98,9 +121,12 @@ def scrape_single_url(url):
98
  else:
99
  data["followers"] = followers_link.inner_text().split("\n")[0]
100
  except: pass
101
- data["author"] = url.strip("/").split("/")[-1]
 
 
102
  data["status"] = "Success"
103
 
 
104
  elif data["type"] in ["REEL", "POST"]:
105
  if "/reel/" in url:
106
  shortcode = url.split("/reel/")[1].split("/")[0]
@@ -119,10 +145,11 @@ def scrape_single_url(url):
119
  except: pass
120
 
121
  page.on("response", handle_response)
122
- page.goto(url, wait_until="domcontentloaded", timeout=60000)
123
- time.sleep(4)
124
  page.remove_listener("response", handle_response)
125
 
 
126
  try:
127
  meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
128
  if meta_desc:
@@ -130,6 +157,7 @@ def scrape_single_url(url):
130
  if likes_match: data["likes"] = likes_match.group(1)
131
  except: pass
132
 
 
133
  if captured_info["username"]:
134
  data["author"] = captured_info["username"]
135
 
@@ -138,36 +166,33 @@ def scrape_single_url(url):
138
  title = page.title()
139
  match = re.search(r'\(@(.*?)\)', title)
140
  if match: data["author"] = match.group(1)
141
- else:
142
- match_b = re.search(r'^(.*?)\son\sInstagram', title)
143
- if match_b:
144
- parts = match_b.group(1).split(" ")
145
- if len(parts) == 1: data["author"] = parts[0]
146
  except: pass
147
-
148
  if not data["author"]:
149
  try:
150
  links = page.locator("a[href*='/reels/']").all()
151
  for link in links:
152
  href = link.get_attribute("href")
153
- if href:
154
  parts = href.strip("/").split("/")
155
- if len(parts) >= 2 and parts[-1] == "reels":
156
  candidate = parts[-2]
157
  if candidate not in ["reels", "instagram"]:
158
  data["author"] = candidate
159
  break
160
  except: pass
161
 
 
162
  if data["author"]:
163
  is_video = False
 
164
  try:
165
  og_type = page.locator('meta[property="og:type"]').get_attribute("content")
166
  if og_type and "video" in og_type: is_video = True
167
  except: pass
168
- if data["type"] == "REEL": is_video = True
169
 
170
  if is_video:
 
171
  profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
172
  page.goto(profile_reels_url, wait_until="domcontentloaded")
173
  time.sleep(3)
@@ -177,7 +202,8 @@ def scrape_single_url(url):
177
  else:
178
  try:
179
  target_selector = f"a[href*='{shortcode}']"
180
- page.wait_for_selector(target_selector, timeout=8000)
 
181
  target_card = page.locator(target_selector).first
182
  card_text = target_card.inner_text()
183
  for line in card_text.split('\n'):
@@ -189,6 +215,7 @@ def scrape_single_url(url):
189
  else:
190
  data["views"] = "N/A (Photo)"
191
 
 
192
  try:
193
  fol_link = page.locator("a[href*='/followers/']").first
194
  if fol_link.count() > 0:
@@ -208,6 +235,7 @@ def scrape_single_url(url):
208
  browser.close()
209
  return data
210
 
 
211
  @app.route('/')
212
  def home():
213
  return render_template('index.html')
@@ -243,5 +271,5 @@ def scrape_api():
243
  return jsonify(results)
244
 
245
  if __name__ == '__main__':
246
- # HUGGING FACE REQUIRES PORT 7860
247
  app.run(host='0.0.0.0', port=7860)
 
1
  from flask import Flask, render_template, request, jsonify
2
  from playwright.sync_api import sync_playwright
3
+ from playwright_stealth import stealth_sync
4
  from concurrent.futures import ThreadPoolExecutor
5
  import time
6
  import os
 
10
 
11
  # ---------------- CONFIGURATION ---------------- #
12
  SESSION_FILE = "instagram_session.json"
13
+ MAX_WORKERS = 3 # Keep low for free tier servers
14
  # ----------------------------------------------- #
15
 
16
  def identify_url_type(url):
 
21
  if "instagram.com/" in url: return "PROFILE"
22
  return "UNKNOWN"
23
 
24
+ # --- HELPER: RECURSIVE SEARCH (Deep Search for Author) ---
25
  def find_username_in_json(obj):
26
  if isinstance(obj, dict):
27
  if "owner" in obj and isinstance(obj["owner"], dict):
 
41
  if not url or not url.strip(): return None
42
 
43
  with sync_playwright() as p:
44
+ # 1. LAUNCH BROWSER (Headless + Anti-Detect Args)
 
 
45
  browser = p.chromium.launch(
46
+ headless=True,
47
+ args=[
48
+ "--disable-blink-features=AutomationControlled",
49
+ "--no-sandbox",
50
+ "--disable-dev-shm-usage"
51
+ ]
52
  )
53
 
54
+ # 2. CONFIGURE CONTEXT (Windows 10 Fingerprint)
55
+ # We try to load session, but if it fails/blocks, we continue as Guest
 
 
 
 
 
 
56
  context_args = {
57
+ "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
58
+ "viewport": {"width": 1920, "height": 1080},
59
+ "locale": "en-US",
60
+ "timezone_id": "America/New_York"
61
  }
62
+
63
  if os.path.exists(SESSION_FILE):
64
+ try:
65
+ context = browser.new_context(storage_state=SESSION_FILE, **context_args)
66
+ except:
67
+ print("⚠️ Session file corrupt or incompatible. Starting as Guest.")
68
+ context = browser.new_context(**context_args)
69
  else:
70
  context = browser.new_context(**context_args)
71
 
72
  page = context.new_page()
73
+
74
+ # 3. APPLY STEALTH (Crucial for Hugging Face)
75
+ stealth_sync(page)
76
+
77
  print(f"⚡ Processing: {url}")
78
 
79
  data = {
 
92
  return data
93
 
94
  try:
95
+ # === NAVIGATION ===
96
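+ # Long timeout; wait_until="commit" returns as soon as the response is received and the document starts loading (the page is NOT guaranteed to be fully loaded yet)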
97
+ page.goto(url, wait_until="commit", timeout=60000)
98
+
99
+ # 4. CHECK FOR LOGIN WALL
100
+ time.sleep(4)
101
+ page_title = page.title()
102
+
103
+ if "Login" in page_title or "Instagram" == page_title:
104
+ # Sometimes just "Instagram" means it loaded the login screen, not content
105
+ # We do a quick check for the login form (username input) to confirm the block
106
+ if page.locator("input[name='username']").count() > 0:
107
+ data["status"] = "Failed (Login Block)"
108
+ print(" ⚠️ Blocked by Login Wall")
109
+ browser.close()
110
+ return data
111
+
112
+ # === PATH A: PROFILE ===
113
  if data["type"] == "PROFILE":
114
+ time.sleep(2)
 
115
  try:
116
  followers_link = page.locator("a[href*='/followers/']").first
117
  if followers_link.count() > 0:
 
121
  else:
122
  data["followers"] = followers_link.inner_text().split("\n")[0]
123
  except: pass
124
+
125
+ if not data["author"]:
126
+ data["author"] = url.strip("/").split("/")[-1]
127
  data["status"] = "Success"
128
 
129
+ # === PATH B: MEDIA (REEL/POST) ===
130
  elif data["type"] in ["REEL", "POST"]:
131
  if "/reel/" in url:
132
  shortcode = url.split("/reel/")[1].split("/")[0]
 
145
  except: pass
146
 
147
  page.on("response", handle_response)
148
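+ # No reload is performed here: just wait for late network responses (the listener is attached after the initial page.goto, so earlier responses are not captured)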
149
+ time.sleep(3)
150
  page.remove_listener("response", handle_response)
151
 
152
+ # Get Likes
153
  try:
154
  meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
155
  if meta_desc:
 
157
  if likes_match: data["likes"] = likes_match.group(1)
158
  except: pass
159
 
160
+ # Get Author
161
  if captured_info["username"]:
162
  data["author"] = captured_info["username"]
163
 
 
166
  title = page.title()
167
  match = re.search(r'\(@(.*?)\)', title)
168
  if match: data["author"] = match.group(1)
 
 
 
 
 
169
  except: pass
170
+
171
  if not data["author"]:
172
  try:
173
  links = page.locator("a[href*='/reels/']").all()
174
  for link in links:
175
  href = link.get_attribute("href")
176
+ if href and "/reels/" in href:
177
  parts = href.strip("/").split("/")
178
+ if len(parts) >= 2:
179
  candidate = parts[-2]
180
  if candidate not in ["reels", "instagram"]:
181
  data["author"] = candidate
182
  break
183
  except: pass
184
 
185
+ # Get Views (Video Only)
186
  if data["author"]:
187
  is_video = False
188
+ if data["type"] == "REEL": is_video = True
189
  try:
190
  og_type = page.locator('meta[property="og:type"]').get_attribute("content")
191
  if og_type and "video" in og_type: is_video = True
192
  except: pass
 
193
 
194
  if is_video:
195
+ # Hop to Reels Tab
196
  profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
197
  page.goto(profile_reels_url, wait_until="domcontentloaded")
198
  time.sleep(3)
 
202
  else:
203
  try:
204
  target_selector = f"a[href*='{shortcode}']"
205
+ # Wait a bit for grid to load
206
+ page.wait_for_selector(target_selector, timeout=5000)
207
  target_card = page.locator(target_selector).first
208
  card_text = target_card.inner_text()
209
  for line in card_text.split('\n'):
 
215
  else:
216
  data["views"] = "N/A (Photo)"
217
 
218
+ # Bonus: Get Followers
219
  try:
220
  fol_link = page.locator("a[href*='/followers/']").first
221
  if fol_link.count() > 0:
 
235
  browser.close()
236
  return data
237
 
238
+ # --- ROUTES ---
239
  @app.route('/')
240
  def home():
241
  return render_template('index.html')
 
271
  return jsonify(results)
272
 
273
  if __name__ == '__main__':
274
+ # HUGGING FACE PORT
275
  app.run(host='0.0.0.0', port=7860)