Baskar2005 committed on
Commit
0024ef8
·
verified ·
1 Parent(s): 62457c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -26
app.py CHANGED
@@ -20,16 +20,19 @@ def identify_url_type(url):
20
  if "instagram.com/" in url: return "PROFILE"
21
  return "UNKNOWN"
22
 
23
- # 🔥 MANUAL STEALTH: Hides "Headless" status from Instagram
24
  def apply_stealth(page):
 
 
 
 
25
  page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
26
  page.add_init_script("window.navigator.chrome = { runtime: {} };")
27
  page.add_init_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]})")
28
  page.add_init_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
29
 
30
- # --- DATA HELPER ---
31
  def safe_find_key(obj, key):
32
- """Recursively searches for a key in nested JSON."""
33
  if isinstance(obj, dict):
34
  if key in obj: return obj[key]
35
  for k, v in obj.items():
@@ -45,13 +48,13 @@ def scrape_single_url(url):
45
  if not url or not url.strip(): return None
46
 
47
  with sync_playwright() as p:
48
- # 1. LAUNCH BROWSER (Optimized for Server)
49
  browser = p.chromium.launch(
50
  headless=True,
51
  args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"]
52
  )
53
 
54
- # 2. CONTEXT (Mobile User Agent = Easier Data Access)
55
  context = browser.new_context(
56
  user_agent="Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36",
57
  viewport={"width": 412, "height": 915},
@@ -73,22 +76,19 @@ def scrape_single_url(url):
73
  "status": "Starting"
74
  }
75
 
76
- # --- 3. NETWORK SNIFFER SETUP ---
77
  captured_data = {"play_count": None, "username": None, "like_count": None}
78
 
79
  def handle_response(response):
80
  if "instagram.com" in response.url and ("json" in response.headers.get("content-type", "") or "graphql" in response.url):
81
  try:
82
  json_data = response.json()
83
- # Capture Views/Plays
84
  if not captured_data["play_count"]:
85
  plays = safe_find_key(json_data, "play_count") or safe_find_key(json_data, "video_view_count")
86
  if plays: captured_data["play_count"] = plays
87
- # Capture Likes
88
  if not captured_data["like_count"]:
89
  likes = safe_find_key(json_data, "like_count")
90
  if likes: captured_data["like_count"] = likes
91
- # Capture Author
92
  if not captured_data["username"]:
93
  user = safe_find_key(json_data, "username")
94
  if user: captured_data["username"] = user
@@ -99,27 +99,19 @@ def scrape_single_url(url):
99
  try:
100
  # === NAVIGATION ===
101
  page.goto(url, wait_until="commit", timeout=45000)
102
- page.wait_for_timeout(5000) # Wait for network packets
103
 
104
- # 📸 DEBUG: Take screenshot if blocked
105
- if "Login" in page.title() or "Page Not Found" in page.title():
106
- print(" ⚠️ Blocked! Saving debug_error.png")
107
- page.screenshot(path="debug_error.png")
108
  data["status"] = "Failed (Login Block)"
109
  browser.close()
110
  return data
111
 
112
- # Fill data from Network Sniffer
113
  if captured_data["play_count"]: data["views"] = str(captured_data["play_count"])
114
  if captured_data["like_count"]: data["likes"] = str(captured_data["like_count"])
115
  if captured_data["username"]: data["author"] = captured_data["username"]
116
 
117
  # --- 4. FALLBACK: VISUAL SCRAPING ---
118
- # If network failed, try reading the screen
119
  if (data["views"] == "N/A" and data["type"] == "REEL") or not data["author"]:
120
- print(" ⚠️ Network missed data. Switching to Visual Scraping...")
121
-
122
- # Get Author from Title if missing
123
  if not data["author"]:
124
  try:
125
  title = page.title()
@@ -127,13 +119,11 @@ def scrape_single_url(url):
127
  if match: data["author"] = match.group(1)
128
  except: pass
129
 
130
- # Go to Profile for Followers & Views
131
  if data["author"]:
132
  if "/reels/" not in page.url:
133
  page.goto(f"https://www.instagram.com/{data['author']}/reels/", wait_until="domcontentloaded")
134
  page.wait_for_timeout(3000)
135
 
136
- # Try to find Followers (Meta Description)
137
  try:
138
  meta = page.locator('meta[property="og:description"]').get_attribute("content")
139
  if meta:
@@ -141,7 +131,6 @@ def scrape_single_url(url):
141
  if len(parts) > 1: data["followers"] = parts[0].strip().split(" ")[-1]
142
  except: pass
143
 
144
- # Try to find View Count on Grid
145
  if data["views"] == "N/A":
146
  try:
147
  shortcode = url.split("/reel/")[1].split("/")[0]
@@ -159,8 +148,6 @@ def scrape_single_url(url):
159
  except Exception as e:
160
  data["status"] = "Error"
161
  print(f"❌ Error: {e}")
162
- try: page.screenshot(path="debug_crash.png")
163
- except: pass
164
 
165
  browser.close()
166
  return data
@@ -199,5 +186,6 @@ def scrape_api():
199
  return jsonify(results)
200
 
201
  if __name__ == '__main__':
202
-
203
- app.run(host='0.0.0.0', port="10000")
 
 
20
  if "instagram.com/" in url: return "PROFILE"
21
  return "UNKNOWN"
22
 
23
+ # 🔥 MANUAL STEALTH: The Key to Headless=True 🔥
24
  def apply_stealth(page):
25
+ """
26
+ Overrides navigator properties (webdriver, chrome, plugins, languages)
27
+ so Instagram is less likely to detect an automated headless browser.
28
+ """
29
  page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
30
  page.add_init_script("window.navigator.chrome = { runtime: {} };")
31
  page.add_init_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]})")
32
  page.add_init_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
33
 
34
+ # --- HELPER: RECURSIVE SEARCH ---
35
  def safe_find_key(obj, key):
 
36
  if isinstance(obj, dict):
37
  if key in obj: return obj[key]
38
  for k, v in obj.items():
 
48
  if not url or not url.strip(): return None
49
 
50
  with sync_playwright() as p:
51
+ # 1. LAUNCH BROWSER (Headless=True is REQUIRED for Server)
52
  browser = p.chromium.launch(
53
  headless=True,
54
  args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"]
55
  )
56
 
57
+ # 2. CONFIGURE CONTEXT (Fake Android Phone)
58
  context = browser.new_context(
59
  user_agent="Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36",
60
  viewport={"width": 412, "height": 915},
 
76
  "status": "Starting"
77
  }
78
 
79
+ # --- 3. NETWORK SNIFFER ---
80
  captured_data = {"play_count": None, "username": None, "like_count": None}
81
 
82
  def handle_response(response):
83
  if "instagram.com" in response.url and ("json" in response.headers.get("content-type", "") or "graphql" in response.url):
84
  try:
85
  json_data = response.json()
 
86
  if not captured_data["play_count"]:
87
  plays = safe_find_key(json_data, "play_count") or safe_find_key(json_data, "video_view_count")
88
  if plays: captured_data["play_count"] = plays
 
89
  if not captured_data["like_count"]:
90
  likes = safe_find_key(json_data, "like_count")
91
  if likes: captured_data["like_count"] = likes
 
92
  if not captured_data["username"]:
93
  user = safe_find_key(json_data, "username")
94
  if user: captured_data["username"] = user
 
99
  try:
100
  # === NAVIGATION ===
101
  page.goto(url, wait_until="commit", timeout=45000)
102
+ page.wait_for_timeout(5000)
103
 
104
+ if "Login" in page.title():
 
 
 
105
  data["status"] = "Failed (Login Block)"
106
  browser.close()
107
  return data
108
 
 
109
  if captured_data["play_count"]: data["views"] = str(captured_data["play_count"])
110
  if captured_data["like_count"]: data["likes"] = str(captured_data["like_count"])
111
  if captured_data["username"]: data["author"] = captured_data["username"]
112
 
113
  # --- 4. FALLBACK: VISUAL SCRAPING ---
 
114
  if (data["views"] == "N/A" and data["type"] == "REEL") or not data["author"]:
 
 
 
115
  if not data["author"]:
116
  try:
117
  title = page.title()
 
119
  if match: data["author"] = match.group(1)
120
  except: pass
121
 
 
122
  if data["author"]:
123
  if "/reels/" not in page.url:
124
  page.goto(f"https://www.instagram.com/{data['author']}/reels/", wait_until="domcontentloaded")
125
  page.wait_for_timeout(3000)
126
 
 
127
  try:
128
  meta = page.locator('meta[property="og:description"]').get_attribute("content")
129
  if meta:
 
131
  if len(parts) > 1: data["followers"] = parts[0].strip().split(" ")[-1]
132
  except: pass
133
 
 
134
  if data["views"] == "N/A":
135
  try:
136
  shortcode = url.split("/reel/")[1].split("/")[0]
 
148
  except Exception as e:
149
  data["status"] = "Error"
150
  print(f"❌ Error: {e}")
 
 
151
 
152
  browser.close()
153
  return data
 
186
  return jsonify(results)
187
 
188
  if __name__ == '__main__':
189
+ # Use the PORT environment variable, defaulting to 10000 when it is unset
190
+ port = int(os.environ.get("PORT", 10000))
191
+ app.run(host='0.0.0.0', port=port)