Baskar2005 committed on
Commit
122e402
·
verified ·
1 Parent(s): 4466069

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -39
app.py CHANGED
@@ -1,6 +1,5 @@
1
  from flask import Flask, render_template, request, jsonify
2
  from playwright.sync_api import sync_playwright
3
- # REMOVED: from playwright_stealth import stealth_sync (Use manual function below instead)
4
  from concurrent.futures import ThreadPoolExecutor
5
  import time
6
  import os
@@ -10,7 +9,7 @@ app = Flask(__name__)
10
 
11
  # ---------------- CONFIGURATION ---------------- #
12
  SESSION_FILE = "instagram_session.json"
13
- MAX_WORKERS = 3
14
  # ----------------------------------------------- #
15
 
16
  def identify_url_type(url):
@@ -21,36 +20,23 @@ def identify_url_type(url):
21
  if "instagram.com/" in url: return "PROFILE"
22
  return "UNKNOWN"
23
 
24
- # --- HELPER: MANUAL STEALTH MODE (Fixes ImportError) ---
25
  def apply_stealth(page):
26
  """
27
- Manually hides 'navigator.webdriver' and other bot flags
28
- so Instagram thinks this is a real browser.
29
  """
30
- # 1. Hide the WebDriver flag
31
- page.add_init_script("""
32
- Object.defineProperty(navigator, 'webdriver', {
33
- get: () => undefined
34
- });
35
- """)
36
- # 2. Mock Chrome runtime
37
- page.add_init_script("""
38
- window.navigator.chrome = {
39
- runtime: {}
40
- };
41
- """)
42
- # 3. Mock Plugins (Bots usually have 0)
43
- page.add_init_script("""
44
- Object.defineProperty(navigator, 'plugins', {
45
- get: () => [1, 2, 3, 4, 5]
46
- });
47
- """)
48
  # 4. Mock Languages
49
- page.add_init_script("""
50
- Object.defineProperty(navigator, 'languages', {
51
- get: () => ['en-US', 'en']
52
- });
53
- """)
54
 
55
  # --- HELPER: RECURSIVE SEARCH ---
56
  def find_username_in_json(obj):
@@ -72,37 +58,37 @@ def scrape_single_url(url):
72
  if not url or not url.strip(): return None
73
 
74
  with sync_playwright() as p:
75
- # 1. LAUNCH BROWSER
76
  browser = p.chromium.launch(
77
  headless=True,
78
  args=[
79
- "--disable-blink-features=AutomationControlled",
80
  "--no-sandbox",
81
  "--disable-dev-shm-usage"
82
  ]
83
  )
84
 
85
- # 2. CONFIGURE CONTEXT
86
  context_args = {
87
- "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
88
  "viewport": {"width": 1920, "height": 1080},
89
  "locale": "en-US",
90
  "timezone_id": "America/New_York"
91
  }
92
-
93
- # Try to load session (if exists)
94
  if os.path.exists(SESSION_FILE):
95
  try:
96
  context = browser.new_context(storage_state=SESSION_FILE, **context_args)
97
  except:
98
- print("⚠️ Session corrupted. Guest mode.")
99
  context = browser.new_context(**context_args)
100
  else:
101
  context = browser.new_context(**context_args)
102
 
103
  page = context.new_page()
104
 
105
- # 3. APPLY MANUAL STEALTH (Replaces the library)
106
  apply_stealth(page)
107
 
108
  print(f"⚡ Processing: {url}")
@@ -128,7 +114,7 @@ def scrape_single_url(url):
128
 
129
  # Check Login Wall
130
  time.sleep(4)
131
- if "Login" in page.title() or page.locator("input[name='username']").count() > 0:
132
  data["status"] = "Failed (Login Block)"
133
  browser.close()
134
  return data
@@ -190,7 +176,7 @@ def scrape_single_url(url):
190
  match = re.search(r'\(@(.*?)\)', title)
191
  if match: data["author"] = match.group(1)
192
  except: pass
193
-
194
  if not data["author"]:
195
  try:
196
  links = page.locator("a[href*='/reels/']").all()
@@ -246,7 +232,6 @@ def scrape_single_url(url):
246
  browser.close()
247
  return data
248
 
249
- # --- ROUTES ---
250
  @app.route('/')
251
  def home():
252
  return render_template('index.html')
 
1
  from flask import Flask, render_template, request, jsonify
2
  from playwright.sync_api import sync_playwright
 
3
  from concurrent.futures import ThreadPoolExecutor
4
  import time
5
  import os
 
9
 
10
  # ---------------- CONFIGURATION ---------------- #
11
  SESSION_FILE = "instagram_session.json"
12
+ MAX_WORKERS = 3
13
  # ----------------------------------------------- #
14
 
15
  def identify_url_type(url):
 
20
  if "instagram.com/" in url: return "PROFILE"
21
  return "UNKNOWN"
22
 
23
+ # --- HELPER: MANUAL STEALTH (The Magic Fix) ---
24
  def apply_stealth(page):
25
  """
26
+ Manually overrides browser variables to hide 'Headless' status.
27
+ This replaces the broken 'playwright-stealth' library.
28
  """
29
+ # 1. Hide WebDriver Flag
30
+ page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
31
+
32
+ # 2. Mock Chrome Runtime
33
+ page.add_init_script("window.navigator.chrome = { runtime: {} };")
34
+
35
+ # 3. Mock Plugins (Headless browsers have 0, Humans have many)
36
+ page.add_init_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]})")
37
+
 
 
 
 
 
 
 
 
 
38
  # 4. Mock Languages
39
+ page.add_init_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
 
 
 
 
40
 
41
  # --- HELPER: RECURSIVE SEARCH ---
42
  def find_username_in_json(obj):
 
58
  if not url or not url.strip(): return None
59
 
60
  with sync_playwright() as p:
61
+ # 1. LAUNCH BROWSER (Headless=True for Server)
62
  browser = p.chromium.launch(
63
  headless=True,
64
  args=[
65
+ "--disable-blink-features=AutomationControlled", # Standard bot hide
66
  "--no-sandbox",
67
  "--disable-dev-shm-usage"
68
  ]
69
  )
70
 
71
+ # 2. CONFIGURE CONTEXT (Windows 10 Fingerprint)
72
  context_args = {
73
+ "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
74
  "viewport": {"width": 1920, "height": 1080},
75
  "locale": "en-US",
76
  "timezone_id": "America/New_York"
77
  }
78
+
79
+ # Load Session if available, else Guest
80
  if os.path.exists(SESSION_FILE):
81
  try:
82
  context = browser.new_context(storage_state=SESSION_FILE, **context_args)
83
  except:
84
+ print("⚠️ Session Corrupt. Switching to Guest Mode.")
85
  context = browser.new_context(**context_args)
86
  else:
87
  context = browser.new_context(**context_args)
88
 
89
  page = context.new_page()
90
 
91
+ # 3. APPLY MANUAL STEALTH
92
  apply_stealth(page)
93
 
94
  print(f"⚡ Processing: {url}")
 
114
 
115
  # Check Login Wall
116
  time.sleep(4)
117
+ if "Login" in page.title():
118
  data["status"] = "Failed (Login Block)"
119
  browser.close()
120
  return data
 
176
  match = re.search(r'\(@(.*?)\)', title)
177
  if match: data["author"] = match.group(1)
178
  except: pass
179
+
180
  if not data["author"]:
181
  try:
182
  links = page.locator("a[href*='/reels/']").all()
 
232
  browser.close()
233
  return data
234
 
 
235
  @app.route('/')
236
  def home():
237
  return render_template('index.html')