Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
from flask import Flask, render_template, request, jsonify
|
| 2 |
from playwright.sync_api import sync_playwright
|
| 3 |
-
# REMOVED: from playwright_stealth import stealth_sync (Use manual function below instead)
|
| 4 |
from concurrent.futures import ThreadPoolExecutor
|
| 5 |
import time
|
| 6 |
import os
|
|
@@ -10,7 +9,7 @@ app = Flask(__name__)
|
|
| 10 |
|
| 11 |
# ---------------- CONFIGURATION ---------------- #
|
| 12 |
SESSION_FILE = "instagram_session.json"
|
| 13 |
-
MAX_WORKERS = 3
|
| 14 |
# ----------------------------------------------- #
|
| 15 |
|
| 16 |
def identify_url_type(url):
|
|
@@ -21,36 +20,23 @@ def identify_url_type(url):
|
|
| 21 |
if "instagram.com/" in url: return "PROFILE"
|
| 22 |
return "UNKNOWN"
|
| 23 |
|
| 24 |
-
# --- HELPER: MANUAL STEALTH
|
| 25 |
def apply_stealth(page):
|
| 26 |
"""
|
| 27 |
-
Manually
|
| 28 |
-
|
| 29 |
"""
|
| 30 |
-
# 1. Hide
|
| 31 |
-
page.add_init_script(""
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
#
|
| 37 |
-
page.add_init_script(""
|
| 38 |
-
|
| 39 |
-
runtime: {}
|
| 40 |
-
};
|
| 41 |
-
""")
|
| 42 |
-
# 3. Mock Plugins (Bots usually have 0)
|
| 43 |
-
page.add_init_script("""
|
| 44 |
-
Object.defineProperty(navigator, 'plugins', {
|
| 45 |
-
get: () => [1, 2, 3, 4, 5]
|
| 46 |
-
});
|
| 47 |
-
""")
|
| 48 |
# 4. Mock Languages
|
| 49 |
-
page.add_init_script(""
|
| 50 |
-
Object.defineProperty(navigator, 'languages', {
|
| 51 |
-
get: () => ['en-US', 'en']
|
| 52 |
-
});
|
| 53 |
-
""")
|
| 54 |
|
| 55 |
# --- HELPER: RECURSIVE SEARCH ---
|
| 56 |
def find_username_in_json(obj):
|
|
@@ -72,37 +58,37 @@ def scrape_single_url(url):
|
|
| 72 |
if not url or not url.strip(): return None
|
| 73 |
|
| 74 |
with sync_playwright() as p:
|
| 75 |
-
# 1. LAUNCH BROWSER
|
| 76 |
browser = p.chromium.launch(
|
| 77 |
headless=True,
|
| 78 |
args=[
|
| 79 |
-
"--disable-blink-features=AutomationControlled",
|
| 80 |
"--no-sandbox",
|
| 81 |
"--disable-dev-shm-usage"
|
| 82 |
]
|
| 83 |
)
|
| 84 |
|
| 85 |
-
# 2. CONFIGURE CONTEXT
|
| 86 |
context_args = {
|
| 87 |
-
"user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
| 88 |
"viewport": {"width": 1920, "height": 1080},
|
| 89 |
"locale": "en-US",
|
| 90 |
"timezone_id": "America/New_York"
|
| 91 |
}
|
| 92 |
-
|
| 93 |
-
#
|
| 94 |
if os.path.exists(SESSION_FILE):
|
| 95 |
try:
|
| 96 |
context = browser.new_context(storage_state=SESSION_FILE, **context_args)
|
| 97 |
except:
|
| 98 |
-
print("⚠️ Session
|
| 99 |
context = browser.new_context(**context_args)
|
| 100 |
else:
|
| 101 |
context = browser.new_context(**context_args)
|
| 102 |
|
| 103 |
page = context.new_page()
|
| 104 |
|
| 105 |
-
# 3. APPLY MANUAL STEALTH
|
| 106 |
apply_stealth(page)
|
| 107 |
|
| 108 |
print(f"⚡ Processing: {url}")
|
|
@@ -128,7 +114,7 @@ def scrape_single_url(url):
|
|
| 128 |
|
| 129 |
# Check Login Wall
|
| 130 |
time.sleep(4)
|
| 131 |
-
if "Login" in page.title()
|
| 132 |
data["status"] = "Failed (Login Block)"
|
| 133 |
browser.close()
|
| 134 |
return data
|
|
@@ -190,7 +176,7 @@ def scrape_single_url(url):
|
|
| 190 |
match = re.search(r'\(@(.*?)\)', title)
|
| 191 |
if match: data["author"] = match.group(1)
|
| 192 |
except: pass
|
| 193 |
-
|
| 194 |
if not data["author"]:
|
| 195 |
try:
|
| 196 |
links = page.locator("a[href*='/reels/']").all()
|
|
@@ -246,7 +232,6 @@ def scrape_single_url(url):
|
|
| 246 |
browser.close()
|
| 247 |
return data
|
| 248 |
|
| 249 |
-
# --- ROUTES ---
|
| 250 |
@app.route('/')
|
| 251 |
def home():
|
| 252 |
return render_template('index.html')
|
|
|
|
| 1 |
from flask import Flask, render_template, request, jsonify
|
| 2 |
from playwright.sync_api import sync_playwright
|
|
|
|
| 3 |
from concurrent.futures import ThreadPoolExecutor
|
| 4 |
import time
|
| 5 |
import os
|
|
|
|
| 9 |
|
| 10 |
# ---------------- CONFIGURATION ---------------- #
|
| 11 |
SESSION_FILE = "instagram_session.json"
|
| 12 |
+
MAX_WORKERS = 3
|
| 13 |
# ----------------------------------------------- #
|
| 14 |
|
| 15 |
def identify_url_type(url):
|
|
|
|
| 20 |
if "instagram.com/" in url: return "PROFILE"
|
| 21 |
return "UNKNOWN"
|
| 22 |
|
| 23 |
+
# --- HELPER: MANUAL STEALTH (The Magic Fix) ---
|
| 24 |
def apply_stealth(page):
    """
    Manually overrides browser variables to hide 'Headless' status.
    This replaces the broken 'playwright-stealth' library.

    Every override is registered with ``page.add_init_script``; nothing is
    evaluated by this function itself, so it should be called on a fresh
    page before the first navigation.

    Args:
        page: Object exposing ``add_init_script(script)`` (a Playwright
            sync ``Page`` in this application).

    Returns:
        None.
    """
    stealth_scripts = (
        # 1. Hide WebDriver Flag
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})",
        # 2. Mock Chrome Runtime
        # Detection scripts probe `window.chrome`, not `navigator.chrome`, so
        # define the global when it is missing. The legacy `navigator.chrome`
        # assignment is kept so existing behavior is a strict subset.
        "if (!window.chrome) { window.chrome = { runtime: {} }; } "
        "window.navigator.chrome = { runtime: {} };",
        # 3. Mock Plugins (Headless browsers have 0, Humans have many)
        "Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]})",
        # 4. Mock Languages
        "Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})",
    )

    # One registration per override, in the same order as before.
    for script in stealth_scripts:
        page.add_init_script(script)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# --- HELPER: RECURSIVE SEARCH ---
|
| 42 |
def find_username_in_json(obj):
|
|
|
|
| 58 |
if not url or not url.strip(): return None
|
| 59 |
|
| 60 |
with sync_playwright() as p:
|
| 61 |
+
# 1. LAUNCH BROWSER (Headless=True for Server)
|
| 62 |
browser = p.chromium.launch(
|
| 63 |
headless=True,
|
| 64 |
args=[
|
| 65 |
+
"--disable-blink-features=AutomationControlled", # Standard bot hide
|
| 66 |
"--no-sandbox",
|
| 67 |
"--disable-dev-shm-usage"
|
| 68 |
]
|
| 69 |
)
|
| 70 |
|
| 71 |
+
# 2. CONFIGURE CONTEXT (Windows 10 Fingerprint)
|
| 72 |
context_args = {
|
| 73 |
+
"user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
|
| 74 |
"viewport": {"width": 1920, "height": 1080},
|
| 75 |
"locale": "en-US",
|
| 76 |
"timezone_id": "America/New_York"
|
| 77 |
}
|
| 78 |
+
|
| 79 |
+
# Load Session if available, else Guest
|
| 80 |
if os.path.exists(SESSION_FILE):
|
| 81 |
try:
|
| 82 |
context = browser.new_context(storage_state=SESSION_FILE, **context_args)
|
| 83 |
except:
|
| 84 |
+
print("⚠️ Session Corrupt. Switching to Guest Mode.")
|
| 85 |
context = browser.new_context(**context_args)
|
| 86 |
else:
|
| 87 |
context = browser.new_context(**context_args)
|
| 88 |
|
| 89 |
page = context.new_page()
|
| 90 |
|
| 91 |
+
# 3. APPLY MANUAL STEALTH
|
| 92 |
apply_stealth(page)
|
| 93 |
|
| 94 |
print(f"⚡ Processing: {url}")
|
|
|
|
| 114 |
|
| 115 |
# Check Login Wall
|
| 116 |
time.sleep(4)
|
| 117 |
+
if "Login" in page.title():
|
| 118 |
data["status"] = "Failed (Login Block)"
|
| 119 |
browser.close()
|
| 120 |
return data
|
|
|
|
| 176 |
match = re.search(r'\(@(.*?)\)', title)
|
| 177 |
if match: data["author"] = match.group(1)
|
| 178 |
except: pass
|
| 179 |
+
|
| 180 |
if not data["author"]:
|
| 181 |
try:
|
| 182 |
links = page.locator("a[href*='/reels/']").all()
|
|
|
|
| 232 |
browser.close()
|
| 233 |
return data
|
| 234 |
|
|
|
|
| 235 |
@app.route('/')
def home():
    """Serve the site root by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
|