Spaces:

Baskar2005
/

TestingI

Runtime error

App Files Files Community

Baskar2005 committed on Jan 14

Commit

d376d1c

verified ·

1 Parent(s): 122e402

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -149

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from concurrent.futures import ThreadPoolExecutor
 import time
 import os
 import re
 app = Flask(__name__)
@@ -16,79 +17,48 @@ def identify_url_type(url):
     if "/reel/" in url: return "REEL"
     if "/p/" in url: return "POST"
     if url.strip("/") == "https://www.instagram.com": return "SYSTEM"
-    if "/explore/" in url or "/direct/" in url or "/stories/" in url: return "SYSTEM"
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
-# --- HELPER: MANUAL STEALTH (The Magic Fix) ---
 def apply_stealth(page):
-    """
-    Manually overrides browser variables to hide 'Headless' status.
-    This replaces the broken 'playwright-stealth' library.
-    """
-    # 1. Hide WebDriver Flag
     page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
-    # 2. Mock Chrome Runtime
     page.add_init_script("window.navigator.chrome = { runtime: {} };")
-    # 3. Mock Plugins (Headless browsers have 0, Humans have many)
     page.add_init_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]})")
-    # 4. Mock Languages
     page.add_init_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
-# --- HELPER: RECURSIVE SEARCH ---
-def find_username_in_json(obj):
     if isinstance(obj, dict):
-        if "owner" in obj and isinstance(obj["owner"], dict):
-            if "username" in obj["owner"]: return obj["owner"]["username"]
-        if "username" in obj and "is_verified" in obj: return obj["username"]
         for k, v in obj.items():
-            if isinstance(v, (dict, list)):
-                res = find_username_in_json(v)
-                if res: return res
     elif isinstance(obj, list):
         for item in obj:
-            res = find_username_in_json(item)
-            if res: return res
     return None
 def scrape_single_url(url):
     if not url or not url.strip(): return None
     with sync_playwright() as p:
-        # 1. LAUNCH BROWSER (Headless=True for Server)
         browser = p.chromium.launch(
             headless=True,
-            args=[
-                "--disable-blink-features=AutomationControlled", # Standard bot hide
-                "--no-sandbox",
-                "--disable-dev-shm-usage"
-            ]
         )
-        # 2. CONFIGURE CONTEXT (Windows 10 Fingerprint)
-        context_args = {
-            "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-            "viewport": {"width": 1920, "height": 1080},
-            "locale": "en-US",
-            "timezone_id": "America/New_York"
-        }
-        # Load Session if available, else Guest
-        if os.path.exists(SESSION_FILE):
-            try:
-                context = browser.new_context(storage_state=SESSION_FILE, **context_args)
-            except:
-                print("⚠️ Session Corrupt. Switching to Guest Mode.")
-                context = browser.new_context(**context_args)
-        else:
-            context = browser.new_context(**context_args)
         page = context.new_page()
-        # 3. APPLY MANUAL STEALTH
         apply_stealth(page)
         print(f"⚡ Processing: {url}")
@@ -103,131 +73,94 @@ def scrape_single_url(url):
             "status": "Starting"
         }
-        if data["type"] in ["SYSTEM", "UNKNOWN"]:
-            data["status"] = "Skipped"
-            browser.close()
-            return data
-        try:
-            # === NAVIGATION ===
-            page.goto(url, wait_until="commit", timeout=60000)
-            # Check Login Wall
-            time.sleep(4)
-            if "Login" in page.title():
-                data["status"] = "Failed (Login Block)"
-                browser.close()
-                return data
-            # === PATH A: PROFILE ===
-            if data["type"] == "PROFILE":
-                time.sleep(2)
                 try:
-                    followers_link = page.locator("a[href*='/followers/']").first
-                    if followers_link.count() > 0:
-                        title = followers_link.locator("span[title]").first
-                        if title.count() > 0:
-                            data["followers"] = title.get_attribute("title")
-                        else:
-                            data["followers"] = followers_link.inner_text().split("\n")[0]
                 except: pass
-                if not data["author"]:
-                     data["author"] = url.strip("/").split("/")[-1]
-                data["status"] = "Success"
-            # === PATH B: MEDIA (REEL/POST) ===
-            elif data["type"] in ["REEL", "POST"]:
-                if "/reel/" in url:
-                    shortcode = url.split("/reel/")[1].split("/")[0]
-                else:
-                    shortcode = url.split("/p/")[1].split("/")[0]
-                captured_info = {"username": None}
-                def handle_response(response):
-                    if "instagram.com" in response.url and "json" in response.headers.get("content-type", ""):
-                        try:
-                            json_data = response.json()
-                            found = find_username_in_json(json_data)
-                            if found and not captured_info["username"]:
-                                captured_info["username"] = found
-                        except: pass
-                page.on("response", handle_response)
-                time.sleep(3)
-                page.remove_listener("response", handle_response)
-                # Likes
-                try:
-                    meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
-                    if meta_desc:
-                        likes_match = re.search(r'^([0-9,.]+[KkMm]?) likes', meta_desc)
-                        if likes_match: data["likes"] = likes_match.group(1)
-                except: pass
-                # Author
-                if captured_info["username"]:
-                    data["author"] = captured_info["username"]
                 if not data["author"]:
                     try:
                         title = page.title()
-                        match = re.search(r'\(@(.*?)\)', title)
                         if match: data["author"] = match.group(1)
                     except: pass
-                if not data["author"]:
-                    try:
-                        links = page.locator("a[href*='/reels/']").all()
-                        for link in links:
-                            href = link.get_attribute("href")
-                            if href and "/reels/" in href:
-                                parts = href.strip("/").split("/")
-                                if len(parts) >= 2 and parts[-1] == "reels":
-                                    data["author"] = parts[-2]
-                                    break
-                    except: pass
-                # Views
                 if data["author"]:
-                    is_video = (data["type"] == "REEL")
                     try:
-                        if "video" in page.locator('meta[property="og:type"]').get_attribute("content"): is_video = True
                     except: pass
-                    if is_video:
-                        page.goto(f"https://www.instagram.com/{data['author']}/reels/", wait_until="domcontentloaded")
-                        time.sleep(3)
-                        if "/reels/" in page.url:
-                            try:
-                                target_card = page.locator(f"a[href*='{shortcode}']").first
-                                card_text = target_card.inner_text()
-                                for line in card_text.split('\n'):
-                                    if any(char.isdigit() for char in line):
                                         data["views"] = line.strip()
                                         break
-                            except:
-                                data["views"] = "Not Found"
-                    else:
-                        data["views"] = "N/A (Photo)"
-                    # Followers (Bonus)
-                    try:
-                        fol_link = page.locator("a[href*='/followers/']").first
-                        if fol_link.count() > 0:
-                            t = fol_link.locator("span[title]").first
-                            data["followers"] = t.get_attribute("title")
-                    except: pass
-                    data["status"] = "Success"
-                else:
-                    data["status"] = "Failed (No Author)"
         except Exception as e:
             data["status"] = "Error"
             print(f"❌ Error: {e}")
         browser.close()
         return data
@@ -266,4 +199,5 @@ def scrape_api():
     return jsonify(results)
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=7860)

 import time
 import os
 import re
+import json
 app = Flask(__name__)
     if "/reel/" in url: return "REEL"
     if "/p/" in url: return "POST"
     if url.strip("/") == "https://www.instagram.com": return "SYSTEM"
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
+# 🔥 MANUAL STEALTH: Hides "Headless" status from Instagram
 def apply_stealth(page):
     """Register init scripts that mask common headless-browser fingerprints.

     Each script is handed to ``page.add_init_script`` so that it is installed
     on the page before navigation; nothing is evaluated immediately.

     Args:
         page: An object exposing ``add_init_script(script)`` (here, the
             Playwright page created by the caller).

     Returns:
         None. The page is modified in place.
     """
     # Automation sets navigator.webdriver; report it as undefined.
     page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
     # Real Chrome exposes its global as `window.chrome`, not
     # `navigator.chrome`. Spoof the real location and keep an existing
     # object if the browser already provides one.
     page.add_init_script("window.chrome = window.chrome || { runtime: {} };")
     # Report a non-empty plugin list.
     page.add_init_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]})")
     # Report a fixed language preference list.
     page.add_init_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
+# --- DATA HELPER ---
def safe_find_key(obj, key):
    """Return the first non-null value stored under ``key`` in nested JSON.

    The search is depth-first: a dict is checked for ``key`` at its own level
    before its values are descended into, and lists are scanned in order.

    A match whose value is ``None`` (JSON ``null``) is treated as "not found",
    so the search continues into nested containers instead of stopping at the
    null placeholder.

    Args:
        obj: Decoded JSON data (dict, list, or scalar).
        key: The dictionary key to look for.

    Returns:
        The first non-``None`` value found for ``key``, or ``None`` if there
        is no such value (including when ``obj`` is a scalar).
    """
    if isinstance(obj, dict):
        direct = obj.get(key)
        if direct is not None:
            return direct
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        # Scalars cannot contain keys.
        return None

    for child in children:
        found = safe_find_key(child, key)
        if found is not None:
            return found
    return None
 def scrape_single_url(url):
     if not url or not url.strip(): return None
     with sync_playwright() as p:
+        # 1. LAUNCH BROWSER (Optimized for Server)
         browser = p.chromium.launch(
             headless=True,
+            args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"]
         )
+        # 2. CONTEXT (Mobile User Agent = Easier Data Access)
+        context = browser.new_context(
+            user_agent="Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36",
+            viewport={"width": 412, "height": 915},
+            locale="en-US"
+        )
         page = context.new_page()
         apply_stealth(page)
         print(f"⚡ Processing: {url}")
             "status": "Starting"
         }
+        # --- 3. NETWORK SNIFFER SETUP ---
+        captured_data = {"play_count": None, "username": None, "like_count": None}
+        def handle_response(response):
+            if "instagram.com" in response.url and ("json" in response.headers.get("content-type", "") or "graphql" in response.url):
                 try:
+                    json_data = response.json()
+                    # Capture Views/Plays
+                    if not captured_data["play_count"]:
+                        plays = safe_find_key(json_data, "play_count") or safe_find_key(json_data, "video_view_count")
+                        if plays: captured_data["play_count"] = plays
+                    # Capture Likes
+                    if not captured_data["like_count"]:
+                        likes = safe_find_key(json_data, "like_count")
+                        if likes: captured_data["like_count"] = likes
+                    # Capture Author
+                    if not captured_data["username"]:
+                        user = safe_find_key(json_data, "username")
+                        if user: captured_data["username"] = user
                 except: pass
+        page.on("response", handle_response)
+        try:
+            # === NAVIGATION ===
+            page.goto(url, wait_until="commit", timeout=45000)
+            page.wait_for_timeout(5000) # Wait for network packets
+            # 📸 DEBUG: Take screenshot if blocked
+            if "Login" in page.title() or "Page Not Found" in page.title():
+                print("   ⚠️ Blocked! Saving debug_error.png")
+                page.screenshot(path="debug_error.png")
+                data["status"] = "Failed (Login Block)"
+                browser.close()
+                return data
+            # Fill data from Network Sniffer
+            if captured_data["play_count"]: data["views"] = str(captured_data["play_count"])
+            if captured_data["like_count"]: data["likes"] = str(captured_data["like_count"])
+            if captured_data["username"]: data["author"] = captured_data["username"]
+            # --- 4. FALLBACK: VISUAL SCRAPING ---
+            # If network failed, try reading the screen
+            if (data["views"] == "N/A" and data["type"] == "REEL") or not data["author"]:
+                print("   ⚠️ Network missed data. Switching to Visual Scraping...")
+                # Get Author from Title if missing
                 if not data["author"]:
                     try:
                         title = page.title()
+                        match = re.search(r'\(@(.*?)\)', title)
                         if match: data["author"] = match.group(1)
                     except: pass
+                # Go to Profile for Followers & Views
                 if data["author"]:
+                    if "/reels/" not in page.url:
+                        page.goto(f"https://www.instagram.com/{data['author']}/reels/", wait_until="domcontentloaded")
+                        page.wait_for_timeout(3000)
+                    # Try to find Followers (Meta Description)
                     try:
+                        meta = page.locator('meta[property="og:description"]').get_attribute("content")
+                        if meta:
+                            parts = meta.split("Followers")
+                            if len(parts) > 1: data["followers"] = parts[0].strip().split(" ")[-1]
                     except: pass
+                    # Try to find View Count on Grid
+                    if data["views"] == "N/A":
+                        try:
+                            shortcode = url.split("/reel/")[1].split("/")[0]
+                            card = page.locator(f"a[href*='{shortcode}']").first
+                            if card.count() > 0:
+                                txt = card.inner_text()
+                                for line in txt.split('\n'):
+                                    if any(c.isdigit() for c in line):
                                         data["views"] = line.strip()
                                         break
+                        except: pass
+            data["status"] = "Success"
         except Exception as e:
             data["status"] = "Error"
             print(f"❌ Error: {e}")
+            try: page.screenshot(path="debug_crash.png")
+            except: pass
         browser.close()
         return data
     return jsonify(results)
 if __name__ == '__main__':
+    port = int(os.environ.get("PORT", 10000))
+    app.run(host='0.0.0.0', port=port)