Spaces:

Baskar2005
/

TestingI

Runtime error

App Files Files Community

Baskar2005 committed on Jan 13

Commit

27c717b

verified ·

1 Parent(s): 98d9557

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -52

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from flask import Flask, render_template, request, jsonify
 from playwright.sync_api import sync_playwright
-from playwright_stealth import stealth_sync
 from concurrent.futures import ThreadPoolExecutor
 import time
 import os
@@ -10,7 +10,7 @@ app = Flask(__name__)
 # ---------------- CONFIGURATION ---------------- #
 SESSION_FILE = "instagram_session.json"
-MAX_WORKERS = 3  # Keep low for free tier servers
 # ----------------------------------------------- #
 def identify_url_type(url):
@@ -21,7 +21,38 @@ def identify_url_type(url):
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
-# --- HELPER: RECURSIVE SEARCH (Deep Search for Author) ---
 def find_username_in_json(obj):
     if isinstance(obj, dict):
         if "owner" in obj and isinstance(obj["owner"], dict):
@@ -41,7 +72,7 @@ def scrape_single_url(url):
     if not url or not url.strip(): return None
     with sync_playwright() as p:
-        # 1. LAUNCH BROWSER (Headless + Anti-Detect Args)
         browser = p.chromium.launch(
             headless=True,
             args=[
@@ -51,8 +82,7 @@ def scrape_single_url(url):
             ]
         )
-        # 2. CONFIGURE CONTEXT (Windows 10 Fingerprint)
-        # We try to load session, but if it fails/blocks, we continue as Guest
         context_args = {
             "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
             "viewport": {"width": 1920, "height": 1080},
@@ -60,19 +90,20 @@ def scrape_single_url(url):
             "timezone_id": "America/New_York"
         }
         if os.path.exists(SESSION_FILE):
             try:
                 context = browser.new_context(storage_state=SESSION_FILE, **context_args)
             except:
-                print("⚠️ Session file corrupt or incompatible. Starting as Guest.")
                 context = browser.new_context(**context_args)
         else:
             context = browser.new_context(**context_args)
         page = context.new_page()
-        # 3. APPLY STEALTH (Crucial for Hugging Face)
-        stealth_sync(page)
         print(f"⚡ Processing: {url}")
@@ -93,21 +124,14 @@ def scrape_single_url(url):
         try:
             # === NAVIGATION ===
-            # Long timeout + "commit" wait ensuring page load
             page.goto(url, wait_until="commit", timeout=60000)
-            # 4. CHECK FOR LOGIN WALL
             time.sleep(4)
-            page_title = page.title()
-            if "Login" in page_title or "Instagram" == page_title:
-                # Sometimes just "Instagram" means it loaded the login screen, not content
-                # We do a quick check for content
-                if page.locator("input[name='username']").count() > 0:
-                    data["status"] = "Failed (Login Block)"
-                    print("   ⚠️ Blocked by Login Wall")
-                    browser.close()
-                    return data
             # === PATH A: PROFILE ===
             if data["type"] == "PROFILE":
@@ -145,11 +169,10 @@ def scrape_single_url(url):
                         except: pass
                 page.on("response", handle_response)
-                # Reload to trigger network requests if needed, or just wait
                 time.sleep(3)
                 page.remove_listener("response", handle_response)
-                # Get Likes
                 try:
                     meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
                     if meta_desc:
@@ -157,7 +180,7 @@ def scrape_single_url(url):
                         if likes_match: data["likes"] = likes_match.group(1)
                 except: pass
-                # Get Author
                 if captured_info["username"]:
                     data["author"] = captured_info["username"]
@@ -175,36 +198,25 @@ def scrape_single_url(url):
                             href = link.get_attribute("href")
                             if href and "/reels/" in href:
                                 parts = href.strip("/").split("/")
-                                if len(parts) >= 2:
-                                    candidate = parts[-2]
-                                    if candidate not in ["reels", "instagram"]:
-                                        data["author"] = candidate
-                                        break
                     except: pass
-                # Get Views (Video Only)
                 if data["author"]:
-                    is_video = False
-                    if data["type"] == "REEL": is_video = True
                     try:
-                        og_type = page.locator('meta[property="og:type"]').get_attribute("content")
-                        if og_type and "video" in og_type: is_video = True
                     except: pass
                     if is_video:
-                        # Hop to Reels Tab
-                        profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
-                        page.goto(profile_reels_url, wait_until="domcontentloaded")
                         time.sleep(3)
-                        if "/reels/" not in page.url:
-                            data["views"] = "Hidden (Main Grid)"
-                        else:
                             try:
-                                target_selector = f"a[href*='{shortcode}']"
-                                # Wait a bit for grid to load
-                                page.wait_for_selector(target_selector, timeout=5000)
-                                target_card = page.locator(target_selector).first
                                 card_text = target_card.inner_text()
                                 for line in card_text.split('\n'):
                                     if any(char.isdigit() for char in line):
@@ -215,13 +227,12 @@ def scrape_single_url(url):
                     else:
                         data["views"] = "N/A (Photo)"
-                    # Bonus: Get Followers
                     try:
                         fol_link = page.locator("a[href*='/followers/']").first
                         if fol_link.count() > 0:
-                            title = fol_link.locator("span[title]").first
-                            if title.count() > 0:
-                                data["followers"] = title.get_attribute("title")
                     except: pass
                     data["status"] = "Success"
@@ -244,8 +255,8 @@ def home():
 def scrape_api():
     data = request.json
     raw_urls = data.get('urls', [])
     final_urls = []
     if isinstance(raw_urls, list):
         raw_string = ",".join(raw_urls)
     else:
@@ -260,8 +271,7 @@ def scrape_api():
     if not final_urls:
         return jsonify({"error": "No valid URLs provided"}), 400
-    print(f"🔥 API Request: Processing {len(final_urls)} links...")
     results = []
     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
         results_iterator = executor.map(scrape_single_url, final_urls)
@@ -271,5 +281,4 @@ def scrape_api():
     return jsonify(results)
 if __name__ == '__main__':
-    # HUGGING FACE PORT
     app.run(host='0.0.0.0', port=7860)

 from flask import Flask, render_template, request, jsonify
 from playwright.sync_api import sync_playwright
+# REMOVED: from playwright_stealth import stealth_sync (Use manual function below instead)
 from concurrent.futures import ThreadPoolExecutor
 import time
 import os
 # ---------------- CONFIGURATION ---------------- #
 SESSION_FILE = "instagram_session.json"
+MAX_WORKERS = 3
 # ----------------------------------------------- #
 def identify_url_type(url):
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
+# --- HELPER: MANUAL STEALTH MODE (Fixes ImportError) ---
+def apply_stealth(page):
+    """
+    Manually hides 'navigator.webdriver' and other bot flags
+    so Instagram thinks this is a real browser.
+    """
+    # 1. Hide the WebDriver flag
+    page.add_init_script("""
+        Object.defineProperty(navigator, 'webdriver', {
+            get: () => undefined
+        });
+    """)
+    # 2. Mock Chrome runtime
+    page.add_init_script("""
+        window.navigator.chrome = {
+            runtime: {}
+        };
+    """)
+    # 3. Mock Plugins (Bots usually have 0)
+    page.add_init_script("""
+        Object.defineProperty(navigator, 'plugins', {
+            get: () => [1, 2, 3, 4, 5]
+        });
+    """)
+    # 4. Mock Languages
+    page.add_init_script("""
+        Object.defineProperty(navigator, 'languages', {
+            get: () => ['en-US', 'en']
+        });
+    """)
+# --- HELPER: RECURSIVE SEARCH ---
 def find_username_in_json(obj):
     if isinstance(obj, dict):
         if "owner" in obj and isinstance(obj["owner"], dict):
     if not url or not url.strip(): return None
     with sync_playwright() as p:
+        # 1. LAUNCH BROWSER
         browser = p.chromium.launch(
             headless=True,
             args=[
             ]
         )
+        # 2. CONFIGURE CONTEXT
         context_args = {
             "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
             "viewport": {"width": 1920, "height": 1080},
             "timezone_id": "America/New_York"
         }
+        # Try to load session (if exists)
         if os.path.exists(SESSION_FILE):
             try:
                 context = browser.new_context(storage_state=SESSION_FILE, **context_args)
             except:
+                print("⚠️ Session corrupted. Guest mode.")
                 context = browser.new_context(**context_args)
         else:
             context = browser.new_context(**context_args)
         page = context.new_page()
+        # 3. APPLY MANUAL STEALTH (Replaces the library)
+        apply_stealth(page)
         print(f"⚡ Processing: {url}")
         try:
             # === NAVIGATION ===
             page.goto(url, wait_until="commit", timeout=60000)
+            # Check Login Wall
             time.sleep(4)
+            if "Login" in page.title() or page.locator("input[name='username']").count() > 0:
+                data["status"] = "Failed (Login Block)"
+                browser.close()
+                return data
             # === PATH A: PROFILE ===
             if data["type"] == "PROFILE":
                         except: pass
                 page.on("response", handle_response)
                 time.sleep(3)
                 page.remove_listener("response", handle_response)
+                # Likes
                 try:
                     meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
                     if meta_desc:
                         if likes_match: data["likes"] = likes_match.group(1)
                 except: pass
+                # Author
                 if captured_info["username"]:
                     data["author"] = captured_info["username"]
                             href = link.get_attribute("href")
                             if href and "/reels/" in href:
                                 parts = href.strip("/").split("/")
+                                if len(parts) >= 2 and parts[-1] == "reels":
+                                    data["author"] = parts[-2]
+                                    break
                     except: pass
+                # Views
                 if data["author"]:
+                    is_video = (data["type"] == "REEL")
                     try:
+                        if "video" in page.locator('meta[property="og:type"]').get_attribute("content"): is_video = True
                     except: pass
                     if is_video:
+                        page.goto(f"https://www.instagram.com/{data['author']}/reels/", wait_until="domcontentloaded")
                         time.sleep(3)
+                        if "/reels/" in page.url:
                             try:
+                                target_card = page.locator(f"a[href*='{shortcode}']").first
                                 card_text = target_card.inner_text()
                                 for line in card_text.split('\n'):
                                     if any(char.isdigit() for char in line):
                     else:
                         data["views"] = "N/A (Photo)"
+                    # Followers (Bonus)
                     try:
                         fol_link = page.locator("a[href*='/followers/']").first
                         if fol_link.count() > 0:
+                            t = fol_link.locator("span[title]").first
+                            data["followers"] = t.get_attribute("title")
                     except: pass
                     data["status"] = "Success"
 def scrape_api():
     data = request.json
     raw_urls = data.get('urls', [])
     final_urls = []
     if isinstance(raw_urls, list):
         raw_string = ",".join(raw_urls)
     else:
     if not final_urls:
         return jsonify({"error": "No valid URLs provided"}), 400
+    print(f"🔥 Processing {len(final_urls)} links...")
     results = []
     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
         results_iterator = executor.map(scrape_single_url, final_urls)
     return jsonify(results)
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=7860)