Spaces:

Baskar2005
/

TestingI

Runtime error

App Files Files Community

Baskar2005 committed on Jan 13

Commit

ab568f2

verified ·

1 Parent(s): 13bc625

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -36

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from flask import Flask, render_template, request, jsonify
 from playwright.sync_api import sync_playwright
 from concurrent.futures import ThreadPoolExecutor
 import time
 import os
@@ -9,7 +10,7 @@ app = Flask(__name__)
 # ---------------- CONFIGURATION ---------------- #
 SESSION_FILE = "instagram_session.json"
-MAX_WORKERS = 3
 # ----------------------------------------------- #
 def identify_url_type(url):
@@ -20,7 +21,7 @@ def identify_url_type(url):
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
-# --- HELPER: RECURSIVE SEARCH ---
 def find_username_in_json(obj):
     if isinstance(obj, dict):
         if "owner" in obj and isinstance(obj["owner"], dict):
@@ -40,34 +41,39 @@ def scrape_single_url(url):
     if not url or not url.strip(): return None
     with sync_playwright() as p:
-        # 🔥 STEALTH CONFIGURATION 🔥
-        # 1. Hide the "Automation" flag
-        # 2. Force Headless=True (Required for Server Stability)
         browser = p.chromium.launch(
-            headless=True,
-            args=["--disable-blink-features=AutomationControlled"]
         )
-        context = browser.new_context(
-            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-            viewport={"width": 1280, "height": 720},
-            storage_state=SESSION_FILE if os.path.exists(SESSION_FILE) else None
-        )
-        # 3. Inject "Real Human" User-Agent (Windows Chrome)
-        # This prevents the "N/A" error by tricking Instagram
         context_args = {
-            "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-            "viewport": {"width": 1280, "height": 720},
-            "locale": "en-US"
         }
         if os.path.exists(SESSION_FILE):
-            context = browser.new_context(storage_state=SESSION_FILE, **context_args)
         else:
             context = browser.new_context(**context_args)
         page = context.new_page()
         print(f"⚡ Processing: {url}")
         data = {
@@ -86,9 +92,26 @@ def scrape_single_url(url):
             return data
         try:
             if data["type"] == "PROFILE":
-                page.goto(url, wait_until="domcontentloaded", timeout=60000)
-                time.sleep(3)
                 try:
                     followers_link = page.locator("a[href*='/followers/']").first
                     if followers_link.count() > 0:
@@ -98,9 +121,12 @@ def scrape_single_url(url):
                         else:
                             data["followers"] = followers_link.inner_text().split("\n")[0]
                 except: pass
-                data["author"] = url.strip("/").split("/")[-1]
                 data["status"] = "Success"
             elif data["type"] in ["REEL", "POST"]:
                 if "/reel/" in url:
                     shortcode = url.split("/reel/")[1].split("/")[0]
@@ -119,10 +145,11 @@ def scrape_single_url(url):
                         except: pass
                 page.on("response", handle_response)
-                page.goto(url, wait_until="domcontentloaded", timeout=60000)
-                time.sleep(4)
                 page.remove_listener("response", handle_response)
                 try:
                     meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
                     if meta_desc:
@@ -130,6 +157,7 @@ def scrape_single_url(url):
                         if likes_match: data["likes"] = likes_match.group(1)
                 except: pass
                 if captured_info["username"]:
                     data["author"] = captured_info["username"]
@@ -138,36 +166,33 @@ def scrape_single_url(url):
                         title = page.title()
                         match = re.search(r'\(@(.*?)\)', title)
                         if match: data["author"] = match.group(1)
-                        else:
-                            match_b = re.search(r'^(.*?)\son\sInstagram', title)
-                            if match_b:
-                                parts = match_b.group(1).split(" ")
-                                if len(parts) == 1: data["author"] = parts[0]
                     except: pass
                 if not data["author"]:
                     try:
                         links = page.locator("a[href*='/reels/']").all()
                         for link in links:
                             href = link.get_attribute("href")
-                            if href:
                                 parts = href.strip("/").split("/")
-                                if len(parts) >= 2 and parts[-1] == "reels":
                                     candidate = parts[-2]
                                     if candidate not in ["reels", "instagram"]:
                                         data["author"] = candidate
                                         break
                     except: pass
                 if data["author"]:
                     is_video = False
                     try:
                         og_type = page.locator('meta[property="og:type"]').get_attribute("content")
                         if og_type and "video" in og_type: is_video = True
                     except: pass
-                    if data["type"] == "REEL": is_video = True
                     if is_video:
                         profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
                         page.goto(profile_reels_url, wait_until="domcontentloaded")
                         time.sleep(3)
@@ -177,7 +202,8 @@ def scrape_single_url(url):
                         else:
                             try:
                                 target_selector = f"a[href*='{shortcode}']"
-                                page.wait_for_selector(target_selector, timeout=8000)
                                 target_card = page.locator(target_selector).first
                                 card_text = target_card.inner_text()
                                 for line in card_text.split('\n'):
@@ -189,6 +215,7 @@ def scrape_single_url(url):
                     else:
                         data["views"] = "N/A (Photo)"
                     try:
                         fol_link = page.locator("a[href*='/followers/']").first
                         if fol_link.count() > 0:
@@ -208,6 +235,7 @@ def scrape_single_url(url):
         browser.close()
         return data
 @app.route('/')
 def home():
     return render_template('index.html')
@@ -243,5 +271,5 @@ def scrape_api():
     return jsonify(results)
 if __name__ == '__main__':
-    # HUGGING FACE REQUIRES PORT 7860
     app.run(host='0.0.0.0', port=7860)

 from flask import Flask, render_template, request, jsonify
 from playwright.sync_api import sync_playwright
+from playwright_stealth import stealth_sync
 from concurrent.futures import ThreadPoolExecutor
 import time
 import os
 # ---------------- CONFIGURATION ---------------- #
 SESSION_FILE = "instagram_session.json"
+MAX_WORKERS = 3  # Keep low for free tier servers
 # ----------------------------------------------- #
 def identify_url_type(url):
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
+# --- HELPER: RECURSIVE SEARCH (Deep Search for Author) ---
 def find_username_in_json(obj):
     if isinstance(obj, dict):
         if "owner" in obj and isinstance(obj["owner"], dict):
     if not url or not url.strip(): return None
     with sync_playwright() as p:
+        # 1. LAUNCH BROWSER (Headless + Anti-Detect Args)
         browser = p.chromium.launch(
+            headless=True,
+            args=[
+                "--disable-blink-features=AutomationControlled",
+                "--no-sandbox",
+                "--disable-dev-shm-usage"
+            ]
         )
+        # 2. CONFIGURE CONTEXT (Windows 10 Fingerprint)
+        # We try to load session, but if it fails/blocks, we continue as Guest
         context_args = {
+            "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+            "viewport": {"width": 1920, "height": 1080},
+            "locale": "en-US",
+            "timezone_id": "America/New_York"
         }
         if os.path.exists(SESSION_FILE):
+            try:
+                context = browser.new_context(storage_state=SESSION_FILE, **context_args)
+            except:
+                print("⚠️ Session file corrupt or incompatible. Starting as Guest.")
+                context = browser.new_context(**context_args)
         else:
             context = browser.new_context(**context_args)
         page = context.new_page()
+        # 3. APPLY STEALTH (Crucial for Hugging Face)
+        stealth_sync(page)
         print(f"⚡ Processing: {url}")
         data = {
             return data
         try:
+            # === NAVIGATION ===
+            # Long timeout; "commit" returns once the response starts arriving, so the sleep below gives the page time to render
+            page.goto(url, wait_until="commit", timeout=60000)
+            # 4. CHECK FOR LOGIN WALL
+            time.sleep(4)
+            page_title = page.title()
+            if "Login" in page_title or "Instagram" == page_title:
+                # Sometimes just "Instagram" means it loaded the login screen, not content
+                # We confirm by checking for the login form's username input
+                if page.locator("input[name='username']").count() > 0:
+                    data["status"] = "Failed (Login Block)"
+                    print("   ⚠️ Blocked by Login Wall")
+                    browser.close()
+                    return data
+            # === PATH A: PROFILE ===
             if data["type"] == "PROFILE":
+                time.sleep(2)
                 try:
                     followers_link = page.locator("a[href*='/followers/']").first
                     if followers_link.count() > 0:
                         else:
                             data["followers"] = followers_link.inner_text().split("\n")[0]
                 except: pass
+                if not data["author"]:
+                     data["author"] = url.strip("/").split("/")[-1]
                 data["status"] = "Success"
+            # === PATH B: MEDIA (REEL/POST) ===
             elif data["type"] in ["REEL", "POST"]:
                 if "/reel/" in url:
                     shortcode = url.split("/reel/")[1].split("/")[0]
                         except: pass
                 page.on("response", handle_response)
+                # No reload is done here; just wait for responses that arrive after the listener is attached
+                time.sleep(3)
                 page.remove_listener("response", handle_response)
+                # Get Likes
                 try:
                     meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
                     if meta_desc:
                         if likes_match: data["likes"] = likes_match.group(1)
                 except: pass
+                # Get Author
                 if captured_info["username"]:
                     data["author"] = captured_info["username"]
                         title = page.title()
                         match = re.search(r'\(@(.*?)\)', title)
                         if match: data["author"] = match.group(1)
                     except: pass
                 if not data["author"]:
                     try:
                         links = page.locator("a[href*='/reels/']").all()
                         for link in links:
                             href = link.get_attribute("href")
+                            if href and "/reels/" in href:
                                 parts = href.strip("/").split("/")
+                                if len(parts) >= 2:
                                     candidate = parts[-2]
                                     if candidate not in ["reels", "instagram"]:
                                         data["author"] = candidate
                                         break
                     except: pass
+                # Get Views (Video Only)
                 if data["author"]:
                     is_video = False
+                    if data["type"] == "REEL": is_video = True
                     try:
                         og_type = page.locator('meta[property="og:type"]').get_attribute("content")
                         if og_type and "video" in og_type: is_video = True
                     except: pass
                     if is_video:
+                        # Hop to Reels Tab
                         profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
                         page.goto(profile_reels_url, wait_until="domcontentloaded")
                         time.sleep(3)
                         else:
                             try:
                                 target_selector = f"a[href*='{shortcode}']"
+                                # Wait a bit for grid to load
+                                page.wait_for_selector(target_selector, timeout=5000)
                                 target_card = page.locator(target_selector).first
                                 card_text = target_card.inner_text()
                                 for line in card_text.split('\n'):
                     else:
                         data["views"] = "N/A (Photo)"
+                    # Bonus: Get Followers
                     try:
                         fol_link = page.locator("a[href*='/followers/']").first
                         if fol_link.count() > 0:
         browser.close()
         return data
+# --- ROUTES ---
 @app.route('/')
 def home():
     return render_template('index.html')
     return jsonify(results)
 if __name__ == '__main__':
+    # HUGGING FACE PORT
     app.run(host='0.0.0.0', port=7860)