Spaces:

Baskar2005
/

TestingI

Runtime error

App Files Community

Baskar2005 committed on Jan 13

Commit

4cbd83c

verified ·

1 Parent(s): e0d8c79

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -37

app.py CHANGED Viewed

@@ -9,8 +9,7 @@ app = Flask(__name__)
 # ---------------- CONFIGURATION ---------------- #
 SESSION_FILE = "instagram_session.json"
-HEADLESS_MODE = False   # Keep False for stability
-MAX_WORKERS = 3         # Reduced to 3 to prevent lagging your PC
 # ----------------------------------------------- #
 def identify_url_type(url):
@@ -21,41 +20,46 @@ def identify_url_type(url):
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
-# --- HELPER: RECURSIVE SEARCH ( The "Main.py" Logic ) ---
 def find_username_in_json(obj):
     if isinstance(obj, dict):
-        # Priority 1: Check inside 'owner' object
         if "owner" in obj and isinstance(obj["owner"], dict):
-            if "username" in obj["owner"]:
-                return obj["owner"]["username"]
-        # Priority 2: Check standard user object
-        if "username" in obj and "is_verified" in obj:
-            return obj["username"]
-        # Recursive Loop
         for k, v in obj.items():
             if isinstance(v, (dict, list)):
                 res = find_username_in_json(v)
                 if res: return res
     elif isinstance(obj, list):
         for item in obj:
             res = find_username_in_json(item)
             if res: return res
     return None
-# --- WORKER FUNCTION ---
 def scrape_single_url(url):
     if not url or not url.strip(): return None
-    # New Browser Instance for Thread Safety
     with sync_playwright() as p:
-        browser = p.chromium.launch(headless=HEADLESS_MODE)
         if os.path.exists(SESSION_FILE):
-            context = browser.new_context(storage_state=SESSION_FILE)
         else:
-            context = browser.new_context()
         page = context.new_page()
         print(f"⚡ Processing: {url}")
@@ -70,14 +74,12 @@ def scrape_single_url(url):
             "status": "Starting"
         }
-        # Skip System Links
         if data["type"] in ["SYSTEM", "UNKNOWN"]:
             data["status"] = "Skipped"
             browser.close()
             return data
         try:
-            # === PATH A: PROFILE ===
             if data["type"] == "PROFILE":
                 page.goto(url, wait_until="domcontentloaded", timeout=60000)
                 time.sleep(3)
@@ -93,14 +95,12 @@ def scrape_single_url(url):
                 data["author"] = url.strip("/").split("/")[-1]
                 data["status"] = "Success"
-            # === PATH B: MEDIA (REEL/POST) ===
             elif data["type"] in ["REEL", "POST"]:
                 if "/reel/" in url:
                     shortcode = url.split("/reel/")[1].split("/")[0]
                 else:
                     shortcode = url.split("/p/")[1].split("/")[0]
-                # 1. NETWORK LISTENER (Restored Robust Logic)
                 captured_info = {"username": None}
                 def handle_response(response):
@@ -117,7 +117,6 @@ def scrape_single_url(url):
                 time.sleep(4)
                 page.remove_listener("response", handle_response)
-                # 2. GET LIKES (Meta Tag)
                 try:
                     meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
                     if meta_desc:
@@ -125,27 +124,21 @@ def scrape_single_url(url):
                         if likes_match: data["likes"] = likes_match.group(1)
                 except: pass
-                # 3. GET AUTHOR (Network > Title > Pattern)
                 if captured_info["username"]:
                     data["author"] = captured_info["username"]
-                # Fallback: Title Tag
                 if not data["author"]:
                     try:
                         title = page.title()
-                        # Matches "Username (@handle) on Instagram"
                         match = re.search(r'\(@(.*?)\)', title)
-                        if match:
-                            data["author"] = match.group(1)
                         else:
-                            # Matches "Username on Instagram" (Start of title)
                             match_b = re.search(r'^(.*?)\son\sInstagram', title)
                             if match_b:
                                 parts = match_b.group(1).split(" ")
                                 if len(parts) == 1: data["author"] = parts[0]
                     except: pass
-                # Fallback: Link Pattern
                 if not data["author"]:
                     try:
                         links = page.locator("a[href*='/reels/']").all()
@@ -160,10 +153,8 @@ def scrape_single_url(url):
                                         break
                     except: pass
-                # 4. GET VIEWS (Hop to Profile)
                 if data["author"]:
                     is_video = False
-                    # Check if Reel or Video Post
                     try:
                         og_type = page.locator('meta[property="og:type"]').get_attribute("content")
                         if og_type and "video" in og_type: is_video = True
@@ -171,7 +162,6 @@ def scrape_single_url(url):
                     if data["type"] == "REEL": is_video = True
                     if is_video:
-                        # Hop to Reels Tab
                         profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
                         page.goto(profile_reels_url, wait_until="domcontentloaded")
                         time.sleep(3)
@@ -193,7 +183,6 @@ def scrape_single_url(url):
                     else:
                         data["views"] = "N/A (Photo)"
-                    # Followers (Bonus)
                     try:
                         fol_link = page.locator("a[href*='/followers/']").first
                         if fol_link.count() > 0:
@@ -207,13 +196,12 @@ def scrape_single_url(url):
                     data["status"] = "Failed (No Author)"
         except Exception as e:
-            data["status"] = f"Error"
             print(f"❌ Error: {e}")
         browser.close()
         return data
-# --- ROUTES ---
 @app.route('/')
 def home():
     return render_template('index.html')
@@ -249,4 +237,5 @@ def scrape_api():
     return jsonify(results)
 if __name__ == '__main__':
-    app.run(debug=True, host='0.0.0.0',port=7860, use_reloader=False)

 # ---------------- CONFIGURATION ---------------- #
 SESSION_FILE = "instagram_session.json"
+MAX_WORKERS = 3
 # ----------------------------------------------- #
 def identify_url_type(url):
     if "instagram.com/" in url: return "PROFILE"
     return "UNKNOWN"
+# --- HELPER: RECURSIVE SEARCH ---
 def find_username_in_json(obj):
     if isinstance(obj, dict):
         if "owner" in obj and isinstance(obj["owner"], dict):
+            if "username" in obj["owner"]: return obj["owner"]["username"]
+        if "username" in obj and "is_verified" in obj: return obj["username"]
         for k, v in obj.items():
             if isinstance(v, (dict, list)):
                 res = find_username_in_json(v)
                 if res: return res
     elif isinstance(obj, list):
         for item in obj:
             res = find_username_in_json(item)
             if res: return res
     return None
 def scrape_single_url(url):
     if not url or not url.strip(): return None
     with sync_playwright() as p:
+        # 🔥 STEALTH CONFIGURATION 🔥
+        # 1. Hide the "Automation" flag
+        # 2. Force Headless=True (Required for Server Stability)
+        browser = p.chromium.launch(
+            headless=True,
+            args=["--disable-blink-features=AutomationControlled"]
+        )
+        # 3. Inject "Real Human" User-Agent (Windows Chrome)
+        # This prevents the "N/A" error by tricking Instagram
+        context_args = {
+            "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
+            "viewport": {"width": 1280, "height": 720},
+            "locale": "en-US"
+        }
         if os.path.exists(SESSION_FILE):
+            context = browser.new_context(storage_state=SESSION_FILE, **context_args)
         else:
+            context = browser.new_context(**context_args)
         page = context.new_page()
         print(f"⚡ Processing: {url}")
             "status": "Starting"
         }
         if data["type"] in ["SYSTEM", "UNKNOWN"]:
             data["status"] = "Skipped"
             browser.close()
             return data
         try:
             if data["type"] == "PROFILE":
                 page.goto(url, wait_until="domcontentloaded", timeout=60000)
                 time.sleep(3)
                 data["author"] = url.strip("/").split("/")[-1]
                 data["status"] = "Success"
             elif data["type"] in ["REEL", "POST"]:
                 if "/reel/" in url:
                     shortcode = url.split("/reel/")[1].split("/")[0]
                 else:
                     shortcode = url.split("/p/")[1].split("/")[0]
                 captured_info = {"username": None}
                 def handle_response(response):
                 time.sleep(4)
                 page.remove_listener("response", handle_response)
                 try:
                     meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
                     if meta_desc:
                         if likes_match: data["likes"] = likes_match.group(1)
                 except: pass
                 if captured_info["username"]:
                     data["author"] = captured_info["username"]
                 if not data["author"]:
                     try:
                         title = page.title()
                         match = re.search(r'\(@(.*?)\)', title)
+                        if match: data["author"] = match.group(1)
                         else:
                             match_b = re.search(r'^(.*?)\son\sInstagram', title)
                             if match_b:
                                 parts = match_b.group(1).split(" ")
                                 if len(parts) == 1: data["author"] = parts[0]
                     except: pass
                 if not data["author"]:
                     try:
                         links = page.locator("a[href*='/reels/']").all()
                                         break
                     except: pass
                 if data["author"]:
                     is_video = False
                     try:
                         og_type = page.locator('meta[property="og:type"]').get_attribute("content")
                         if og_type and "video" in og_type: is_video = True
                     if data["type"] == "REEL": is_video = True
                     if is_video:
                         profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
                         page.goto(profile_reels_url, wait_until="domcontentloaded")
                         time.sleep(3)
                     else:
                         data["views"] = "N/A (Photo)"
                     try:
                         fol_link = page.locator("a[href*='/followers/']").first
                         if fol_link.count() > 0:
                     data["status"] = "Failed (No Author)"
         except Exception as e:
+            data["status"] = "Error"
             print(f"❌ Error: {e}")
         browser.close()
         return data
 @app.route('/')
 def home():
     return render_template('index.html')
     return jsonify(results)
 if __name__ == '__main__':
+    # HUGGING FACE REQUIRES PORT 7860
+    app.run(host='0.0.0.0', port=7860)