poemsforaphrodite committed on
Commit
11f5c5f
·
verified ·
1 Parent(s): bbfa3ce

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +139 -115
src/streamlit_app.py CHANGED
@@ -34,7 +34,7 @@ st.set_page_config(
34
  load_dotenv(dotenv_path=".env.local")
35
 
36
  # Setup MongoDB connection
37
- MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://datacollector:***REDACTED***@cluster0.mongodb.net/?retryWrites=true&w=majority")
38
 
39
  # Try to connect to MongoDB, but continue if it fails
40
  try:
@@ -156,7 +156,7 @@ def run_apify_comment_analysis(input):
156
  since_date = input["since"]
157
  until_date = input.get("until", datetime.now().strftime("%Y-%m-%d")) # NEW: Add until date
158
 
159
- # Use fixed date format as specified in the example - FIXED QUERY PARAMETERS
160
  run_input = {
161
  "@": id,
162
  "filter:blue_verified": False,
@@ -180,7 +180,7 @@ def run_apify_comment_analysis(input):
180
  "filter:videos": False,
181
  "filter:vine": False,
182
  "include:nativeretweets": False,
183
- "lang": "en",
184
  "since": since_date + "_00:00:00_UTC",
185
  "to": id,
186
  "until": until_date + "_23:59:59_UTC",
@@ -190,7 +190,8 @@ def run_apify_comment_analysis(input):
190
  "min_replies": 0,
191
  "-min_retweets": 0,
192
  "-min_faves": 0,
193
- "-min_replies": 0
 
194
  }
195
 
196
  # Show loading state
@@ -200,6 +201,9 @@ def run_apify_comment_analysis(input):
200
 
201
  # Fetch ALL data from the run's dataset (no maxItems limit)
202
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
 
 
 
203
 
204
  return data, run["defaultDatasetId"]
205
 
@@ -212,7 +216,7 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
212
  min_retweets = input.get("min_retweets", 0) # NEW: Configurable engagement
213
  min_replies = input.get("min_replies", 0) # NEW: Configurable engagement
214
 
215
- # Use the exact format provided by the user - IMPROVED QUERY PARAMETERS
216
  run_input = {
217
  "filter:blue_verified": False,
218
  "filter:consumer_video": False,
@@ -223,11 +227,11 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
223
  "filter:media": False,
224
  "filter:mentions": False,
225
  "filter:native_video": False,
226
- "filter:nativeretweets": False,
227
  "filter:news": False,
228
  "filter:pro_video": False,
229
  "filter:quote": False,
230
- "filter:replies": False,
231
  "filter:safe": False,
232
  "filter:spaces": False,
233
  "filter:twimg": False,
@@ -235,17 +239,18 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
235
  "filter:videos": False,
236
  "filter:vine": False,
237
  "from": username,
238
- "include:nativeretweets": False,
239
- "lang": "en",
240
  "queryType": "Latest",
241
  "since": since_date + "_00:00:00_UTC",
242
  "until": until_date + "_23:59:59_UTC",
243
- "min_faves": min_faves, # NEW: User configurable, default 0
244
- "min_retweets": min_retweets, # NEW: User configurable, default 0
245
- "min_replies": min_replies, # NEW: User configurable, default 0
246
  "-min_retweets": 0,
247
  "-min_faves": 0,
248
- "-min_replies": 0
 
249
  }
250
 
251
  # Show loading state
@@ -255,6 +260,9 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
255
 
256
  # Fetch ALL data from the run's dataset (no maxItems limit)
257
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
 
 
 
258
 
259
  return data, run["defaultDatasetId"]
260
 
@@ -923,6 +931,107 @@ def clear_all_tweets_data():
923
 
924
  # --- End Scheduler DB helpers ---
925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926
  # App header
927
  st.title("🐦 Twitter Scraper")
928
 
@@ -1180,7 +1289,13 @@ with tabs[2]:
1180
  if not processed_df.empty:
1181
  account_details = metrics.get("account_details", {})
1182
  followers_info = f" | {account_details.get('followers_count', 'N/A')} followers" if account_details.get('followers_count') else ""
1183
- st.success(f"Found {len(processed_df)} tweets for @{username} from {date_range}{followers_info}.")
 
 
 
 
 
 
1184
  else:
1185
  st.warning(f"No results for @{username} in the specified date range.")
1186
 
@@ -1535,11 +1650,21 @@ st.sidebar.info(
1535
  - Deleted tweets
1536
  - API rate limiting
1537
  - Account restrictions
 
 
1538
 
1539
  💡 **Tips for better results:**
1540
  - Use appropriate date ranges
1541
  - Keep engagement filters at 0 (default) for maximum capture
1542
  - Use broader time periods for more comprehensive data
 
 
 
 
 
 
 
 
1543
  """
1544
  )
1545
 
@@ -1620,107 +1745,6 @@ try:
1620
  except:
1621
  pass
1622
 
1623
- def run_apify_followers_analysis(input):
1624
- """
1625
- Fetch followers/following data using Apify actor
1626
- This is a placeholder for when the followers actor works
1627
- """
1628
- username = input["username"]
1629
- relationship_type = input.get("relationship_type", "followers") # "followers" or "following"
1630
- max_items = input.get("max_items", 100)
1631
-
1632
- # Try the followers actor first
1633
- try:
1634
- if relationship_type == "followers":
1635
- run_input = {
1636
- "twitterHandles": [username],
1637
- "maxItems": max_items,
1638
- "getFollowers": True,
1639
- "getFollowing": False,
1640
- "getRetweeters": False,
1641
- "includeUnavailableUsers": False,
1642
- }
1643
- else: # following
1644
- run_input = {
1645
- "twitterHandles": [username],
1646
- "maxItems": max_items,
1647
- "getFollowers": False,
1648
- "getFollowing": True,
1649
- "getRetweeters": False,
1650
- "includeUnavailableUsers": False,
1651
- }
1652
-
1653
- with st.spinner(f"Fetching {relationship_type} for @{username}..."):
1654
- # Try the actor you specified
1655
- run = client.actor("V38PZzpEgOfeeWvZY").call(run_input=run_input)
1656
- data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
1657
-
1658
- if data:
1659
- return data, run["defaultDatasetId"]
1660
- else:
1661
- # Fallback: Use alternative followers scraper
1662
- return run_apify_followers_fallback(input)
1663
-
1664
- except Exception as e:
1665
- st.warning(f"Primary followers actor failed: {e}")
1666
- # Fallback to alternative scraper
1667
- return run_apify_followers_fallback(input)
1668
-
1669
- def run_apify_followers_fallback(input):
1670
- """
1671
- Fallback method using alternative followers scraper
1672
- """
1673
- username = input["username"]
1674
- relationship_type = input.get("relationship_type", "followers")
1675
- max_items = input.get("max_items", 100)
1676
-
1677
- try:
1678
- # Use curious_coder/twitter-scraper as fallback
1679
- run_input = {
1680
- "profileUrl": f"https://twitter.com/{username}",
1681
- "friendshipType": relationship_type, # "followers" or "following"
1682
- "count": max_items,
1683
- "minDelay": 1,
1684
- "maxDelay": 3
1685
- }
1686
-
1687
- with st.spinner(f"Fetching {relationship_type} for @{username} (fallback method)..."):
1688
- run = client.actor("curious_coder/twitter-scraper").call(run_input=run_input)
1689
- data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
1690
- return data, run["defaultDatasetId"]
1691
-
1692
- except Exception as e:
1693
- st.error(f"All followers scrapers failed: {e}")
1694
- return [], None
1695
-
1696
- def process_followers_data(data, relationship_type="followers"):
1697
- """
1698
- Process followers/following data into a structured format
1699
- """
1700
- processed_data = []
1701
-
1702
- for item in data:
1703
- # Handle different data structures from different actors
1704
- username = item.get('username', item.get('screen_name', item.get('userName', '')))
1705
- name = item.get('name', item.get('displayName', ''))
1706
-
1707
- processed_item = {
1708
- "Username": username,
1709
- "Name": name,
1710
- "Bio": item.get('description', item.get('bio', '')),
1711
- "Location": item.get('location', ''),
1712
- "Followers": item.get('followers_count', item.get('followersCount', item.get('followers', 0))),
1713
- "Following": item.get('following_count', item.get('followingCount', item.get('following', 0))),
1714
- "Tweets": item.get('tweet_count', item.get('statusesCount', item.get('statuses_count', 0))),
1715
- "Verified": item.get('verified', item.get('isVerified', False)),
1716
- "Profile_Image": item.get('profile_image_url', item.get('profileImageUrl', '')),
1717
- "Created_At": item.get('created_at', item.get('createdAt', '')),
1718
- "URL": item.get('url', f"https://twitter.com/{username}"),
1719
- "Relationship_Type": relationship_type
1720
- }
1721
- processed_data.append(processed_item)
1722
-
1723
- return pd.DataFrame(processed_data)
1724
 
1725
  # Footer with attribution
1726
  st.divider()
 
34
  load_dotenv(dotenv_path=".env.local")
35
 
36
  # Setup MongoDB connection
37
+ MONGODB_URI = os.getenv("MONGODB_URI")
38
 
39
  # Try to connect to MongoDB, but continue if it fails
40
  try:
 
156
  since_date = input["since"]
157
  until_date = input.get("until", datetime.now().strftime("%Y-%m-%d")) # NEW: Add until date
158
 
159
+ # ENHANCED: Improved query parameters for better comment capture
160
  run_input = {
161
  "@": id,
162
  "filter:blue_verified": False,
 
180
  "filter:videos": False,
181
  "filter:vine": False,
182
  "include:nativeretweets": False,
183
+ "lang": "", # CHANGED: Remove language filter for all comments
184
  "since": since_date + "_00:00:00_UTC",
185
  "to": id,
186
  "until": until_date + "_23:59:59_UTC",
 
190
  "min_replies": 0,
191
  "-min_retweets": 0,
192
  "-min_faves": 0,
193
+ "-min_replies": 0,
194
+ "sort": "time" # ADDED: Sort by time for chronological order
195
  }
196
 
197
  # Show loading state
 
201
 
202
  # Fetch ALL data from the run's dataset (no maxItems limit)
203
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
204
+
205
+ # ENHANCED: Log query details for debugging
206
+ st.info(f"🔍 Query Details: to:@{id} since:{since_date} until:{until_date} | Raw results: {len(data)} comments")
207
 
208
  return data, run["defaultDatasetId"]
209
 
 
216
  min_retweets = input.get("min_retweets", 0) # NEW: Configurable engagement
217
  min_replies = input.get("min_replies", 0) # NEW: Configurable engagement
218
 
219
+ # ENHANCED: More comprehensive query parameters for better accuracy
220
  run_input = {
221
  "filter:blue_verified": False,
222
  "filter:consumer_video": False,
 
227
  "filter:media": False,
228
  "filter:mentions": False,
229
  "filter:native_video": False,
230
+ "filter:nativeretweets": False, # Include retweets for accurate count
231
  "filter:news": False,
232
  "filter:pro_video": False,
233
  "filter:quote": False,
234
+ "filter:replies": False, # Include replies for accurate count
235
  "filter:safe": False,
236
  "filter:spaces": False,
237
  "filter:twimg": False,
 
239
  "filter:videos": False,
240
  "filter:vine": False,
241
  "from": username,
242
+ "include:nativeretweets": True, # CHANGED: Include retweets to match Twitter counts
243
+ "lang": "", # CHANGED: Remove language filter to capture all tweets
244
  "queryType": "Latest",
245
  "since": since_date + "_00:00:00_UTC",
246
  "until": until_date + "_23:59:59_UTC",
247
+ "min_faves": min_faves,
248
+ "min_retweets": min_retweets,
249
+ "min_replies": min_replies,
250
  "-min_retweets": 0,
251
  "-min_faves": 0,
252
+ "-min_replies": 0,
253
+ "sort": "time" # ADDED: Sort by time for chronological order
254
  }
255
 
256
  # Show loading state
 
260
 
261
  # Fetch ALL data from the run's dataset (no maxItems limit)
262
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
263
+
264
+ # ENHANCED: Log query details for debugging
265
+ st.info(f"🔍 Query Details: from:{username} since:{since_date} until:{until_date} | Raw results: {len(data)} tweets")
266
 
267
  return data, run["defaultDatasetId"]
268
 
 
931
 
932
  # --- End Scheduler DB helpers ---
933
 
934
def run_apify_followers_analysis(input):
    """
    Fetch followers/following data for a Twitter account via the primary
    Apify actor.

    Reads "username", optional "relationship_type" ("followers" or
    "following", default "followers") and optional "max_items" (default
    100) from *input*. Returns (items, dataset_id); falls back to
    run_apify_followers_fallback when the primary actor raises or
    returns no items.
    """
    username = input["username"]
    relationship_type = input.get("relationship_type", "followers")  # "followers" or "following"
    max_items = input.get("max_items", 100)

    # One request payload for both directions; only the two direction
    # flags depend on the requested relationship type.
    want_followers = relationship_type == "followers"
    run_input = {
        "twitterHandles": [username],
        "maxItems": max_items,
        "getFollowers": want_followers,
        "getFollowing": not want_followers,
        "getRetweeters": False,
        "includeUnavailableUsers": False,
    }

    try:
        with st.spinner(f"Fetching {relationship_type} for @{username}..."):
            # Primary followers actor (ID supplied by the user).
            run = client.actor("V38PZzpEgOfeeWvZY").call(run_input=run_input)
            data = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if data:
            return data, run["defaultDatasetId"]
        # Empty result set: try the alternative scraper instead.
        return run_apify_followers_fallback(input)

    except Exception as e:
        st.warning(f"Primary followers actor failed: {e}")
        # Fall back to the alternative scraper on any error.
        return run_apify_followers_fallback(input)
978
+
979
def run_apify_followers_fallback(input):
    """
    Fallback followers/following fetch using the
    curious_coder/twitter-scraper Apify actor.

    Reads the same keys from *input* as the primary fetcher. Returns
    (items, dataset_id); on any failure shows an error in the UI and
    returns ([], None) instead of raising.
    """
    username = input["username"]
    relationship_type = input.get("relationship_type", "followers")
    max_items = input.get("max_items", 100)

    run_input = {
        "profileUrl": f"https://twitter.com/{username}",
        "friendshipType": relationship_type,  # "followers" or "following"
        "count": max_items,
        "minDelay": 1,
        "maxDelay": 3,
    }

    try:
        with st.spinner(f"Fetching {relationship_type} for @{username} (fallback method)..."):
            run = client.actor("curious_coder/twitter-scraper").call(run_input=run_input)
            items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
        return items, run["defaultDatasetId"]

    except Exception as e:
        st.error(f"All followers scrapers failed: {e}")
        return [], None
1005
+
1006
def process_followers_data(data, relationship_type="followers"):
    """
    Normalize follower/following records from different Apify actors
    into a single pandas DataFrame with a fixed column set.

    Each actor uses different key names for the same fields, so every
    column is resolved through an ordered list of candidate keys.
    """
    def first_present(item, keys, default):
        # Mirror chained dict.get() fallbacks: return the stored value
        # for the first key that exists, even if that value is falsy.
        for key in keys:
            if key in item:
                return item[key]
        return default

    rows = []
    for item in data:
        handle = first_present(item, ("username", "screen_name", "userName"), "")
        rows.append({
            "Username": handle,
            "Name": first_present(item, ("name", "displayName"), ""),
            "Bio": first_present(item, ("description", "bio"), ""),
            "Location": item.get("location", ""),
            "Followers": first_present(item, ("followers_count", "followersCount", "followers"), 0),
            "Following": first_present(item, ("following_count", "followingCount", "following"), 0),
            "Tweets": first_present(item, ("tweet_count", "statusesCount", "statuses_count"), 0),
            "Verified": first_present(item, ("verified", "isVerified"), False),
            "Profile_Image": first_present(item, ("profile_image_url", "profileImageUrl"), ""),
            "Created_At": first_present(item, ("created_at", "createdAt"), ""),
            "URL": item.get("url", f"https://twitter.com/{handle}"),
            "Relationship_Type": relationship_type,
        })

    return pd.DataFrame(rows)
1034
+
1035
  # App header
1036
  st.title("🐦 Twitter Scraper")
1037
 
 
1289
  if not processed_df.empty:
1290
  account_details = metrics.get("account_details", {})
1291
  followers_info = f" | {account_details.get('followers_count', 'N/A')} followers" if account_details.get('followers_count') else ""
1292
+ following_info = f" | {account_details.get('following_count', 'N/A')} following" if account_details.get('following_count') else ""
1293
+ st.success(f"Found {len(processed_df)} tweets for @{username} from {date_range}{followers_info}{following_info}.")
1294
+
1295
+ # ENHANCED: Debug mode for account details
1296
+ if account_details:
1297
+ with st.expander(f"🔍 Debug Account Info for @{username}"):
1298
+ st.json(account_details)
1299
  else:
1300
  st.warning(f"No results for @{username} in the specified date range.")
1301
 
 
1650
  - Deleted tweets
1651
  - API rate limiting
1652
  - Account restrictions
1653
+ - Language filtering (now disabled by default)
1654
+ - Time zone differences (API uses UTC, display shows IST)
1655
 
1656
  💡 **Tips for better results:**
1657
  - Use appropriate date ranges
1658
  - Keep engagement filters at 0 (default) for maximum capture
1659
  - Use broader time periods for more comprehensive data
1660
+ - Check the debug info shown with query results
1661
+ - Compare against multiple time ranges for consistency
1662
+
1663
+ 🔧 **Troubleshooting discrepancies:**
1664
+ - Twitter's web interface may include/exclude different content types
1665
+ - Retweets are now included by default for better accuracy
1666
+ - Language filter removed to capture all tweets
1667
+ - Check the raw results count vs processed count
1668
  """
1669
  )
1670
 
 
1745
  except:
1746
  pass
1747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1748
 
1749
  # Footer with attribution
1750
  st.divider()