poemsforaphrodite committed on
Commit
11f5c5f
·
verified ·
1 Parent(s): bbfa3ce

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +139 -115
src/streamlit_app.py CHANGED
@@ -34,7 +34,7 @@ st.set_page_config(
34
  load_dotenv(dotenv_path=".env.local")
35
 
36
  # Setup MongoDB connection
37
- MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://datacollector:***REDACTED***@cluster0.mongodb.net/?retryWrites=true&w=majority")
38
 
39
  # Try to connect to MongoDB, but continue if it fails
40
  try:
@@ -156,7 +156,7 @@ def run_apify_comment_analysis(input):
156
  since_date = input["since"]
157
  until_date = input.get("until", datetime.now().strftime("%Y-%m-%d")) # NEW: Add until date
158
 
159
- # Use fixed date format as specified in the example - FIXED QUERY PARAMETERS
160
  run_input = {
161
  "@": id,
162
  "filter:blue_verified": False,
@@ -180,7 +180,7 @@ def run_apify_comment_analysis(input):
180
  "filter:videos": False,
181
  "filter:vine": False,
182
  "include:nativeretweets": False,
183
- "lang": "en",
184
  "since": since_date + "_00:00:00_UTC",
185
  "to": id,
186
  "until": until_date + "_23:59:59_UTC",
@@ -190,7 +190,8 @@ def run_apify_comment_analysis(input):
190
  "min_replies": 0,
191
  "-min_retweets": 0,
192
  "-min_faves": 0,
193
- "-min_replies": 0
 
194
  }
195
 
196
  # Show loading state
@@ -200,6 +201,9 @@ def run_apify_comment_analysis(input):
200
 
201
  # Fetch ALL data from the run's dataset (no maxItems limit)
202
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
 
 
 
203
 
204
  return data, run["defaultDatasetId"]
205
 
@@ -212,7 +216,7 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
212
  min_retweets = input.get("min_retweets", 0) # NEW: Configurable engagement
213
  min_replies = input.get("min_replies", 0) # NEW: Configurable engagement
214
 
215
- # Use the exact format provided by the user - IMPROVED QUERY PARAMETERS
216
  run_input = {
217
  "filter:blue_verified": False,
218
  "filter:consumer_video": False,
@@ -223,11 +227,11 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
223
  "filter:media": False,
224
  "filter:mentions": False,
225
  "filter:native_video": False,
226
- "filter:nativeretweets": False,
227
  "filter:news": False,
228
  "filter:pro_video": False,
229
  "filter:quote": False,
230
- "filter:replies": False,
231
  "filter:safe": False,
232
  "filter:spaces": False,
233
  "filter:twimg": False,
@@ -235,17 +239,18 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
235
  "filter:videos": False,
236
  "filter:vine": False,
237
  "from": username,
238
- "include:nativeretweets": False,
239
- "lang": "en",
240
  "queryType": "Latest",
241
  "since": since_date + "_00:00:00_UTC",
242
  "until": until_date + "_23:59:59_UTC",
243
- "min_faves": min_faves, # NEW: User configurable, default 0
244
- "min_retweets": min_retweets, # NEW: User configurable, default 0
245
- "min_replies": min_replies, # NEW: User configurable, default 0
246
  "-min_retweets": 0,
247
  "-min_faves": 0,
248
- "-min_replies": 0
 
249
  }
250
 
251
  # Show loading state
@@ -255,6 +260,9 @@ def run_apify_account_analysis(input, disable_engagement_filters=True):
255
 
256
  # Fetch ALL data from the run's dataset (no maxItems limit)
257
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
 
 
 
258
 
259
  return data, run["defaultDatasetId"]
260
 
@@ -923,6 +931,107 @@ def clear_all_tweets_data():
923
 
924
  # --- End Scheduler DB helpers ---
925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926
  # App header
927
  st.title("🐦 Twitter Scraper")
928
 
@@ -1180,7 +1289,13 @@ with tabs[2]:
1180
  if not processed_df.empty:
1181
  account_details = metrics.get("account_details", {})
1182
  followers_info = f" | {account_details.get('followers_count', 'N/A')} followers" if account_details.get('followers_count') else ""
1183
- st.success(f"Found {len(processed_df)} tweets for @{username} from {date_range}{followers_info}.")
 
 
 
 
 
 
1184
  else:
1185
  st.warning(f"No results for @{username} in the specified date range.")
1186
 
@@ -1535,11 +1650,21 @@ st.sidebar.info(
1535
  - Deleted tweets
1536
  - API rate limiting
1537
  - Account restrictions
 
 
1538
 
1539
  💡 **Tips for better results:**
1540
  - Use appropriate date ranges
1541
  - Keep engagement filters at 0 (default) for maximum capture
1542
  - Use broader time periods for more comprehensive data
 
 
 
 
 
 
 
 
1543
  """
1544
  )
1545
 
@@ -1620,107 +1745,6 @@ try:
1620
  except:
1621
  pass
1622
 
1623
- def run_apify_followers_analysis(input):
1624
- """
1625
- Fetch followers/following data using Apify actor
1626
- This is a placeholder for when the followers actor works
1627
- """
1628
- username = input["username"]
1629
- relationship_type = input.get("relationship_type", "followers") # "followers" or "following"
1630
- max_items = input.get("max_items", 100)
1631
-
1632
- # Try the followers actor first
1633
- try:
1634
- if relationship_type == "followers":
1635
- run_input = {
1636
- "twitterHandles": [username],
1637
- "maxItems": max_items,
1638
- "getFollowers": True,
1639
- "getFollowing": False,
1640
- "getRetweeters": False,
1641
- "includeUnavailableUsers": False,
1642
- }
1643
- else: # following
1644
- run_input = {
1645
- "twitterHandles": [username],
1646
- "maxItems": max_items,
1647
- "getFollowers": False,
1648
- "getFollowing": True,
1649
- "getRetweeters": False,
1650
- "includeUnavailableUsers": False,
1651
- }
1652
-
1653
- with st.spinner(f"Fetching {relationship_type} for @{username}..."):
1654
- # Try the actor you specified
1655
- run = client.actor("V38PZzpEgOfeeWvZY").call(run_input=run_input)
1656
- data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
1657
-
1658
- if data:
1659
- return data, run["defaultDatasetId"]
1660
- else:
1661
- # Fallback: Use alternative followers scraper
1662
- return run_apify_followers_fallback(input)
1663
-
1664
- except Exception as e:
1665
- st.warning(f"Primary followers actor failed: {e}")
1666
- # Fallback to alternative scraper
1667
- return run_apify_followers_fallback(input)
1668
-
1669
- def run_apify_followers_fallback(input):
1670
- """
1671
- Fallback method using alternative followers scraper
1672
- """
1673
- username = input["username"]
1674
- relationship_type = input.get("relationship_type", "followers")
1675
- max_items = input.get("max_items", 100)
1676
-
1677
- try:
1678
- # Use curious_coder/twitter-scraper as fallback
1679
- run_input = {
1680
- "profileUrl": f"https://twitter.com/{username}",
1681
- "friendshipType": relationship_type, # "followers" or "following"
1682
- "count": max_items,
1683
- "minDelay": 1,
1684
- "maxDelay": 3
1685
- }
1686
-
1687
- with st.spinner(f"Fetching {relationship_type} for @{username} (fallback method)..."):
1688
- run = client.actor("curious_coder/twitter-scraper").call(run_input=run_input)
1689
- data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
1690
- return data, run["defaultDatasetId"]
1691
-
1692
- except Exception as e:
1693
- st.error(f"All followers scrapers failed: {e}")
1694
- return [], None
1695
-
1696
- def process_followers_data(data, relationship_type="followers"):
1697
- """
1698
- Process followers/following data into a structured format
1699
- """
1700
- processed_data = []
1701
-
1702
- for item in data:
1703
- # Handle different data structures from different actors
1704
- username = item.get('username', item.get('screen_name', item.get('userName', '')))
1705
- name = item.get('name', item.get('displayName', ''))
1706
-
1707
- processed_item = {
1708
- "Username": username,
1709
- "Name": name,
1710
- "Bio": item.get('description', item.get('bio', '')),
1711
- "Location": item.get('location', ''),
1712
- "Followers": item.get('followers_count', item.get('followersCount', item.get('followers', 0))),
1713
- "Following": item.get('following_count', item.get('followingCount', item.get('following', 0))),
1714
- "Tweets": item.get('tweet_count', item.get('statusesCount', item.get('statuses_count', 0))),
1715
- "Verified": item.get('verified', item.get('isVerified', False)),
1716
- "Profile_Image": item.get('profile_image_url', item.get('profileImageUrl', '')),
1717
- "Created_At": item.get('created_at', item.get('createdAt', '')),
1718
- "URL": item.get('url', f"https://twitter.com/{username}"),
1719
- "Relationship_Type": relationship_type
1720
- }
1721
- processed_data.append(processed_item)
1722
-
1723
- return pd.DataFrame(processed_data)
1724
 
1725
  # Footer with attribution
1726
  st.divider()
 
34
  load_dotenv(dotenv_path=".env.local")
35
 
36
  # Setup MongoDB connection
37
+ MONGODB_URI = os.getenv("MONGODB_URI")
38
 
39
  # Try to connect to MongoDB, but continue if it fails
40
  try:
 
156
  since_date = input["since"]
157
  until_date = input.get("until", datetime.now().strftime("%Y-%m-%d")) # NEW: Add until date
158
 
159
+ # ENHANCED: Improved query parameters for better comment capture
160
  run_input = {
161
  "@": id,
162
  "filter:blue_verified": False,
 
180
  "filter:videos": False,
181
  "filter:vine": False,
182
  "include:nativeretweets": False,
183
+ "lang": "", # CHANGED: Remove language filter for all comments
184
  "since": since_date + "_00:00:00_UTC",
185
  "to": id,
186
  "until": until_date + "_23:59:59_UTC",
 
190
  "min_replies": 0,
191
  "-min_retweets": 0,
192
  "-min_faves": 0,
193
+ "-min_replies": 0,
194
+ "sort": "time" # ADDED: Sort by time for chronological order
195
  }
196
 
197
  # Show loading state
 
201
 
202
  # Fetch ALL data from the run's dataset (no maxItems limit)
203
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
204
+
205
+ # ENHANCED: Log query details for debugging
206
+ st.info(f"🔍 Query Details: to:@{id} since:{since_date} until:{until_date} | Raw results: {len(data)} comments")
207
 
208
  return data, run["defaultDatasetId"]
209
 
 
216
  min_retweets = input.get("min_retweets", 0) # NEW: Configurable engagement
217
  min_replies = input.get("min_replies", 0) # NEW: Configurable engagement
218
 
219
+ # ENHANCED: More comprehensive query parameters for better accuracy
220
  run_input = {
221
  "filter:blue_verified": False,
222
  "filter:consumer_video": False,
 
227
  "filter:media": False,
228
  "filter:mentions": False,
229
  "filter:native_video": False,
230
+ "filter:nativeretweets": False, # Include retweets for accurate count
231
  "filter:news": False,
232
  "filter:pro_video": False,
233
  "filter:quote": False,
234
+ "filter:replies": False, # Include replies for accurate count
235
  "filter:safe": False,
236
  "filter:spaces": False,
237
  "filter:twimg": False,
 
239
  "filter:videos": False,
240
  "filter:vine": False,
241
  "from": username,
242
+ "include:nativeretweets": True, # CHANGED: Include retweets to match Twitter counts
243
+ "lang": "", # CHANGED: Remove language filter to capture all tweets
244
  "queryType": "Latest",
245
  "since": since_date + "_00:00:00_UTC",
246
  "until": until_date + "_23:59:59_UTC",
247
+ "min_faves": min_faves,
248
+ "min_retweets": min_retweets,
249
+ "min_replies": min_replies,
250
  "-min_retweets": 0,
251
  "-min_faves": 0,
252
+ "-min_replies": 0,
253
+ "sort": "time" # ADDED: Sort by time for chronological order
254
  }
255
 
256
  # Show loading state
 
260
 
261
  # Fetch ALL data from the run's dataset (no maxItems limit)
262
  data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
263
+
264
+ # ENHANCED: Log query details for debugging
265
+ st.info(f"🔍 Query Details: from:{username} since:{since_date} until:{until_date} | Raw results: {len(data)} tweets")
266
 
267
  return data, run["defaultDatasetId"]
268
 
 
931
 
932
  # --- End Scheduler DB helpers ---
933
 
934
def run_apify_followers_analysis(input):
    """
    Fetch followers/following data for a Twitter account via the primary
    Apify actor.

    Reads "username", optional "relationship_type" ("followers" or
    "following", default "followers") and optional "max_items" (default
    100) from *input*. Returns (items, dataset_id); falls back to
    run_apify_followers_fallback when the primary actor raises or
    returns no items.
    """
    username = input["username"]
    relationship_type = input.get("relationship_type", "followers")  # "followers" or "following"
    max_items = input.get("max_items", 100)

    # One request payload for both directions; only the two direction
    # flags depend on the requested relationship type.
    want_followers = relationship_type == "followers"
    run_input = {
        "twitterHandles": [username],
        "maxItems": max_items,
        "getFollowers": want_followers,
        "getFollowing": not want_followers,
        "getRetweeters": False,
        "includeUnavailableUsers": False,
    }

    try:
        with st.spinner(f"Fetching {relationship_type} for @{username}..."):
            # Primary followers actor (ID supplied by the user).
            run = client.actor("V38PZzpEgOfeeWvZY").call(run_input=run_input)
            data = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if data:
            return data, run["defaultDatasetId"]
        # Empty result set: try the alternative scraper instead.
        return run_apify_followers_fallback(input)

    except Exception as e:
        st.warning(f"Primary followers actor failed: {e}")
        # Fall back to the alternative scraper on any error.
        return run_apify_followers_fallback(input)
978
+
979
def run_apify_followers_fallback(input):
    """
    Fallback followers/following fetch using the
    curious_coder/twitter-scraper Apify actor.

    Reads the same keys from *input* as the primary fetcher. Returns
    (items, dataset_id); on any failure shows an error in the UI and
    returns ([], None) instead of raising.
    """
    username = input["username"]
    relationship_type = input.get("relationship_type", "followers")
    max_items = input.get("max_items", 100)

    run_input = {
        "profileUrl": f"https://twitter.com/{username}",
        "friendshipType": relationship_type,  # "followers" or "following"
        "count": max_items,
        "minDelay": 1,
        "maxDelay": 3,
    }

    try:
        with st.spinner(f"Fetching {relationship_type} for @{username} (fallback method)..."):
            run = client.actor("curious_coder/twitter-scraper").call(run_input=run_input)
            items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
        return items, run["defaultDatasetId"]

    except Exception as e:
        st.error(f"All followers scrapers failed: {e}")
        return [], None
1005
+
1006
def process_followers_data(data, relationship_type="followers"):
    """
    Normalize follower/following records from different Apify actors
    into a single pandas DataFrame with a fixed column set.

    Each actor uses different key names for the same fields, so every
    column is resolved through an ordered list of candidate keys.
    """
    def first_present(item, keys, default):
        # Mirror chained dict.get() fallbacks: return the stored value
        # for the first key that exists, even if that value is falsy.
        for key in keys:
            if key in item:
                return item[key]
        return default

    rows = []
    for item in data:
        handle = first_present(item, ("username", "screen_name", "userName"), "")
        rows.append({
            "Username": handle,
            "Name": first_present(item, ("name", "displayName"), ""),
            "Bio": first_present(item, ("description", "bio"), ""),
            "Location": item.get("location", ""),
            "Followers": first_present(item, ("followers_count", "followersCount", "followers"), 0),
            "Following": first_present(item, ("following_count", "followingCount", "following"), 0),
            "Tweets": first_present(item, ("tweet_count", "statusesCount", "statuses_count"), 0),
            "Verified": first_present(item, ("verified", "isVerified"), False),
            "Profile_Image": first_present(item, ("profile_image_url", "profileImageUrl"), ""),
            "Created_At": first_present(item, ("created_at", "createdAt"), ""),
            "URL": item.get("url", f"https://twitter.com/{handle}"),
            "Relationship_Type": relationship_type,
        })

    return pd.DataFrame(rows)
1034
+
1035
  # App header
1036
  st.title("🐦 Twitter Scraper")
1037
 
 
1289
  if not processed_df.empty:
1290
  account_details = metrics.get("account_details", {})
1291
  followers_info = f" | {account_details.get('followers_count', 'N/A')} followers" if account_details.get('followers_count') else ""
1292
+ following_info = f" | {account_details.get('following_count', 'N/A')} following" if account_details.get('following_count') else ""
1293
+ st.success(f"Found {len(processed_df)} tweets for @{username} from {date_range}{followers_info}{following_info}.")
1294
+
1295
+ # ENHANCED: Debug mode for account details
1296
+ if account_details:
1297
+ with st.expander(f"🔍 Debug Account Info for @{username}"):
1298
+ st.json(account_details)
1299
  else:
1300
  st.warning(f"No results for @{username} in the specified date range.")
1301
 
 
1650
  - Deleted tweets
1651
  - API rate limiting
1652
  - Account restrictions
1653
+ - Language filtering (now disabled by default)
1654
+ - Time zone differences (API uses UTC, display shows IST)
1655
 
1656
  💡 **Tips for better results:**
1657
  - Use appropriate date ranges
1658
  - Keep engagement filters at 0 (default) for maximum capture
1659
  - Use broader time periods for more comprehensive data
1660
+ - Check the debug info shown with query results
1661
+ - Compare against multiple time ranges for consistency
1662
+
1663
+ 🔧 **Troubleshooting discrepancies:**
1664
+ - Twitter's web interface may include/exclude different content types
1665
+ - Retweets are now included by default for better accuracy
1666
+ - Language filter removed to capture all tweets
1667
+ - Check the raw results count vs processed count
1668
  """
1669
  )
1670
 
 
1745
  except:
1746
  pass
1747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1748
 
1749
  # Footer with attribution
1750
  st.divider()