poemsforaphrodite committed
Commit 303bb41 · verified · Parent: ff34b27

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +106 -43
src/streamlit_app.py CHANGED
@@ -2,10 +2,11 @@ import os
 import streamlit as st
 import pandas as pd
 import json
-from datetime import datetime
+from datetime import datetime, timedelta
 import plotly.express as px
 import numpy as np
 from collections import Counter
+import pytz
 
 # Try to import Google Generative AI, but handle it gracefully if not installed
 try:
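
Note: pytz is the new runtime dependency backing the IST conversion introduced later in this diff. (timedelta also joins the import, though no use of it appears in the hunks below.) On Python 3.9+ the stdlib zoneinfo module could serve the same purpose without an extra package — a minimal sketch of that alternative, not what this commit uses:

    from datetime import datetime
    from zoneinfo import ZoneInfo  # stdlib since Python 3.9

    # Attach UTC to a known instant, then convert to IST (UTC+05:30)
    utc_dt = datetime(2025, 1, 15, 12, 0, 0, tzinfo=ZoneInfo("UTC"))
    ist_dt = utc_dt.astimezone(ZoneInfo("Asia/Kolkata"))
    print(ist_dt)  # 2025-01-15 17:30:00+05:30
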
@@ -53,7 +54,7 @@ def get_gemini_summary(tweets_data, context=""):
 
     all_tweets = "\n\n".join(tweets_text)
 
-    # Create a prompt for Gemini
+    # Create a prompt for Gemini with enhanced analysis requirements
     prompt = f"""
     {context}
 
@@ -61,14 +62,24 @@ def get_gemini_summary(tweets_data, context=""):
 
     {all_tweets}
 
-    Please provide a comprehensive summary of these tweets, including:
+    Please provide a comprehensive analysis of these tweets, including:
+
     1. Main themes and topics discussed
     2. Overall sentiment
     3. Key insights or patterns
     4. Most engaging content
-    5. Any recommendations based on the analysis
 
-    Format the summary in a clear, structured way with bullet points where appropriate.
+    Additionally, please provide these specific analyses:
+
+    5. Political/Brand Affiliation Analysis: Analyze which party or brand the reply tweeters belong to. Identify if there are instances where people from the same party/brand are tweeting negatively about their own party/brand.
+
+    6. Top 10 Positive Tweets: List the most positive tweets with their tweet numbers and brief explanation.
+
+    7. Top 10 Negative Tweets: List the most negative tweets with their tweet numbers and brief explanation.
+
+    8. Top 10 Recommendations: Provide specific suggestions and recommendations to help the party or brand improve their messaging, engagement, or content strategy based on the tweet analysis.
+
+    Format the analysis in a clear, structured way with bullet points where appropriate and clear section headings.
     """
 
     # Generate summary using Gemini
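
Note: this hunk only rewrites the prompt text; the actual model call sits outside the diff. For orientation, a typical google-generativeai invocation looks roughly like the sketch below — the model name, key lookup, and variable names are assumptions, not part of this commit:

    import os
    import google.generativeai as genai

    genai.configure(api_key=os.environ["GEMINI_API_KEY"])  # key source assumed
    model = genai.GenerativeModel("gemini-1.5-flash")       # model name assumed
    response = model.generate_content(prompt)               # prompt as built above
    summary = response.text
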
@@ -85,34 +96,37 @@ def run_apify_comment_analysis(input):
     since_date = input["since"]
     max_items = input["max_items"]
 
+    # Get current date for the "until" parameter
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
     # Use fixed date format as specified in the example
     run_input = {
         "@": id,
-        "filter:blue_verified": False,
-        "filter:consumer_video": False,
-        "filter:has_engagement": True,
-        "filter:hashtags": False,
-        "filter:images": False,
-        "filter:links": False,
-        "filter:media": False,
-        "filter:mentions": False,
-        "filter:native_video": False,
-        "filter:nativeretweets": False,
-        "filter:news": False,
-        "filter:pro_video": False,
-        "filter:quote": False,
-        "filter:replies": False,
-        "filter:safe": False,
-        "filter:spaces": False,
-        "filter:twimg": False,
-        "filter:verified": False,
-        "filter:videos": False,
-        "filter:vine": False,
-        "include:nativeretweets": False,
-        "lang": "en",
+        "filter:blue_verified": False,
+        "filter:consumer_video": False,
+        "filter:has_engagement": True,
+        "filter:hashtags": False,
+        "filter:images": False,
+        "filter:links": False,
+        "filter:media": False,
+        "filter:mentions": False,
+        "filter:native_video": False,
+        "filter:nativeretweets": False,
+        "filter:news": False,
+        "filter:pro_video": False,
+        "filter:quote": False,
+        "filter:replies": False,
+        "filter:safe": False,
+        "filter:spaces": False,
+        "filter:twimg": False,
+        "filter:verified": False,
+        "filter:videos": False,
+        "filter:vine": False,
+        "include:nativeretweets": False,
+        "lang": "en",
         "since": since_date + "_23:59:59_UTC",
-        "to": id,
-        "until": "2025-12-31_23:59:59_UTC",
+        "to": id,
+        "until": current_date + "_23:59:59_UTC",
         "maxItems": max_items,
         "queryType": "Latest",
         "min_retweets": 0,
@@ -130,6 +144,10 @@ def run_apify_comment_analysis(input):
 
     # Fetch data from the run's dataset
     data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
+
+    # Ensure we don't exceed max_items
+    if len(data) > max_items:
+        data = data[:max_items]
 
     return data, run["defaultDatasetId"]
 
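Note: the cap is applied after list(...) has already materialized the whole dataset. Assuming iterate_items() yields items lazily, the same cap could avoid over-fetching by stopping the iterator early — a sketch using client and run from the surrounding function:

    from itertools import islice

    # Take at most max_items items, then stop consuming the iterator
    data = list(islice(client.dataset(run["defaultDatasetId"]).iterate_items(), max_items))
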
@@ -139,6 +157,9 @@ def run_apify_account_analysis(input):
     since_date = input["since"]
     max_items = input["max_items"]
 
+    # Get current date for the "until" parameter
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
     # Use the exact format provided by the user
     run_input = {
         "filter:blue_verified": False,
@@ -168,7 +189,7 @@ def run_apify_account_analysis(input):
         "min_faves": 500,
         "queryType": "Latest",
         "since": since_date + "_23:59:59_UTC",
-        "until": "2025-12-31_23:59:59_UTC",
+        "until": current_date + "_23:59:59_UTC",
         "min_retweets": 0,
         "min_replies": 0,
         "-min_retweets": 0,
@@ -180,9 +201,13 @@ def run_apify_account_analysis(input):
     with st.spinner("Fetching tweets from Twitter..."):
         # Run the Actor and wait for it to finish
         run = client.actor("CJdippxWmn9uRfooo").call(run_input=run_input)
-
+
     # Fetch data from the run's dataset
     data = list(client.dataset(run["defaultDatasetId"]).iterate_items())
+
+    # Ensure we don't exceed max_items
+    if len(data) > max_items:
+        data = data[:max_items]
 
     return data, run["defaultDatasetId"]
 
@@ -206,6 +231,23 @@ def extract_mentions(text):
     mentions = [word[1:] for word in words if word.startswith('@')]
     return mentions
 
+# Function to convert UTC time to Indian Standard Time (IST)
+def convert_to_ist(utc_datetime):
+    if not utc_datetime:
+        return None
+
+    # Create timezone objects
+    utc_tz = pytz.timezone('UTC')
+    ist_tz = pytz.timezone('Asia/Kolkata')
+
+    # If datetime is naive, make it timezone-aware with UTC
+    if utc_datetime.tzinfo is None:
+        utc_datetime = utc_tz.localize(utc_datetime)
+
+    # Convert to IST
+    ist_datetime = utc_datetime.astimezone(ist_tz)
+    return ist_datetime
+
 # Function to process tweet data and create dataframe
 def process_tweet_data(data):
     processed_data = []
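
Note: a quick standalone check of the new helper against a Twitter-style createdAt string (the timestamp is illustrative):

    import pytz
    from datetime import datetime

    date_str = "Wed Jan 15 12:00:00 +0000 2025"
    date_obj = datetime.strptime(date_str, "%a %b %d %H:%M:%S %z %Y")
    ist = convert_to_ist(date_obj)  # helper defined in the hunk above
    print(ist.strftime("%Y-%m-%d %H:%M"))  # 2025-01-15 17:30, since IST is UTC+05:30
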
@@ -219,11 +261,15 @@ def process_tweet_data(data):
         try:
             # Try to parse the Twitter date format
             date_obj = datetime.strptime(date_str, "%a %b %d %H:%M:%S %z %Y")
-            formatted_date = date_obj.strftime("%Y-%m-%d %H:%M:%S")
-            date_only = date_obj.strftime("%Y-%m-%d")
-            time_only = date_obj.strftime("%H:%M")
-            hour = date_obj.hour
-            day_of_week = date_obj.strftime("%A")
+
+            # Convert to IST
+            ist_date_obj = convert_to_ist(date_obj)
+
+            formatted_date = ist_date_obj.strftime("%Y-%m-%d %H:%M:%S")
+            date_only = ist_date_obj.strftime("%Y-%m-%d")
+            time_only = ist_date_obj.strftime("%H:%M")
+            hour = ist_date_obj.hour
+            day_of_week = ist_date_obj.strftime("%A")
         except:
             formatted_date = date_str
             date_only = ""
@@ -255,6 +301,9 @@ def process_tweet_data(data):
         # Calculate tweet length
         tweet_length = len(text) if text else 0
 
+        # Get bookmarks count if available
+        bookmarks = item.get("bookmarkCount", 0)
+
         processed_item = {
             "Date": formatted_date,
             "Date_Only": date_only,
@@ -269,6 +318,7 @@ def process_tweet_data(data):
             "Likes": item.get("likeCount", 0),
             "Retweets": item.get("retweetCount", 0),
             "Replies": item.get("replyCount", 0),
+            "Bookmarks": bookmarks,
             "Views": item.get("viewCount", 0),
             "URL": item.get("url", ""),
             "Is_Reply": item.get("isReply", False),
@@ -304,15 +354,16 @@ def analyze_and_display_data(data, dataset_id, analysis_type="Account"):
     total_likes = df["Likes"].sum()
     total_retweets = df["Retweets"].sum()
     total_replies = df["Replies"].sum()
+    total_bookmarks = df["Bookmarks"].sum()
     total_views = df["Views"].sum()
 
     # Engagement metrics
-    total_engagement = total_likes + total_retweets + total_replies
+    total_engagement = total_likes + total_retweets + total_replies + total_bookmarks
    avg_engagement_per_tweet = total_engagement / total_tweets if total_tweets > 0 else 0
    engagement_rate = (total_engagement / total_views * 100) if total_views > 0 else 0
 
     # Find most engaging tweet
-    df["Engagement"] = df["Likes"] + df["Retweets"] + df["Replies"]
+    df["Engagement"] = df["Likes"] + df["Retweets"] + df["Replies"] + df["Bookmarks"]
     most_engaging_tweet = df.loc[df["Engagement"].idxmax()] if not df.empty else None
 
     # Tweet type breakdown
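
Note: bookmarks now count toward engagement, which shifts both the per-tweet average and the view-based engagement rate. The new arithmetic on toy data:

    import pandas as pd

    df = pd.DataFrame({"Likes": [10, 3], "Retweets": [2, 1], "Replies": [1, 0],
                       "Bookmarks": [4, 2], "Views": [500, 100]})
    df["Engagement"] = df["Likes"] + df["Retweets"] + df["Replies"] + df["Bookmarks"]
    total_engagement = int(df["Engagement"].sum())                 # 23
    engagement_rate = total_engagement / df["Views"].sum() * 100   # ~3.83%
    most_engaging = df.loc[df["Engagement"].idxmax()]              # row 0
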
@@ -362,8 +413,8 @@ def analyze_and_display_data(data, dataset_id, analysis_type="Account"):
         st.metric("Total Retweets", f"{total_retweets:,}")
         st.metric("Total Replies", f"{total_replies:,}")
     with col3:
+        st.metric("Total Bookmarks", f"{total_bookmarks:,}")
         st.metric("Total Views", f"{total_views:,}")
-        st.metric("Total Engagement", f"{total_engagement:,}")
 
     # Engagement metrics
     st.subheader("⚡ Engagement Analysis")
@@ -426,7 +477,7 @@ def analyze_and_display_data(data, dataset_id, analysis_type="Account"):
         st.write(most_engaging_tweet['Text'])
 
         # Display metrics in a row
-        cols = st.columns(4)
+        cols = st.columns(5)
         with cols[0]:
             st.write(f"💬 {most_engaging_tweet['Replies']}")
         with cols[1]:
@@ -434,6 +485,8 @@ def analyze_and_display_data(data, dataset_id, analysis_type="Account"):
         with cols[2]:
             st.write(f"❤️ {most_engaging_tweet['Likes']}")
         with cols[3]:
+            st.write(f"🔖 {most_engaging_tweet['Bookmarks']}")
+        with cols[4]:
             st.write(f"👁️ {most_engaging_tweet['Views']}")
 
         # Link to original tweet
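
Note: the metric row widens from four to five columns to fit the bookmark count. A minimal Streamlit sketch of the layout change (values are placeholders):

    import streamlit as st

    cols = st.columns(5)  # previously st.columns(4)
    for col, label in zip(cols, ["💬 12", "🔁 4", "❤️ 30", "🔖 7", "👁️ 1200"]):
        with col:
            st.write(label)
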
@@ -453,7 +506,7 @@ def analyze_and_display_data(data, dataset_id, analysis_type="Account"):
     # Tweets by hour of day
     if not df_by_hour.empty:
         fig_by_hour = px.bar(df_by_hour, x="Hour", y="Count",
-                             title="Tweets by Hour of Day",
+                             title="Tweets by Hour of Day (Indian Time)",
                              labels={"Hour": "Hour (24h format)", "Count": "Number of Tweets"})
         st.plotly_chart(fig_by_hour, use_container_width=True)
 
@@ -503,7 +556,7 @@ def display_tweet_list(df):
         st.write(row['Text'])
 
         # Display metrics in a row
-        cols = st.columns(4)
+        cols = st.columns(5)
         with cols[0]:
             st.write(f"💬 {row['Replies']}")
         with cols[1]:
@@ -511,6 +564,8 @@ def display_tweet_list(df):
         with cols[2]:
             st.write(f"❤️ {row['Likes']}")
         with cols[3]:
+            st.write(f"🔖 {row['Bookmarks']}")
+        with cols[4]:
             st.write(f"👁️ {row['Views']}")
 
         # Indicate if tweet has media without showing it
@@ -676,12 +731,20 @@ try:
     with open("requirements.txt", "r") as f:
         requirements = f.read()
 
+    updated_requirements = False
+
     if "google-generativeai" not in requirements:
         with open("requirements.txt", "a") as f:
             f.write("\ngoogle-generativeai>=0.3.0\n")
+        updated_requirements = True
+
+    if "pytz" not in requirements:
+        with open("requirements.txt", "a") as f:
+            f.write("\npytz\n")
+        updated_requirements = True
 except:
     pass
 
 # Footer with attribution
 st.divider()
-st.caption("Powered by Apify Twitter Scraper API • Created with Streamlit • AI Summaries by Google Gemini")
+st.caption("Powered by Apify Twitter Scraper API • Created with Streamlit • AI Summaries by Google Gemini • Times in Indian Standard Time (IST)")
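
Note: the substring checks keep the appends idempotent across reruns (e.g. "pytz" won't be re-added once present, though a substring match is coarse). The same pattern, generalized to a list of packages — the function name and package list here are illustrative, not part of the commit:

    def ensure_requirements(path="requirements.txt",
                            packages=("google-generativeai>=0.3.0", "pytz")):
        try:
            with open(path, "r") as f:
                requirements = f.read()
            for pkg in packages:
                name = pkg.split(">=")[0]  # compare on the bare package name
                if name not in requirements:
                    with open(path, "a") as f:
                        f.write("\n" + pkg + "\n")
        except OSError:
            pass  # best-effort, mirroring the original's bare except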
 