lawlevisan committed on
Commit
d68e0e5
·
verified ·
1 Parent(s): b1eb703

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +299 -49
src/streamlit_app.py CHANGED
@@ -1,4 +1,4 @@
1
- #streamlit_app.py
2
  import streamlit as st
3
  import pandas as pd
4
  import os
@@ -7,57 +7,32 @@ import plotly.express as px
7
  import plotly.graph_objects as go
8
  from plotly.subplots import make_subplots
9
  from datetime import datetime, timedelta
10
- import numpy as np
11
- import time
12
- import re
13
- import seaborn as sns
14
- import matplotlib.pyplot as plt
15
-
16
- from evaluation import evaluate_model
17
 
18
- # Run evaluation on the scraped CSV folder (no logs shown)
19
- evaluate_model("drug_analysis_data_3months")
20
-
21
- st.set_page_config(
22
- page_title="Twitter Drug Crime Monitoring",
23
- layout="wide",
24
- initial_sidebar_state="expanded"
25
- )
26
-
27
-
28
- # Import NLTK with error handling
29
  import nltk
30
- from nltk.corpus import stopwords
31
 
 
32
  try:
33
  stopwords.words('english')
34
  except LookupError:
35
- nltk.download('stopwords', quiet=True)
36
- nltk.download('vader_lexicon', quiet=True)
37
- nltk.download('punkt', quiet=True)
38
 
39
  # Now you can safely use it
40
  english_stopwords = stopwords.words('english')
41
 
42
- # Import autorefresh with fallback
43
- try:
44
- from streamlit_autorefresh import st_autorefresh
45
- AUTOREFRESH_AVAILABLE = True
46
- except ImportError:
47
- AUTOREFRESH_AVAILABLE = False
48
 
49
- # Check required secrets AFTER imports
50
- required_secrets = ["MONGO_URI"]
51
- missing_secrets = [s for s in required_secrets if not os.getenv(s)]
52
 
53
- if missing_secrets:
54
- st.error(f"❌ Missing required secrets: {', '.join(missing_secrets)}")
55
- st.info("Please add these in Hugging Face Space Settings β†’ Repository secrets")
56
- st.warning("The dashboard will run in limited mode without database connectivity.")
57
- # Don't stop - allow dashboard to work with local CSV files
58
-
59
- # Limit dataframe size in memory
60
- MAX_ROWS_IN_MEMORY = 10000
61
 
62
  # Custom CSS for better styling
63
  st.markdown("""
@@ -164,7 +139,7 @@ def validate_dataframe(df):
164
 
165
  return True, "DataFrame is valid"
166
 
167
- @st.cache_data(ttl=600) # Cache for 10 minutes
168
  def load_data():
169
  """Load the most recent data with robust error handling."""
170
  start_time = time.time()
@@ -233,9 +208,7 @@ def load_data():
233
  st.sidebar.metric("Load Time", f"{load_time:.2f}s")
234
  st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
235
  st.sidebar.info(f"Source: {latest_file}")
236
-
237
- if len(df) > MAX_ROWS_IN_MEMORY:
238
- df = df.tail(MAX_ROWS_IN_MEMORY)
239
  return df, report_data
240
 
241
  except Exception as e:
@@ -442,15 +415,12 @@ auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)")
442
  from streamlit_autorefresh import st_autorefresh
443
 
444
  if auto_refresh:
445
- if AUTOREFRESH_AVAILABLE:
446
- st_autorefresh(interval=30*1000, key="refresh")
447
- else:
448
- st.sidebar.warning("⚠️ Auto-refresh not available. Install streamlit-autorefresh package.")
449
 
450
  # Navigation tabs - ENHANCED with new options
451
  analysis_type = st.sidebar.radio(
452
  "Select Analysis View",
453
- ["Summary", "Risk Analysis", "Actionable Insights",
454
  "Geographic Analysis", "User Analysis",
455
  "Content Analysis", "πŸ“Š Volume Trends", "🧠 User Behavior",
456
  "πŸ“ Heatmaps", "⚠️ Risk Patterns"]
@@ -501,6 +471,23 @@ if search_term:
501
  # Display current filter status
502
  st.sidebar.info(f"Showing {len(df)} tweets")
503
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
  # ------------------------
505
  # EXECUTIVE SUMMARY
506
  # ------------------------
@@ -972,6 +959,269 @@ elif analysis_type == "Actionable Insights":
972
  fig_users.update_layout(yaxis=dict(autorange="reversed"))
973
  st.plotly_chart(fig_users, use_container_width=True)
974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975
  # ------------------------
976
  # GEOGRAPHIC ANALYSIS (Enhanced)
977
  # ------------------------
 
1
+ #modify_app.py
2
  import streamlit as st
3
  import pandas as pd
4
  import os
 
7
  import plotly.graph_objects as go
8
  from plotly.subplots import make_subplots
9
  from datetime import datetime, timedelta
 
 
 
 
 
 
 
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  import nltk
12
+ from nltk.corpus import stopwords # βœ… import first
13
 
14
+ # Ensure stopwords data is downloaded
15
  try:
16
  stopwords.words('english')
17
  except LookupError:
18
+ nltk.download('stopwords')
 
 
19
 
20
  # Now you can safely use it
21
  english_stopwords = stopwords.words('english')
22
 
23
+ import numpy as np
24
+ import time
25
+ import seaborn as sns
26
+ import matplotlib.pyplot as plt
27
+ from alerts import compute_dynamic_risk,assign_dynamic_risk_level,trigger_alerts
28
+ from evaluation import evaluate_model
29
 
30
+ # Run evaluation on the scraped CSV folder
31
+ evaluate_model("drug_analysis_data_3months")
 
32
 
33
+ import re
34
+
35
+ st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide")
 
 
 
 
 
36
 
37
  # Custom CSS for better styling
38
  st.markdown("""
 
139
 
140
  return True, "DataFrame is valid"
141
 
142
+ @st.cache_data
143
  def load_data():
144
  """Load the most recent data with robust error handling."""
145
  start_time = time.time()
 
208
  st.sidebar.metric("Load Time", f"{load_time:.2f}s")
209
  st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
210
  st.sidebar.info(f"Source: {latest_file}")
211
+
 
 
212
  return df, report_data
213
 
214
  except Exception as e:
 
415
  from streamlit_autorefresh import st_autorefresh
416
 
417
  if auto_refresh:
418
+ st_autorefresh(interval=30*1000, key="refresh")
 
 
 
419
 
420
  # Navigation tabs - ENHANCED with new options
421
  analysis_type = st.sidebar.radio(
422
  "Select Analysis View",
423
+ ["Summary", "Risk Analysis", "Actionable Insights", "πŸ“ˆ Predictive Analytics", "🌐 Network Analysis",
424
  "Geographic Analysis", "User Analysis",
425
  "Content Analysis", "πŸ“Š Volume Trends", "🧠 User Behavior",
426
  "πŸ“ Heatmaps", "⚠️ Risk Patterns"]
 
471
  # Display current filter status
472
  st.sidebar.info(f"Showing {len(df)} tweets")
473
 
474
+ # ---------------- EMAIL ALERTS SECTION ---------------
475
+
476
+ st.sidebar.header("πŸ“© Email Alerts")
477
+ num_tweets = st.sidebar.number_input(
478
+ "Number of high-risk tweets to send",
479
+ min_value=1,
480
+ max_value=50,
481
+ value=5,
482
+ step=1
483
+ )
484
+ send_button = st.sidebar.button("Send Alerts")
485
+
486
+ if send_button:
487
+ st.info(f"Sending top {num_tweets} high-risk tweets via email...")
488
+ trigger_alerts(max_tweets=num_tweets)
489
+ st.success(f"βœ… Alerts sent for top {num_tweets} tweets!")
490
+
491
  # ------------------------
492
  # EXECUTIVE SUMMARY
493
  # ------------------------
 
959
  fig_users.update_layout(yaxis=dict(autorange="reversed"))
960
  st.plotly_chart(fig_users, use_container_width=True)
961
 
962
+ # ------------------------
963
+ # NEW: PREDICTIVE ANALYTICS
964
+ # ------------------------
965
+ elif analysis_type == "πŸ“ˆ Predictive Analytics":
966
+ st.header("πŸ“ˆ Predictive Analytics & Trends")
967
+
968
+ st.subheader("πŸ“Š Activity Forecast")
969
+
970
+ if "datetime" in df.columns and len(df) >= 7:
971
+ # Daily activity trend
972
+ daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count")
973
+ daily_activity.columns = ["date", "count"]
974
+ daily_activity["date"] = pd.to_datetime(daily_activity["date"])
975
+
976
+ # Calculate moving average
977
+ daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
978
+ daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
979
+
980
+ # Create forecast visualization
981
+ fig_forecast = go.Figure()
982
+
983
+ fig_forecast.add_trace(go.Scatter(
984
+ x=daily_activity["date"],
985
+ y=daily_activity["count"],
986
+ name="Actual Activity",
987
+ mode="lines+markers",
988
+ line=dict(color="#1f77b4")
989
+ ))
990
+
991
+ fig_forecast.add_trace(go.Scatter(
992
+ x=daily_activity["date"],
993
+ y=daily_activity["7_day_ma"],
994
+ name="7-Day Moving Average",
995
+ mode="lines",
996
+ line=dict(color="#ff7f0e", dash="dash")
997
+ ))
998
+
999
+ fig_forecast.update_layout(
1000
+ title="Tweet Activity Trend & Forecast",
1001
+ xaxis_title="Date",
1002
+ yaxis_title="Number of Tweets",
1003
+ hovermode="x unified"
1004
+ )
1005
+
1006
+ st.plotly_chart(fig_forecast, use_container_width=True)
1007
+
1008
+ # Trend analysis
1009
+ col1, col2, col3 = st.columns(3)
1010
+
1011
+ with col1:
1012
+ recent_avg = daily_activity["count"].tail(7).mean()
1013
+ st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day")
1014
+
1015
+ with col2:
1016
+ if len(daily_activity) >= 14:
1017
+ prev_avg = daily_activity["count"].tail(14).head(7).mean()
1018
+ change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0
1019
+ st.metric("Week-over-Week Change", f"{change:+.1f}%")
1020
+
1021
+ with col3:
1022
+ peak_day = daily_activity.loc[daily_activity["count"].idxmax()]
1023
+ st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d"))
1024
+
1025
+
1026
+ # User activity prediction
1027
+ st.subheader("πŸ‘€ High-Risk User Patterns")
1028
+
1029
+ if "username" in df.columns and "risk_level" in df.columns:
1030
+ user_risk_scores = df.groupby("username").agg({
1031
+ "tweet_id": "count",
1032
+ "risk_level": lambda x: (x == "CRITICAL").sum() * 2 + (x == "HIGH").sum()
1033
+ }).reset_index()
1034
+ user_risk_scores.columns = ["username", "tweet_count", "risk_score"]
1035
+
1036
+ # Identify escalating users
1037
+ escalating_users = user_risk_scores[
1038
+ (user_risk_scores["risk_score"] > 0) &
1039
+ (user_risk_scores["tweet_count"] >= 3)
1040
+ ].sort_values("risk_score", ascending=False).head(15)
1041
+
1042
+ if not escalating_users.empty:
1043
+ fig_escalating = px.scatter(
1044
+ escalating_users,
1045
+ x="tweet_count",
1046
+ y="risk_score",
1047
+ size="risk_score",
1048
+ hover_data=["username"],
1049
+ title="High-Risk User Activity Matrix",
1050
+ labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"}
1051
+ )
1052
+
1053
+ st.plotly_chart(fig_escalating, use_container_width=True)
1054
+
1055
+ st.write("**Users to Monitor:**")
1056
+ for _, user in escalating_users.head(10).iterrows():
1057
+ st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}")
1058
+
1059
+
1060
+
1061
+ # ------------------------
1062
+ # NEW: NETWORK ANALYSIS
1063
+ # ------------------------
1064
+ elif analysis_type == "🌐 Network Analysis":
1065
+ st.header("🌐 Network Analysis")
1066
+
1067
+ st.subheader("πŸ‘₯ User Connection Analysis")
1068
+
1069
+ # Mentions network
1070
+ if "mentions" in df.columns:
1071
+ st.write("### User Mention Network")
1072
+
1073
+ mention_pairs = []
1074
+ for _, row in df.iterrows():
1075
+ if pd.notna(row.get("mentions")) and row["mentions"]:
1076
+ mentions = str(row["mentions"]).split()
1077
+ for mention in mentions:
1078
+ mention_clean = mention.strip("@")
1079
+ if mention_clean:
1080
+ mention_pairs.append({
1081
+ "from": row["username"],
1082
+ "to": mention_clean,
1083
+ "risk_level": row.get("risk_level", "UNKNOWN")
1084
+ })
1085
+
1086
+ if mention_pairs:
1087
+ mention_df = pd.DataFrame(mention_pairs)
1088
+
1089
+ # Top mentioned users
1090
+ top_mentioned = mention_df["to"].value_counts().head(15)
1091
+
1092
+ fig_mentioned = px.bar(
1093
+ x=top_mentioned.values,
1094
+ y=top_mentioned.index,
1095
+ orientation="h",
1096
+ title="Most Mentioned Users",
1097
+ labels={"x": "Times Mentioned", "y": "Username"}
1098
+ )
1099
+ fig_mentioned.update_layout(yaxis=dict(autorange="reversed"))
1100
+ st.plotly_chart(fig_mentioned, use_container_width=True)
1101
+
1102
+ # Connection strength
1103
+ connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions")
1104
+ strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False)
1105
+
1106
+ if not strong_connections.empty:
1107
+ st.write("### πŸ”— Strong Connections (2+ mentions)")
1108
+
1109
+ for _, conn in strong_connections.head(20).iterrows():
1110
+ st.write(f"- @{conn['from']} β†’ @{conn['to']}: {conn['mentions']} times")
1111
+ else:
1112
+ st.info("No mention data available")
1113
+
1114
+ # Location clustering
1115
+ st.subheader("πŸ“ Location-Based Clustering")
1116
+
1117
+ if "user_location" in df.columns:
1118
+ location_users = df.groupby("user_location").agg({
1119
+ "username": lambda x: list(x.unique()),
1120
+ "tweet_id": "count",
1121
+ "risk_level": lambda x: (x == "CRITICAL").sum() if "risk_level" in df.columns else 0
1122
+ }).reset_index()
1123
+
1124
+ location_users.columns = ["location", "users", "tweet_count", "critical_count"]
1125
+ location_users = location_users[location_users["location"] != ""]
1126
+ location_users = location_users[location_users["tweet_count"] >= 3]
1127
+ location_users["user_count"] = location_users["users"].apply(len)
1128
+
1129
+ if not location_users.empty:
1130
+ fig_clusters = px.scatter(
1131
+ location_users,
1132
+ x="tweet_count",
1133
+ y="user_count",
1134
+ size="critical_count",
1135
+ hover_data=["location"],
1136
+ title="Location Clusters (Activity vs Users)",
1137
+ labels={
1138
+ "tweet_count": "Total Tweets",
1139
+ "user_count": "Unique Users",
1140
+ "critical_count": "Critical Tweets"
1141
+ }
1142
+ )
1143
+
1144
+ st.plotly_chart(fig_clusters, use_container_width=True)
1145
+
1146
+ # High-density locations
1147
+ high_density = location_users.sort_values("user_count", ascending=False).head(10)
1148
+
1149
+ st.write("### πŸ™οΈ High-Density Locations")
1150
+ for _, loc in high_density.iterrows():
1151
+ with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"):
1152
+ st.write(f"**Critical tweets:** {loc['critical_count']}")
1153
+ st.write(f"**Users:** {', '.join(['@' + u for u in loc['users'][:10]])}")
1154
+ if len(loc['users']) > 10:
1155
+ st.write(f"... and {len(loc['users']) - 10} more")
1156
+
1157
+ # Co-occurrence analysis
1158
+ st.subheader("πŸ”— Keyword Co-occurrence")
1159
+
1160
+ if "content" in df.columns:
1161
+ # Define drug/crime keywords
1162
+ drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"]
1163
+ crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"]
1164
+
1165
+ cooccurrence = []
1166
+
1167
+ for _, row in df.iterrows():
1168
+ content_lower = row["content"].lower()
1169
+ found_drug = [kw for kw in drug_keywords if kw in content_lower]
1170
+ found_crime = [kw for kw in crime_keywords if kw in content_lower]
1171
+
1172
+ for drug in found_drug:
1173
+ for crime in found_crime:
1174
+ cooccurrence.append({"drug_keyword": drug, "crime_keyword": crime})
1175
+
1176
+ if cooccurrence:
1177
+ cooc_df = pd.DataFrame(cooccurrence)
1178
+ cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count")
1179
+ cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20)
1180
+
1181
+ if not cooc_counts.empty:
1182
+ fig_cooc = px.bar(
1183
+ cooc_counts,
1184
+ x="count",
1185
+ y="drug_keyword",
1186
+ color="crime_keyword",
1187
+ title="Drug-Crime Keyword Co-occurrence",
1188
+ orientation="h"
1189
+ )
1190
+ st.plotly_chart(fig_cooc, use_container_width=True)
1191
+ else:
1192
+ st.info("No significant keyword co-occurrences found")
1193
+
1194
+ # Temporal clustering
1195
+ st.subheader("⏰ Temporal Activity Clusters")
1196
+
1197
+ if "datetime" in df.columns and "username" in df.columns:
1198
+ df_copy = df.copy()
1199
+ df_copy["hour"] = df_copy["datetime"].dt.hour
1200
+ df_copy["day_of_week"] = df_copy["datetime"].dt.day_name()
1201
+
1202
+ # Find users active at unusual hours (late night/early morning)
1203
+ unusual_hours = [0, 1, 2, 3, 4, 5]
1204
+ night_activity = df_copy[df_copy["hour"].isin(unusual_hours)]
1205
+
1206
+ if len(night_activity) > 0:
1207
+ night_users = night_activity.groupby("username").size().reset_index(name="night_tweets")
1208
+ night_users = night_users[night_users["night_tweets"] >= 3].sort_values("night_tweets", ascending=False)
1209
+
1210
+ if not night_users.empty:
1211
+ st.write(f"### πŸŒ™ Users Active During Late Night (12 AM - 6 AM)")
1212
+
1213
+ fig_night = px.bar(
1214
+ night_users.head(15),
1215
+ x="night_tweets",
1216
+ y="username",
1217
+ orientation="h",
1218
+ title="Top Users with Late Night Activity"
1219
+ )
1220
+ fig_night.update_layout(yaxis=dict(autorange="reversed"))
1221
+ st.plotly_chart(fig_night, use_container_width=True)
1222
+
1223
+ st.info("⚠️ Late night activity may indicate suspicious behavior patterns")
1224
+
1225
  # ------------------------
1226
  # GEOGRAPHIC ANALYSIS (Enhanced)
1227
  # ------------------------