Spaces:

lawlevisan
/

Twitter-Analysis

Sleeping

App Files Files Community

lawlevisan committed on Oct 22, 2025

Commit

b8b04d0

verified ·

1 Parent(s): 644e2d5

Update src/evaluation.py

Browse files

Files changed (1) hide show

src/evaluation.py +153 -29

src/evaluation.py CHANGED Viewed

@@ -1,29 +1,153 @@
-# evaluation.py
-from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, brier_score_loss
-from alerts import compute_dynamic_risk
-def evaluate_model(test_tweets, threshold=0.75):
-    """
-    Print classification and calibration metrics for CRITICAL-risk detection.
-
-    test_tweets: list of dicts with fields
-        - true_risk_level: "CRITICAL"/"HIGH"/...
-        - dynamic_risk_score: 0-100
-    threshold: minimum probability at which a tweet is predicted CRITICAL
-        (defaults to the previously hard-coded 0.75).
-
-    Each dynamic_risk_score is passed to compute_dynamic_risk(); the result is
-    used as the CRITICAL probability (assumes the helper returns a value in
-    [0, 1] -- TODO confirm against alerts.py).
-
-    Returns None; all results are printed to stdout.
-    """
-    # Nothing can be scored on an empty sample; sklearn would raise on it.
-    if not test_tweets:
-        print("No test tweets supplied; nothing to evaluate.")
-        return
-    # Binary ground truth: 1 for CRITICAL, 0 for every other risk level
-    y_true = [1 if t['true_risk_level'] == "CRITICAL" else 0 for t in test_tweets]
-    y_prob = [compute_dynamic_risk(t["dynamic_risk_score"]) for t in test_tweets]
-    y_pred = [1 if prob >= threshold else 0 for prob in y_prob]
-    print("=== Classification Report ===")
-    # labels is pinned so both classes are reported even when one is absent;
-    # otherwise the two target_names no longer match the classes found.
-    print(classification_report(y_true, y_pred, labels=[0, 1], target_names=["Non-Critical","Critical"]))
-    print("=== Confusion Matrix ===")
-    print(confusion_matrix(y_true, y_pred, labels=[0, 1]))
-    # ROC-AUC is not defined when the ground truth contains a single class
-    if len(set(y_true)) < 2:
-        print("ROC-AUC: undefined (only one class present in y_true)")
-    else:
-        print("ROC-AUC:", roc_auc_score(y_true, y_prob))
-    print("Brier Score:", brier_score_loss(y_true, y_prob, pos_label=1))

+# evaluation.py
+import os
+import pandas as pd
+import re
+from collections import Counter
+from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
+def evaluate_model(scraper_folder="drug_analysis_data_3months"):
+    """
+    Print a data-quality report for the scraped Twitter drug/crime CSV dataset.
+
+    Every ``*.csv`` file directly inside ``scraper_folder`` is read with pandas
+    and concatenated. Each section below is printed only when the column it
+    needs is present:
+    - General stats, missing data, duplicates
+    - Distributions of is_drug_related / is_crime_related / risk_level
+    - Numeric risk_level summary (mean, max, count >= 0.7)
+    - Date range and daily post counts ("datetime" column)
+    - Text length stats and the 10 most common words ("text" column)
+    - Unique users and top 10 posters ("username" column)
+    - Scraper metrics: completeness, duplicate rate, relevance, time coverage
+    - Classification metrics using is_crime_related as pseudo ground truth
+      and is_drug_related as the prediction
+
+    Args:
+        scraper_folder: Directory that contains the scraper's CSV exports.
+
+    Returns:
+        None. The whole report is written to stdout.
+    """
+    # -----------------------------
+    # Load CSVs
+    # -----------------------------
+    if not os.path.isdir(scraper_folder):
+        print(f"❌ Scraper folder not found: {scraper_folder}")
+        return
+    # Sorted so the concatenation order (and the printed samples) is reproducible
+    csv_files = sorted(f for f in os.listdir(scraper_folder) if f.endswith(".csv"))
+    if not csv_files:
+        print("❌ No CSV files found in scraper folder!")
+        return
+    dfs = []
+    for name in csv_files:
+        try:
+            dfs.append(pd.read_csv(os.path.join(scraper_folder, name)))
+        except pd.errors.EmptyDataError:
+            # A zero-byte export has no header to parse; report it and move on
+            print(f"⚠️ Skipping empty CSV file: {name}")
+    if not dfs:
+        print("❌ No readable CSV files found in scraper folder!")
+        return
+    df = pd.concat(dfs, ignore_index=True)
+    print(f"✅ Loaded {len(df)} rows from {len(dfs)} CSV files.\n")
+    if df.empty:
+        print("❌ CSV files contain no rows; nothing to evaluate.")
+        return
+    # Quality metrics must describe the data exactly as scraped, so they are
+    # taken here, before any parsing or type conversion below.
+    missing_mask = df.isna()
+    duplicate_mask = df.duplicated()
+    completeness = 1 - missing_mask.mean().mean()
+    duplicate_rate = duplicate_mask.mean()
+    # -----------------------------
+    # General Stats
+    # -----------------------------
+    print("=== General Stats ===")
+    print("Columns:", df.columns.tolist())
+    print("Total rows:", len(df))
+    print("Missing values per column:\n", missing_mask.sum())
+    print("\nDuplicate rows:", duplicate_mask.sum())
+    # Sample rows with missing data
+    missing_rows = df[missing_mask.any(axis=1)]
+    if not missing_rows.empty:
+        print("\nSample rows with missing values:\n", missing_rows.head())
+    # Sample duplicate rows (keep=False shows every member of a duplicate group)
+    duplicates = df[df.duplicated(keep=False)]
+    if not duplicates.empty:
+        print("\nSample duplicate rows:\n", duplicates.head())
+    # -----------------------------
+    # Drug/Crime-related stats
+    # -----------------------------
+    for col in ["is_drug_related", "is_crime_related", "risk_level"]:
+        if col in df.columns:
+            print(f"\n=== {col} Distribution ===")
+            print(df[col].value_counts())
+            print("Proportion:\n", df[col].value_counts(normalize=True).round(4))
+    # Risk level numeric analysis
+    if "risk_level" in df.columns and pd.api.types.is_numeric_dtype(df["risk_level"]):
+        print("\n=== Risk Level Stats ===")
+        print("Average risk:", round(df["risk_level"].mean(), 2))
+        print("Max risk:", df["risk_level"].max())
+        high_risk_count = (df["risk_level"] >= 0.7).sum()  # Threshold
+        print("Number of high-risk items (risk >= 0.7):", high_risk_count)
+    # -----------------------------
+    # Time coverage
+    # -----------------------------
+    post_dates = None
+    if "datetime" in df.columns:
+        # Unparseable values become NaT instead of aborting the report
+        timestamps = pd.to_datetime(df["datetime"], errors="coerce")
+        print("\n=== Date Range ===")
+        print("Earliest:", timestamps.min())
+        print("Latest:", timestamps.max())
+        # Daily counts
+        post_dates = timestamps.dt.date.rename("date")
+        daily_counts = post_dates.groupby(post_dates).size()
+        print("\n=== Daily Counts of Posts ===")
+        print(daily_counts)
+    # -----------------------------
+    # Text Analysis
+    # -----------------------------
+    text_lengths = None
+    if "text" in df.columns:
+        # Missing tweets are dropped rather than cast to the literal string
+        # "nan", which would skew the length stats and the word counts.
+        texts = df["text"].dropna().astype(str)
+        text_lengths = texts.str.len()
+        print("\n=== Text Length Stats ===")
+        print("Average length:", round(text_lengths.mean(), 2))
+        print("Min length:", text_lengths.min())
+        print("Max length:", text_lengths.max())
+        # Top 10 most common words
+        word_pattern = re.compile(r"\w+")
+        words = Counter()
+        for t in texts:
+            words.update(word_pattern.findall(t.lower()))
+        print("\nTop 10 common words:", words.most_common(10))
+    # -----------------------------
+    # User / Source Analysis
+    # -----------------------------
+    if "username" in df.columns:
+        print("\n=== User Analysis ===")
+        print("Total unique users:", df["username"].nunique())
+        top_users = df["username"].value_counts().head(10)
+        print("Top 10 users by post count:\n", top_users)
+    # -----------------------------
+    # Scraper Evaluation Metrics
+    # -----------------------------
+    print("\n=== Scraper Evaluation Metrics ===")
+    # 1. Completeness (% of filled cells)
+    print(f"Completeness (all columns filled): {round(completeness*100, 2)}%")
+    # 2. Duplicate rate (% of duplicate rows)
+    print(f"Duplicate rows rate: {round(duplicate_rate*100, 2)}%")
+    # 3. Drug/Crime relevance (if available)
+    for col in ["is_drug_related", "is_crime_related"]:
+        if col in df.columns:
+            relevance = df[col].sum() / len(df)
+            print(f"{col} relevance rate: {round(relevance*100,2)}%")
+    # 4. Time coverage (active days vs total days)
+    if post_dates is not None:
+        valid_dates = post_dates.dropna()
+        if valid_dates.empty:
+            print("Time coverage ratio (active days / total days): n/a (no parseable timestamps)")
+        else:
+            # Span is measured between calendar dates: Timedelta.days on raw
+            # timestamps floors, so two posts a few hours apart across
+            # midnight would count as 1 total day but 2 active days.
+            total_days = (valid_dates.max() - valid_dates.min()).days + 1
+            active_days = valid_dates.nunique()
+            coverage_ratio = active_days / total_days
+            print(f"Time coverage ratio (active days / total days): {round(coverage_ratio*100,2)}%")
+    # 5. Average text length (proxy for content richness)
+    if text_lengths is not None:
+        print(f"Average text length: {round(text_lengths.mean(),2)} characters")
+    # 6. Classification Metrics (using scraper labels as pseudo-ground truth)
+    if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
+        # Rows missing either label cannot be scored; infer_objects restores a
+        # proper bool/int dtype once the NaNs that forced object dtype are gone.
+        labels = df[["is_crime_related", "is_drug_related"]].dropna().infer_objects()
+        if labels.empty:
+            print("\n⚠️ Skipping classification metrics: No rows with both labels present.")
+        else:
+            y_true = labels["is_crime_related"]
+            y_pred = labels["is_drug_related"]
+            print("\n=== Classification Metrics (is_drug_related vs is_crime_related) ===")
+            print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
+            print("Precision:", round(precision_score(y_true, y_pred), 4))
+            print("Recall:", round(recall_score(y_true, y_pred), 4))
+            print("F1-score:", round(f1_score(y_true, y_pred), 4))
+            print("\nClassification Report:\n", classification_report(y_true, y_pred))
+    else:
+        print("\n⚠️ Skipping classification metrics: Not enough columns for evaluation.")
+    print("\n✅ Data evaluation + metrics complete!")