lawlevisan committed on
Commit
b8b04d0
·
verified ·
1 Parent(s): 644e2d5

Update src/evaluation.py

Browse files
Files changed (1) hide show
  1. src/evaluation.py +153 -29
src/evaluation.py CHANGED
@@ -1,29 +1,153 @@
1
- # evaluation.py
2
- from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, brier_score_loss
3
- from alerts import compute_dynamic_risk
4
-
5
- def evaluate_model(test_tweets):
6
- """
7
- test_tweets: list of dicts with fields
8
- - true_risk_level: "CRITICAL"/"HIGH"/...
9
- - dynamic_risk_score: 0-100
10
- """
11
- # Compute predicted risk level from dynamic score
12
- y_true = [1 if t['true_risk_level'] == "CRITICAL" else 0 for t in test_tweets]
13
- y_prob = []
14
- y_pred = []
15
-
16
- for t in test_tweets:
17
- score = t["dynamic_risk_score"]
18
- prob = compute_dynamic_risk(score)
19
- y_prob.append(prob)
20
- y_pred.append(1 if prob >= 0.75 else 0) # threshold for CRITICAL
21
-
22
- print("=== Classification Report ===")
23
- print(classification_report(y_true, y_pred, target_names=["Non-Critical","Critical"]))
24
-
25
- print("=== Confusion Matrix ===")
26
- print(confusion_matrix(y_true, y_pred))
27
-
28
- print("ROC-AUC:", roc_auc_score(y_true, y_prob))
29
- print("Brier Score:", brier_score_loss(y_true, y_prob))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation.py
2
+
3
+ import os
4
+ import pandas as pd
5
+ import re
6
+ from collections import Counter
7
+ from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
8
+
9
def evaluate_model(scraper_folder="drug_analysis_data_3months"):
    """
    Full evaluation for the Twitter drug/crime scraping dataset.

    Loads every ``*.csv`` file found directly in ``scraper_folder`` into a
    single DataFrame and prints, in order:
      - general stats: columns, missing values, duplicate rows (with samples)
      - drug/crime label distributions and numeric risk-level stats
      - time coverage: date range and daily post counts
      - text analysis: length stats and the 10 most common words
      - user/source analysis: unique users and top posters
      - scraper quality metrics: completeness, duplicate rate, relevance
        rates, time-coverage ratio, average text length
      - classification metrics when both ``is_drug_related`` and
        ``is_crime_related`` columns are present

    Returns None; all output goes to stdout. Returns early (with a message)
    when no CSV files are found or when they contain no rows.
    """
    # -----------------------------
    # Load CSVs (sorted so concatenation order is deterministic across
    # filesystems — os.listdir order is not guaranteed)
    # -----------------------------
    csv_files = sorted(f for f in os.listdir(scraper_folder) if f.endswith(".csv"))
    if not csv_files:
        print("❌ No CSV files found in scraper folder!")
        return

    dfs = [pd.read_csv(os.path.join(scraper_folder, f)) for f in csv_files]
    df = pd.concat(dfs, ignore_index=True)
    print(f"✅ Loaded {len(df)} rows from {len(csv_files)} CSV files.\n")

    # Guard: header-only CSVs yield an empty frame, which would make the
    # ratio metrics below divide by zero (or print NaN).
    if df.empty:
        print("❌ CSV files contained no rows — nothing to evaluate.")
        return

    # -----------------------------
    # General Stats
    # -----------------------------
    print("=== General Stats ===")
    print("Columns:", df.columns.tolist())
    print("Total rows:", len(df))
    print("Missing values per column:\n", df.isna().sum())
    print("\nDuplicate rows:", df.duplicated().sum())

    # Sample rows with missing data
    missing_rows = df[df.isna().any(axis=1)]
    if not missing_rows.empty:
        print("\nSample rows with missing values:\n", missing_rows.head())

    # Sample duplicate rows (keep=False marks every member of a duplicate group)
    duplicates = df[df.duplicated(keep=False)]
    if not duplicates.empty:
        print("\nSample duplicate rows:\n", duplicates.head())

    # -----------------------------
    # Drug/Crime-related stats
    # -----------------------------
    for col in ["is_drug_related", "is_crime_related", "risk_level"]:
        if col in df.columns:
            print(f"\n=== {col} Distribution ===")
            print(df[col].value_counts())
            print("Proportion:\n", round(df[col].value_counts(normalize=True), 4))

    # Risk level numeric analysis (only meaningful for a numeric column)
    if "risk_level" in df.columns and pd.api.types.is_numeric_dtype(df["risk_level"]):
        print("\n=== Risk Level Stats ===")
        print("Average risk:", round(df["risk_level"].mean(), 2))
        print("Max risk:", df["risk_level"].max())
        high_risk_count = (df["risk_level"] >= 0.7).sum()  # Threshold
        print("Number of high-risk items (risk >= 0.7):", high_risk_count)

    # -----------------------------
    # Time coverage
    # -----------------------------
    if "datetime" in df.columns:
        # errors="coerce" turns unparseable values into NaT instead of raising
        df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
        print("\n=== Date Range ===")
        print("Earliest:", df["datetime"].min())
        print("Latest:", df["datetime"].max())

        # Daily counts
        df["date"] = df["datetime"].dt.date
        daily_counts = df.groupby("date").size()
        print("\n=== Daily Counts of Posts ===")
        print(daily_counts)

    # -----------------------------
    # Text Analysis
    # -----------------------------
    if "text" in df.columns:
        df["text"] = df["text"].astype(str)
        df["text_length"] = df["text"].apply(len)
        print("\n=== Text Length Stats ===")
        print("Average length:", round(df["text_length"].mean(), 2))
        print("Min length:", df["text_length"].min())
        print("Max length:", df["text_length"].max())

        # Top 10 most common words (case-insensitive, \w+ tokenisation)
        words = Counter()
        for t in df["text"]:
            words.update(re.findall(r"\w+", t.lower()))
        print("\nTop 10 common words:", words.most_common(10))

    # -----------------------------
    # User / Source Analysis
    # -----------------------------
    if "username" in df.columns:
        print("\n=== User Analysis ===")
        print("Total unique users:", df["username"].nunique())
        top_users = df["username"].value_counts().head(10)
        print("Top 10 users by post count:\n", top_users)

    # -----------------------------
    # Scraper Evaluation Metrics
    # -----------------------------
    print("\n=== Scraper Evaluation Metrics ===")

    # 1. Completeness (% of filled cells across all columns)
    completeness = 1 - df.isna().mean().mean()
    print(f"Completeness (all columns filled): {round(completeness*100, 2)}%")

    # 2. Duplicate rate (% of duplicate rows)
    duplicate_rate = df.duplicated().mean()
    print(f"Duplicate rows rate: {round(duplicate_rate*100, 2)}%")

    # 3. Drug/Crime relevance (if available); mean == share of truthy labels
    for col in ["is_drug_related", "is_crime_related"]:
        if col in df.columns:
            relevance = df[col].mean()
            print(f"{col} relevance rate: {round(relevance*100,2)}%")

    # 4. Time coverage (active days vs total days).  pd.notna guards the
    # all-NaT case, where (NaT - NaT).days would raise.
    if "datetime" in df.columns and pd.notna(df["datetime"].min()):
        total_days = (df["datetime"].max() - df["datetime"].min()).days + 1
        active_days = df["date"].nunique()
        coverage_ratio = active_days / total_days
        print(f"Time coverage ratio (active days / total days): {round(coverage_ratio*100,2)}%")

    # 5. Average text length (proxy for content richness)
    if "text" in df.columns:
        print(f"Average text length: {round(df['text_length'].mean(),2)} characters")

    # 6. Classification Metrics (using scraper labels as pseudo-ground truth;
    # NOTE(review): comparing one label against the other assumes the two
    # flags should agree — confirm that is the intended evaluation)
    if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
        y_true = df["is_crime_related"]
        y_pred = df["is_drug_related"]
        print("\n=== Classification Metrics (is_drug_related vs is_crime_related) ===")
        print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("Precision:", round(precision_score(y_true, y_pred), 4))
        print("Recall:", round(recall_score(y_true, y_pred), 4))
        print("F1-score:", round(f1_score(y_true, y_pred), 4))
        print("\nClassification Report:\n", classification_report(y_true, y_pred))
    else:
        print("\n⚠️ Skipping classification metrics: Not enough columns for evaluation.")

    print("\n✅ Data evaluation + metrics complete!")