Eliminate duplicate payload.get() calls
Browse files — scripts/eda.py (+19 −14)
scripts/eda.py
CHANGED
|
@@ -131,20 +131,21 @@ def compute_stats(client, sample_size: int | None = None) -> dict:
|
|
| 131 |
|
| 132 |
for payload in scroll_all_payloads(client, limit=sample_size):
|
| 133 |
review_id = payload.get("review_id")
|
|
|
|
|
|
|
|
|
|
| 134 |
if review_id and review_id not in review_ratings:
|
| 135 |
-
review_ratings[review_id] =
|
| 136 |
|
| 137 |
-
ratings.append(
|
| 138 |
text_lengths.append(len(payload.get("text", "")))
|
| 139 |
timestamps.append(payload.get("timestamp", 0))
|
| 140 |
product_ids.add(payload.get("product_id"))
|
| 141 |
-
review_ids.add(
|
| 142 |
|
| 143 |
# Track chunks per review
|
| 144 |
-
review_id = payload.get("review_id")
|
| 145 |
-
total_chunks = payload.get("total_chunks", 1)
|
| 146 |
if review_id:
|
| 147 |
-
chunks_per_review[review_id] = total_chunks
|
| 148 |
|
| 149 |
print(f" Scanned {len(ratings):,} total chunks")
|
| 150 |
|
|
@@ -280,6 +281,17 @@ def generate_figures(stats: dict) -> None:
|
|
| 280 |
print(f" Saved: {FIGURES_DIR / 'temporal_distribution.png'}")
|
| 281 |
|
| 282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
def save_eda_stats(stats: dict, collection_info: dict) -> Path:
|
| 284 |
"""Save EDA statistics to JSON using existing save_results pattern.
|
| 285 |
|
|
@@ -304,14 +316,7 @@ def save_eda_stats(stats: dict, collection_info: dict) -> Path:
|
|
| 304 |
if unique_reviews
|
| 305 |
else 0.0,
|
| 306 |
},
|
| 307 |
-
"temporal":
|
| 308 |
-
"start_date": datetime.fromtimestamp(
|
| 309 |
-
min(t for t in stats["timestamps"] if t > 0) / 1000
|
| 310 |
-
).strftime("%Y-%m-%d"),
|
| 311 |
-
"end_date": datetime.fromtimestamp(
|
| 312 |
-
max(t for t in stats["timestamps"] if t > 0) / 1000
|
| 313 |
-
).strftime("%Y-%m-%d"),
|
| 314 |
-
},
|
| 315 |
"rating_distribution": stats["rating_dist"],
|
| 316 |
"chunk_length": {
|
| 317 |
"median_chars": int(np.median(stats["text_lengths"])),
|
|
|
|
| 131 |
|
| 132 |
for payload in scroll_all_payloads(client, limit=sample_size):
|
| 133 |
review_id = payload.get("review_id")
|
| 134 |
+
rating = payload.get("rating")
|
| 135 |
+
|
| 136 |
+
# Track one rating per review (for review-level distribution)
|
| 137 |
if review_id and review_id not in review_ratings:
|
| 138 |
+
review_ratings[review_id] = rating
|
| 139 |
|
| 140 |
+
ratings.append(rating)
|
| 141 |
text_lengths.append(len(payload.get("text", "")))
|
| 142 |
timestamps.append(payload.get("timestamp", 0))
|
| 143 |
product_ids.add(payload.get("product_id"))
|
| 144 |
+
review_ids.add(review_id)
|
| 145 |
|
| 146 |
# Track chunks per review
|
|
|
|
|
|
|
| 147 |
if review_id:
|
| 148 |
+
chunks_per_review[review_id] = payload.get("total_chunks", 1)
|
| 149 |
|
| 150 |
print(f" Scanned {len(ratings):,} total chunks")
|
| 151 |
|
|
|
|
| 281 |
print(f" Saved: {FIGURES_DIR / 'temporal_distribution.png'}")
|
| 282 |
|
| 283 |
|
| 284 |
+
def _compute_temporal_range(timestamps: list) -> dict:
|
| 285 |
+
"""Extract start/end dates from millisecond timestamps."""
|
| 286 |
+
valid = [t for t in timestamps if t and t > 0]
|
| 287 |
+
if not valid:
|
| 288 |
+
return {"start_date": None, "end_date": None}
|
| 289 |
+
return {
|
| 290 |
+
"start_date": datetime.fromtimestamp(min(valid) / 1000).strftime("%Y-%m-%d"),
|
| 291 |
+
"end_date": datetime.fromtimestamp(max(valid) / 1000).strftime("%Y-%m-%d"),
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
|
| 295 |
def save_eda_stats(stats: dict, collection_info: dict) -> Path:
|
| 296 |
"""Save EDA statistics to JSON using existing save_results pattern.
|
| 297 |
|
|
|
|
| 316 |
if unique_reviews
|
| 317 |
else 0.0,
|
| 318 |
},
|
| 319 |
+
"temporal": _compute_temporal_range(stats["timestamps"]),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
"rating_distribution": stats["rating_dist"],
|
| 321 |
"chunk_length": {
|
| 322 |
"median_chars": int(np.median(stats["text_lengths"])),
|