vxa8502 committed on
Commit
98a32b3
·
1 Parent(s): eeabd95

Eliminate duplicate payload.get() calls

Browse files
Files changed (1) hide show
  1. scripts/eda.py +19 -14
scripts/eda.py CHANGED
@@ -131,20 +131,21 @@ def compute_stats(client, sample_size: int | None = None) -> dict:
131
 
132
  for payload in scroll_all_payloads(client, limit=sample_size):
133
  review_id = payload.get("review_id")
 
 
 
134
  if review_id and review_id not in review_ratings:
135
- review_ratings[review_id] = payload.get("rating")
136
 
137
- ratings.append(payload.get("rating"))
138
  text_lengths.append(len(payload.get("text", "")))
139
  timestamps.append(payload.get("timestamp", 0))
140
  product_ids.add(payload.get("product_id"))
141
- review_ids.add(payload.get("review_id"))
142
 
143
  # Track chunks per review
144
- review_id = payload.get("review_id")
145
- total_chunks = payload.get("total_chunks", 1)
146
  if review_id:
147
- chunks_per_review[review_id] = total_chunks
148
 
149
  print(f" Scanned {len(ratings):,} total chunks")
150
 
@@ -280,6 +281,17 @@ def generate_figures(stats: dict) -> None:
280
  print(f" Saved: {FIGURES_DIR / 'temporal_distribution.png'}")
281
 
282
 
 
 
 
 
 
 
 
 
 
 
 
283
  def save_eda_stats(stats: dict, collection_info: dict) -> Path:
284
  """Save EDA statistics to JSON using existing save_results pattern.
285
 
@@ -304,14 +316,7 @@ def save_eda_stats(stats: dict, collection_info: dict) -> Path:
304
  if unique_reviews
305
  else 0.0,
306
  },
307
- "temporal": {
308
- "start_date": datetime.fromtimestamp(
309
- min(t for t in stats["timestamps"] if t > 0) / 1000
310
- ).strftime("%Y-%m-%d"),
311
- "end_date": datetime.fromtimestamp(
312
- max(t for t in stats["timestamps"] if t > 0) / 1000
313
- ).strftime("%Y-%m-%d"),
314
- },
315
  "rating_distribution": stats["rating_dist"],
316
  "chunk_length": {
317
  "median_chars": int(np.median(stats["text_lengths"])),
 
131
 
132
  for payload in scroll_all_payloads(client, limit=sample_size):
133
  review_id = payload.get("review_id")
134
+ rating = payload.get("rating")
135
+
136
+ # Track one rating per review (for review-level distribution)
137
  if review_id and review_id not in review_ratings:
138
+ review_ratings[review_id] = rating
139
 
140
+ ratings.append(rating)
141
  text_lengths.append(len(payload.get("text", "")))
142
  timestamps.append(payload.get("timestamp", 0))
143
  product_ids.add(payload.get("product_id"))
144
+ review_ids.add(review_id)
145
 
146
  # Track chunks per review
 
 
147
  if review_id:
148
+ chunks_per_review[review_id] = payload.get("total_chunks", 1)
149
 
150
  print(f" Scanned {len(ratings):,} total chunks")
151
 
 
281
  print(f" Saved: {FIGURES_DIR / 'temporal_distribution.png'}")
282
 
283
 
284
def _compute_temporal_range(timestamps: list, *, tz=None) -> dict:
    """Extract start/end dates from millisecond timestamps.

    Args:
        timestamps: Epoch timestamps expressed in milliseconds. Entries that
            are falsy (None, 0) or not strictly positive are ignored.
        tz: Optional ``tzinfo`` used when converting to calendar dates. The
            default (None) converts in the host's local time zone, which is
            what this helper has always done; pass ``timezone.utc`` to get
            output that does not depend on the machine running the script.

    Returns:
        A dict with "start_date" and "end_date" formatted as "YYYY-MM-DD",
        or with both values set to None when no valid timestamp exists.
    """
    valid = [t for t in timestamps if t and t > 0]
    if not valid:
        # Nothing usable: report an explicit empty range instead of letting
        # min()/max() raise ValueError on an empty sequence.
        return {"start_date": None, "end_date": None}

    def _as_date(ms) -> str:
        # Stored values are milliseconds; fromtimestamp() expects seconds.
        return datetime.fromtimestamp(ms / 1000, tz=tz).strftime("%Y-%m-%d")

    return {
        "start_date": _as_date(min(valid)),
        "end_date": _as_date(max(valid)),
    }
293
+
294
+
295
  def save_eda_stats(stats: dict, collection_info: dict) -> Path:
296
  """Save EDA statistics to JSON using existing save_results pattern.
297
 
 
316
  if unique_reviews
317
  else 0.0,
318
  },
319
+ "temporal": _compute_temporal_range(stats["timestamps"]),
 
 
 
 
 
 
 
320
  "rating_distribution": stats["rating_dist"],
321
  "chunk_length": {
322
  "median_chars": int(np.median(stats["text_lengths"])),