Ethscriptions committed on
Commit
daca5f3
·
verified ·
1 Parent(s): 8c4699d

Upload historical_sessions.py

Browse files
Files changed (1) hide show
  1. historical_sessions.py +408 -0
historical_sessions.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import date, datetime, time as dt_time, timedelta
4
+ from pathlib import Path
5
+ from typing import Iterable, List, Optional
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from cinema_api_client import fetch_canonical_movie_names
11
+
12
+
13
# Directory that contains this module; every persisted file lives beneath it.
ROOT_DIR = Path(__file__).resolve().parent
# Folder holding the cached history CSV and its manifest.
STATE_DIR = ROOT_DIR.joinpath("cinema_cache")
LOCAL_HISTORY_FILE = STATE_DIR.joinpath("historical_sessions.csv")
LOCAL_HISTORY_MANIFEST_FILE = STATE_DIR.joinpath("historical_sessions_manifest.json")
# Older CSV location; load_history_df() migrates it when no local history exists.
LEGACY_HISTORY_FILE = ROOT_DIR.joinpath("persistent_data.csv")
18
+
19
# Canonical column order of the history table. This must stay a list because
# it is used directly as a DataFrame column selector.
HISTORY_COLUMNS = [
    # Session identity: show id, raw title, cleaned title, date, time, hall.
    "showId", "影片名称", "影片名称_清理后", "放映日期", "放映时间", "影厅",
    # Per-session figures: seats, revenue, admissions, session count.
    "座位数", "总收入", "总人次", "场次",
    # Running time: minutes, 10-minute bucket, bucket label.
    "影片时长(分钟)", "影片时长档位", "影片时长类型",
    # Film metadata: code, language, format.
    "影片编码", "影片语言", "影片制式",
]
37
+
38
+
39
def ensure_state_dir() -> None:
    """Create the cache directory, including missing parents, if it is absent."""
    STATE_DIR.mkdir(exist_ok=True, parents=True)
41
+
42
+
43
def clean_movie_title(raw_title, canonical_names=None):
    """Reduce a raw listing title to a base name plus an optional format tag.

    Non-string input is returned unchanged. The base name is the longest entry
    of ``canonical_names`` contained in ``raw_title``; when none matches (or no
    names are given) it is the text before the first space. A format tag
    derived from keywords in the title is appended unless the base name
    already contains that tag.
    """
    if not isinstance(raw_title, str):
        return raw_title

    # Longest candidates first, so a more specific title wins over its prefix.
    candidates = sorted(canonical_names, key=len, reverse=True) if canonical_names else []
    matched = next((candidate for candidate in candidates if candidate in raw_title), None)
    title = matched if matched else raw_title.split(" ", 1)[0]

    tag = _detect_format_suffix(raw_title.upper())
    if tag and tag not in title:
        return f"{title}{tag}"
    return title


def _detect_format_suffix(title_upper):
    """Return the format tag implied by an upper-cased title, or ``""``."""
    # The checks are ordered: the first matching keyword decides the tag.
    if "HDR LED" in title_upper:
        return "（HDR LED）"
    if "CINITY" in title_upper:
        return "（CINITY）"
    if "杜比" in title_upper or "DOLBY" in title_upper:
        return "（杜比视界）"
    if "IMAX" in title_upper:
        return "（数字IMAX3D）" if "3D" in title_upper else "（数字IMAX）"
    if "巨幕" in title_upper:
        return "（中国巨幕立体）" if "立体" in title_upper else "（中国巨幕）"
    if "3D" in title_upper:
        return "（数字3D）"
    return ""
76
+
77
+
78
def round_minutes_to_10min(minutes):
    """Round a duration in minutes to the nearest multiple of ten (half rounds up).

    Returns ``np.nan`` when the value is not numeric, is missing, or is not
    strictly positive; otherwise returns an ``int``.
    """
    coerced = pd.to_numeric(minutes, errors="coerce")
    if pd.isna(coerced):
        return np.nan

    as_float = float(coerced)
    if as_float <= 0:
        return np.nan

    # Adding 5 before flooring to a multiple of 10 gives round-half-up.
    tens = np.floor((as_float + 5) / 10)
    return int(tens * 10)
83
+
84
+
85
def create_duration_label(minutes):
    """Format a duration bucket as a label, or return ``np.nan`` for missing input."""
    return np.nan if pd.isna(minutes) else f"{int(minutes)}分钟档"
89
+
90
+
91
def create_empty_history_df() -> pd.DataFrame:
    """Return a zero-row history table with every canonical column.

    Columns listed in the override map get explicit dtypes; all other columns
    are ``object``.
    """
    dtype_overrides = {
        "放映日期": "datetime64[ns]",
        "座位数": "int64",
        "总收入": "float64",
        "总人次": "int64",
        "场次": "int64",
        "影片时长(分钟)": "float64",
        "影片时长档位": "float64",
    }
    return pd.DataFrame(
        {
            name: pd.Series(dtype=dtype_overrides.get(name, "object"))
            for name in HISTORY_COLUMNS
        }
    )
101
+
102
+
103
def normalize_time_value(value):
    """Convert a cell value into a ``datetime.time`` truncated to whole minutes.

    Accepted inputs, checked in this order:
      * missing values -> ``None``
      * ``datetime`` / ``time`` objects -> seconds and microseconds zeroed
      * numbers in ``[0, 1)`` -> read as a fraction of a day, rounded to the
        nearest minute
      * anything else -> parsed from its string form with ``pd.to_datetime``

    Returns ``None`` when the value cannot be interpreted.
    """
    if pd.isna(value):
        return None

    # ``datetime`` is tested first so it is reduced to its time-of-day part.
    if isinstance(value, datetime):
        return value.time().replace(second=0, microsecond=0)
    if isinstance(value, dt_time):
        return value.replace(second=0, microsecond=0)

    as_number = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0]
    if pd.notna(as_number):
        day_fraction = float(as_number)
        if 0 <= day_fraction < 1:
            minutes_per_day = 24 * 60
            # The modulo wraps a value that rounds up to a full day back to 00:00.
            minute_of_day = int(round(day_fraction * 24 * 60)) % minutes_per_day
            hour, minute = divmod(minute_of_day, 60)
            return dt_time(hour, minute)

    parsed = pd.to_datetime(str(value), errors="coerce")
    return None if pd.isna(parsed) else parsed.time().replace(second=0, microsecond=0)
120
+
121
+
122
def _normalize_history_df(df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Coerce a raw session table into the canonical history layout.

    Steps, in order: add missing canonical columns, drop rows without a title,
    fill in the cleaned title, parse date and time, coerce numeric columns,
    drop rows whose duration is outside ``(0, 400]`` minutes (missing durations
    are kept), recompute the duration bucket and label, normalise text columns,
    drop rows without a parsable date or time, de-duplicate, and sort.

    De-duplication keeps the last occurrence: by ``showId`` when one is present,
    otherwise by (title, date, time, hall).

    Returns a frame with exactly ``HISTORY_COLUMNS``; ``None`` or empty input
    yields the empty template from ``create_empty_history_df``.
    """
    if df is None or df.empty:
        return create_empty_history_df()

    frame = df.copy()
    absent_columns = [name for name in HISTORY_COLUMNS if name not in frame.columns]
    for name in absent_columns:
        frame[name] = np.nan

    # ``astype(str)`` turns NaN into the literal "nan", so both forms are dropped.
    titles = frame["影片名称"].astype(str).str.strip()
    frame["影片名称"] = titles
    frame = frame[titles.ne("") & titles.ne("nan")].copy()

    existing_clean = frame["影片名称_清理后"]
    derived_clean = frame["影片名称"].apply(clean_movie_title)
    frame["影片名称_清理后"] = (
        existing_clean.where(existing_clean.notna(), derived_clean).astype(str).str.strip()
    )

    frame["放映日期"] = pd.to_datetime(frame["放映日期"], errors="coerce").dt.normalize()
    frame["放映时间"] = frame["放映时间"].apply(normalize_time_value)

    for count_column in ("座位数", "总人次", "场次"):
        as_number = pd.to_numeric(frame[count_column], errors="coerce")
        frame[count_column] = as_number.fillna(0).round().astype(int)
    frame["总收入"] = pd.to_numeric(frame["总收入"], errors="coerce").fillna(0.0).astype(float)

    minutes = pd.to_numeric(frame["影片时长(分钟)"], errors="coerce")
    frame["影片时长(分钟)"] = minutes
    # Unknown durations are kept; known ones must fall within (0, 400].
    plausible = minutes.isna() | (minutes.gt(0) & minutes.le(400))
    frame = frame[plausible].copy()
    frame["影片时长档位"] = frame["影片时长(分钟)"].apply(round_minutes_to_10min)
    frame["影片时长类型"] = frame["影片时长档位"].apply(create_duration_label)

    for text_column in ("影厅", "showId", "影片编码", "影片语言", "影片制式"):
        frame[text_column] = frame[text_column].fillna("").astype(str).str.strip()

    frame = frame.dropna(subset=["放映日期", "放映时间"]).copy()
    # Temporary string form of the time, used for de-duplication and sorting.
    frame["放映时间_str"] = frame["放映时间"].apply(
        lambda moment: moment.strftime("%H:%M:%S") if isinstance(moment, dt_time) else ""
    )

    has_show_id = frame["showId"].ne("")
    keyed = frame[has_show_id].copy()
    unkeyed = frame[~has_show_id].copy()

    if not keyed.empty:
        keyed = keyed.drop_duplicates(subset=["showId"], keep="last")
    if not unkeyed.empty:
        unkeyed = unkeyed.drop_duplicates(
            subset=["影片名称", "放映日期", "放映时间_str", "影厅"],
            keep="last",
        )

    combined = pd.concat([keyed, unkeyed], ignore_index=True)
    combined = combined.sort_values(["放映日期", "放映时间_str", "影厅", "影片名称"])
    combined = combined.reset_index(drop=True)
    return combined.drop(columns=["放映时间_str"])[HISTORY_COLUMNS]
177
+
178
+
179
def load_history_df() -> pd.DataFrame:
    """Load the persisted session history, migrating the legacy CSV if needed.

    The local history file is preferred. When it does not exist but the legacy
    file does, the legacy data is normalised, saved to the local file, and
    returned. The first file that exists decides the outcome: if reading it
    fails, an empty history is returned without trying the other file.

    Returns:
        A normalised history DataFrame; empty when no file exists or loading
        fails. Failures are logged rather than raised.
    """
    import logging

    ensure_state_dir()

    # Identifier and text columns must be read as strings. With dtype inference
    # a showId column containing blanks becomes float ("123" -> "123.0") and
    # codes or titles lose leading zeros, which breaks de-duplication against
    # freshly fetched rows. Keys naming columns absent from the file are ignored.
    text_dtypes = {
        name: str
        for name in ("showId", "影片名称", "影片名称_清理后", "影厅", "影片编码", "影片语言", "影片制式")
    }

    for source, is_legacy in ((LOCAL_HISTORY_FILE, False), (LEGACY_HISTORY_FILE, True)):
        if not source.exists():
            continue
        try:
            history_df = _normalize_history_df(pd.read_csv(source, dtype=text_dtypes))
            if is_legacy:
                # One-time migration into the local history file.
                save_history_df(history_df)
            return history_df
        except Exception:
            # Best-effort load: keep the caller running, but record the failure.
            logging.getLogger(__name__).exception(
                "Failed to load session history from %s", source
            )
            return create_empty_history_df()

    return create_empty_history_df()
198
+
199
+
200
def save_history_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise ``df`` and persist it as the local history CSV.

    The CSV is first written to a sibling temporary file and then moved over
    the real file, so an interrupted write cannot leave a truncated history
    behind.

    Returns:
        The normalised DataFrame that was written.
    """
    ensure_state_dir()
    normalized = _normalize_history_df(df)

    temp_file = LOCAL_HISTORY_FILE.with_name(LOCAL_HISTORY_FILE.name + ".tmp")
    normalized.to_csv(temp_file, index=False)
    # Path.replace overwrites the destination in a single rename.
    temp_file.replace(LOCAL_HISTORY_FILE)
    return normalized
205
+
206
+
207
def merge_history_df(existing_df: Optional[pd.DataFrame], new_df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Append ``new_df`` to ``existing_df``, then normalise and persist the result.

    ``None`` or empty inputs are skipped. Rows from ``new_df`` come last in the
    concatenation that is handed to ``save_history_df``.

    Returns:
        The DataFrame returned by ``save_history_df``.
    """
    usable = [
        frame
        for frame in (existing_df, new_df)
        if frame is not None and not frame.empty
    ]
    if usable:
        combined = pd.concat(usable, ignore_index=True)
    else:
        combined = create_empty_history_df()
    return save_history_df(combined)
215
+
216
+
217
def prepare_manual_report_history_df(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Convert a manually supplied report table into normalised history rows.

    Columns labelled ``0, 1, 2, 5, 6, 7`` are renamed to title, show date, show
    time, admissions, revenue and seat count respectively. Every row counts as
    one session. Fields the report does not provide (hall, show id, film code,
    language, format, duration) are left blank or missing.

    Args:
        raw_df: The report table; ``None`` or empty yields an empty history.

    Returns:
        A normalised history DataFrame.

    Raises:
        KeyError: If a required column is absent after renaming.
    """
    if raw_df is None or raw_df.empty:
        return create_empty_history_df()

    positional_names = {
        0: "影片名称",
        1: "放映日期",
        2: "放映时间",
        5: "总人次",
        6: "总收入",
        7: "座位数",
    }
    required_cols = ["影片名称", "放映日期", "放映时间", "座位数", "总收入", "总人次", "场次"]

    prepared = raw_df.rename(columns=positional_names)
    prepared["场次"] = 1

    missing = [name for name in required_cols if name not in prepared.columns]
    if missing:
        raise KeyError(f"Manual report is missing required columns: {missing}")

    # Take an explicit copy so the assignments below modify an independent
    # frame rather than a column-subset selection (avoids SettingWithCopyWarning).
    prepared = (
        prepared[required_cols]
        .dropna(subset=["影片名称", "放映日期", "放映时间"])
        .copy()
    )
    prepared["影片名称_清理后"] = prepared["影片名称"].apply(clean_movie_title)
    for blank_column in ("影厅", "showId", "影片编码", "影片语言", "影片制式"):
        prepared[blank_column] = ""
    for unknown_column in ("影片时长(分钟)", "影片时长档位", "影片时长类型"):
        prepared[unknown_column] = np.nan
    return _normalize_history_df(prepared)
247
+
248
+
249
def prepare_history_df_from_schedule(schedule_list: List[dict], show_date: str, hall_seat_map=None, token: Optional[str] = None) -> pd.DataFrame:
    """Build normalised history rows from a list of schedule entries.

    Args:
        schedule_list: Schedule entries as dicts; entries without a
            ``movieName`` or ``showStartTime`` are skipped.
        show_date: Date stored as the show date of every row.
        hall_seat_map: Optional mapping of hall id to seat count. Keys are
            compared as strings; unknown halls get 0 seats.
        token: When truthy, canonical movie names are fetched with it and used
            to clean the titles.

    Returns:
        A normalised history DataFrame. Both title columns hold the cleaned
        title.
    """
    if not schedule_list:
        return create_empty_history_df()

    seats_by_hall = {str(hall): seats for hall, seats in (hall_seat_map or {}).items()}
    canonical_names = fetch_canonical_movie_names(token, show_date) if token else []

    records = []
    for entry in schedule_list:
        title = entry.get("movieName")
        starts_at = entry.get("showStartTime")
        if not (title and starts_at):
            continue

        length_minutes = pd.to_numeric(entry.get("movieLength"), errors="coerce")
        length_bucket = round_minutes_to_10min(length_minutes)
        clean_title = clean_movie_title(title, canonical_names or None)
        hall_key = str(entry.get("hallId") or "").strip()

        records.append(
            {
                "showId": str(entry.get("showId") or "").strip(),
                "影片名称": clean_title,
                "影片名称_清理后": clean_title,
                "放映日期": show_date,
                "放映时间": starts_at,
                "影厅": entry.get("hallName") or "",
                "座位数": seats_by_hall.get(hall_key, 0),
                "总收入": pd.to_numeric(entry.get("soldBoxOffice"), errors="coerce"),
                "总人次": pd.to_numeric(entry.get("soldTicketNum"), errors="coerce"),
                "场次": 1,
                "影片时长(分钟)": length_minutes,
                "影片时长档位": length_bucket,
                "影片时长类型": create_duration_label(length_bucket),
                "影片编码": str(entry.get("movieNum") or "").strip(),
                "影片语言": str(entry.get("movieLanguage") or "").strip(),
                "影片制式": str(entry.get("movieMediaType") or "").strip(),
            }
        )

    return _normalize_history_df(pd.DataFrame(records))
288
+
289
+
290
def get_available_date_set(df: Optional[pd.DataFrame]) -> set:
    """Return the distinct show dates in ``df`` as ``datetime.date`` objects.

    Values that cannot be parsed as dates are ignored. ``None``, an empty
    frame, or a frame without the show-date column yields an empty set.
    """
    if df is None or df.empty or "放映日期" not in df.columns:
        return set()

    found = set()
    for stamp in pd.to_datetime(df["放映日期"], errors="coerce").dropna():
        found.add(stamp.date())
    return found
294
+
295
+
296
def find_missing_dates(df: Optional[pd.DataFrame], start_date: date, end_date: date) -> List[date]:
    """List the days in ``[start_date, end_date]`` that have no rows in ``df``.

    Both bounds are inclusive and the result is in ascending order. An inverted
    range yields an empty list.
    """
    if start_date > end_date:
        return []

    known_dates = get_available_date_set(df)
    day_count = (end_date - start_date).days + 1
    every_day = (start_date + timedelta(days=offset) for offset in range(day_count))
    return [day for day in every_day if day not in known_dates]
307
+
308
+
309
def build_duration_reference_from_history(df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Summarise the running times recorded for each cleaned movie title.

    Rows are grouped by cleaned title, duration (rounded to whole minutes),
    duration bucket and bucket label. Each group reports the first raw title
    seen and the summed session count. Rows lacking a cleaned title or a
    numeric duration are ignored.

    Returns:
        A DataFrame sorted by cleaned title and duration; an empty frame with
        the same columns when there is nothing to summarise.
    """
    minutes_col = "影片时长(分钟)"
    output_columns = ["影片", "影片名称_清理后", minutes_col, "影片时长档位", "影片时长类型", "记录场次"]

    if df is None or df.empty or minutes_col not in df.columns:
        return pd.DataFrame(columns=output_columns)

    working = df.copy()
    working[minutes_col] = pd.to_numeric(working[minutes_col], errors="coerce")
    working = working.dropna(subset=["影片名称_清理后", minutes_col]).copy()
    if working.empty:
        return pd.DataFrame(columns=output_columns)

    working[minutes_col] = working[minutes_col].round().astype(int)
    working["影片时长档位"] = working[minutes_col].apply(round_minutes_to_10min)
    working["影片时长类型"] = working["影片时长档位"].apply(create_duration_label)
    working["影片"] = working["影片名称"]

    group_keys = ["影片名称_清理后", minutes_col, "影片时长档位", "影片时长类型"]
    grouped = working.groupby(group_keys, as_index=False).agg(
        影片=("影片", "first"),
        记录场次=("场次", "sum"),
    )
    ordered = grouped.sort_values(["影片名称_清理后", minutes_col]).reset_index(drop=True)
    return ordered[output_columns]
335
+
336
+
337
def summarize_total_box_office_by_movies(df: Optional[pd.DataFrame], movie_names: Iterable[str]) -> pd.DataFrame:
    """Total the revenue recorded in ``df`` for each requested movie.

    Names are stripped and blank ones are ignored. Movies with no history get
    a total of 0.0. When history exists, the result is sorted by total
    (descending) and then by name; with ``None`` or empty history, the
    requested order is kept.

    Returns:
        A DataFrame with the columns ``影片`` and ``总票房``.
    """
    wanted = []
    for raw_name in movie_names or []:
        label = str(raw_name).strip()
        if label:
            wanted.append(label)

    if not wanted:
        return pd.DataFrame(columns=["影片", "总票房"])
    if df is None or df.empty:
        return pd.DataFrame({"影片": wanted, "总票房": [0.0] * len(wanted)})

    # Non-numeric revenue counts as zero.
    revenue = pd.to_numeric(df["总收入"], errors="coerce").fillna(0.0)
    working = df.assign(总收入=revenue)
    totals_by_title = working.groupby("影片名称_清理后")["总收入"].sum().to_dict()

    rows = [(label, float(totals_by_title.get(label, 0.0))) for label in wanted]
    summary = pd.DataFrame(rows, columns=["影片", "总票房"])
    summary = summary.sort_values(["总票房", "影片"], ascending=[False, True])
    return summary.reset_index(drop=True)
355
+
356
+
357
def default_history_manifest() -> dict:
    """Return a fresh manifest with no synced dates and blank timestamps."""
    return dict(
        synced_dates=[],
        updated_at="",
        last_successful_target_date="",
    )
363
+
364
+
365
def load_history_manifest() -> dict:
    """Load the sync manifest, falling back to defaults for anything unusable.

    Keys from the stored JSON object override the defaults. A manifest that is
    unreadable, not valid JSON, or not a JSON object is ignored; read and parse
    failures are logged instead of raised.

    Returns:
        The manifest dict. ``synced_dates`` is always a sorted list of unique,
        non-blank strings.
    """
    import logging

    ensure_state_dir()
    manifest = default_history_manifest()

    if LOCAL_HISTORY_MANIFEST_FILE.exists():
        try:
            payload = json.loads(LOCAL_HISTORY_MANIFEST_FILE.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both invalid JSON and undecodable bytes.
            logging.getLogger(__name__).warning(
                "Ignoring unreadable history manifest %s",
                LOCAL_HISTORY_MANIFEST_FILE,
                exc_info=True,
            )
        else:
            if isinstance(payload, dict):
                manifest.update(payload)

    # Same normalisation rule as get_synced_date_set, so the two cannot drift.
    manifest["synced_dates"] = sorted(get_synced_date_set(manifest))
    return manifest
382
+
383
+
384
def save_history_manifest(manifest: Optional[dict]) -> dict:
    """Write the manifest to disk as UTF-8 JSON and return what was written.

    Keys from ``manifest`` are layered over the defaults; a non-dict argument
    is treated as empty. ``synced_dates`` is stored as a sorted list of unique,
    non-blank strings.
    """
    ensure_state_dir()

    overrides = manifest if isinstance(manifest, dict) else {}
    merged = {**default_history_manifest(), **overrides}
    merged["synced_dates"] = sorted(get_synced_date_set(merged))

    serialized = json.dumps(merged, ensure_ascii=False, indent=2)
    LOCAL_HISTORY_MANIFEST_FILE.write_text(serialized, encoding="utf-8")
    return merged
400
+
401
+
402
def get_synced_date_set(manifest: Optional[dict]) -> set:
    """Return the manifest's synced dates as a set of stripped, non-blank strings.

    A non-dict manifest, or a ``synced_dates`` value that is not a list, yields
    an empty set. Entries are converted with ``str`` before stripping.
    """
    entries = manifest.get("synced_dates", []) if isinstance(manifest, dict) else None
    if not isinstance(entries, list):
        return set()

    stripped = (str(entry).strip() for entry in entries)
    return {text for text in stripped if text}