Merry99 committed on
Commit
57443be
Β·
1 Parent(s): 33f6b6b

batch insert

Browse files
Files changed (7) hide show
  1. .gitignore +0 -4
  2. Dockerfile +3 -14
  3. app.py +175 -69
  4. crontab +0 -4
  5. last_push_date.txt +0 -1
  6. start_with_cron.sh +0 -25
  7. upload_hf_dataset.py +0 -108
.gitignore CHANGED
@@ -34,7 +34,3 @@ Thumbs.db
34
 
35
  # 둜그
36
  *.log
37
-
38
- # μ—…λ‘œλ“œ 데이터
39
- uploads/
40
- training_data/
 
34
 
35
  # 둜그
36
  *.log
 
 
 
 
Dockerfile CHANGED
@@ -6,7 +6,6 @@ WORKDIR /app
6
  # μ‹œμŠ€ν…œ νŒ¨ν‚€μ§€ μ—…λ°μ΄νŠΈ 및 ν•„μš”ν•œ νŒ¨ν‚€μ§€ μ„€μΉ˜
7
  RUN apt-get update && apt-get install -y \
8
  build-essential \
9
- cron \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  # requirements 볡사 및 μ„€μΉ˜
@@ -16,22 +15,12 @@ RUN pip install --no-cache-dir -r requirements.txt
16
  # μ• ν”Œλ¦¬μΌ€μ΄μ…˜ μ½”λ“œ 볡사
17
  COPY . .
18
 
19
- # cron μ„€μ • 파일 볡사
20
- COPY crontab /etc/cron.d/batch-push-cron
21
-
22
- # cron κΆŒν•œ μ„€μ •
23
- RUN chmod 0644 /etc/cron.d/batch-push-cron
24
- RUN crontab /etc/cron.d/batch-push-cron
25
-
26
  # 둜그 디렉토리 생성
27
- RUN mkdir -p /var/log
28
 
29
  # Hugging Face SpaceλŠ” 포트 7860을 μ‚¬μš©ν•©λ‹ˆλ‹€
30
  EXPOSE 7860
31
 
32
- # μ‹œμž‘ 슀크립트 볡사 및 μ‹€ν–‰
33
- COPY start_with_cron.sh /start_with_cron.sh
34
- RUN chmod +x /start_with_cron.sh
35
-
36
- CMD ["/start_with_cron.sh"]
37
 
 
6
  # μ‹œμŠ€ν…œ νŒ¨ν‚€μ§€ μ—…λ°μ΄νŠΈ 및 ν•„μš”ν•œ νŒ¨ν‚€μ§€ μ„€μΉ˜
7
  RUN apt-get update && apt-get install -y \
8
  build-essential \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
  # requirements 볡사 및 μ„€μΉ˜
 
15
  # μ• ν”Œλ¦¬μΌ€μ΄μ…˜ μ½”λ“œ 볡사
16
  COPY . .
17
 
 
 
 
 
 
 
 
18
  # 둜그 디렉토리 생성
19
+ RUN mkdir -p /app/logs
20
 
21
  # Hugging Face SpaceλŠ” 포트 7860을 μ‚¬μš©ν•©λ‹ˆλ‹€
22
  EXPOSE 7860
23
 
24
+ # FastAPI μ„œλ²„ 직접 μ‹€ν–‰ (APScheduler 포함)
25
+ CMD ["python", "start.py"]
 
 
 
26
 
app.py CHANGED
@@ -3,7 +3,6 @@ import json
3
  from typing import List, Optional
4
  from fastapi import FastAPI, HTTPException, Request
5
  from pydantic import BaseModel, Field, ConfigDict
6
- from typing import List
7
  import oracledb
8
  from dotenv import load_dotenv
9
  import json
@@ -56,6 +55,24 @@ class StatePayload(BaseModel):
56
  user_emb: Optional[List[float]] = Field(default=None, description="length=12")
57
  model_version: Optional[str] = None
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # ----- μœ ν‹Έ -----
61
  def clob_json(obj) -> str:
@@ -74,7 +91,7 @@ def root():
74
  "health_db": "/health/db (DB μ—°κ²° 체크)",
75
  "docs": "/docs",
76
  "upload_state": "/upload_state",
77
- "upload_dataset": "/upload_dataset",
78
  "user_dataset": "/user_dataset/{user_id}"
79
  }
80
  }
@@ -146,91 +163,180 @@ def upload_state(p: StatePayload):
146
  except Exception as e:
147
  raise HTTPException(500, f"upload_state failed: {e}")
148
 
149
-
150
- @app.post("/upload_dataset")
151
- async def upload_to_dataset(request: Request):
152
- """둜컬 νŒŒμΌμ— μ‚¬μš©μžλ³„ 데이터 μ €μž₯"""
 
 
 
 
 
 
 
153
  try:
154
- data = await request.json()
155
- user_id = data.get("user_id")
156
-
157
- if not user_id:
158
- raise HTTPException(status_code=400, detail="user_idκ°€ ν•„μš”ν•©λ‹ˆλ‹€")
159
-
160
- # 데이터에 νƒ€μž„μŠ€νƒ¬ν”„ μΆ”κ°€
161
- data["timestamp"] = datetime.now().isoformat()
162
-
163
- # 둜컬 데이터 디렉토리 생성
164
- data_dir = "user_data"
165
- os.makedirs(data_dir, exist_ok=True)
166
-
167
- # μ‚¬μš©μžλ³„ JSON 파일 경둜
168
- user_file = os.path.join(data_dir, f"{user_id}.json")
169
-
170
- # κΈ°μ‘΄ 데이터 λ‘œλ“œ
171
- existing_data = []
172
- if os.path.exists(user_file):
173
- try:
174
- with open(user_file, 'r', encoding='utf-8') as f:
175
- existing_data = json.load(f)
176
- print(f"πŸ“Š κΈ°μ‘΄ 데이터 λ‘œλ“œ: {user_id} ({len(existing_data)}개 λ ˆμ½”λ“œ)")
177
- except:
178
- existing_data = []
179
-
180
- # μƒˆ 데이터 μΆ”κ°€
181
- existing_data.append(data)
182
-
183
- # 파일 μ €μž₯
184
- with open(user_file, 'w', encoding='utf-8') as f:
185
- json.dump(existing_data, f, ensure_ascii=False, indent=2)
186
-
187
- print(f"βœ… 둜컬 파일 μ €μž₯ μ™„λ£Œ: {user_id} ({len(existing_data)}개 λ ˆμ½”λ“œ)")
188
- return {
189
- "user_id": user_id,
190
- "rows": len(existing_data),
191
- "status": "success",
192
- "filename": f"{user_id}.json",
193
- "file_path": user_file,
194
- "message": f"Data saved to local file: {user_file}"
195
- }
196
 
 
 
 
 
 
 
 
 
197
  except Exception as e:
198
- print(f"❌ 둜컬 μ €μž₯ μ‹€νŒ¨: {e}")
199
- raise HTTPException(status_code=500, detail=f"둜컬 μ €μž₯ μ‹€νŒ¨: {str(e)}")
200
 
201
  @app.get("/user_dataset/{user_id}")
202
  async def read_user_dataset(user_id: str):
203
- """둜컬 νŒŒμΌμ—μ„œ μ‚¬μš©μž 데이터 쑰회"""
204
  try:
205
- # μ‚¬μš©μžλ³„ JSON 파일 경둜
206
- data_dir = "user_data"
207
- user_file = os.path.join(data_dir, f"{user_id}.json")
 
 
 
208
 
209
- if not os.path.exists(user_file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  return {
211
  "user_id": user_id,
212
  "count": 0,
213
  "recent_data": [],
214
- "source": "local_file",
 
215
  "message": "No data found"
216
  }
217
 
218
- # 데이터 λ‘œλ“œ
219
- with open(user_file, 'r', encoding='utf-8') as f:
220
- data = json.load(f)
 
 
 
 
 
 
 
 
 
 
221
 
222
- # 졜근 5개 λ ˆμ½”λ“œ λ°˜ν™˜
223
- recent_data = data[-5:] if len(data) > 5 else data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  return {
226
- "user_id": user_id,
227
- "count": len(data),
228
- "recent_data": recent_data,
229
- "filename": f"{user_id}.json",
230
- "source": "local_file"
 
231
  }
232
-
 
 
233
  except Exception as e:
234
- print(f"❌ 둜컬 쑰회 μ‹€νŒ¨: {e}")
235
- raise HTTPException(status_code=500, detail=f"둜컬 쑰회 μ‹€νŒ¨: {str(e)}")
236
 
 
3
  from typing import List, Optional
4
  from fastapi import FastAPI, HTTPException, Request
5
  from pydantic import BaseModel, Field, ConfigDict
 
6
  import oracledb
7
  from dotenv import load_dotenv
8
  import json
 
55
  user_emb: Optional[List[float]] = Field(default=None, description="length=12")
56
  model_version: Optional[str] = None
57
 
58
+ # 배치 λ°μ΄ν„°μš© μŠ€ν‚€λ§ˆ
59
+ class BatchDataItem(BaseModel):
60
+ user_id: str
61
+ session_id: str
62
+ measure_date: str
63
+ rms: float
64
+ freq: float
65
+ fatigue: float
66
+ mode: str
67
+ window_count: int
68
+ windows: List[dict] = Field(default_factory=list)
69
+ measurement_count: int
70
+
71
+ class BatchUploadPayload(BaseModel):
72
+ batch_data: List[BatchDataItem]
73
+ batch_size: int
74
+ batch_date: str
75
+
76
 
77
  # ----- μœ ν‹Έ -----
78
  def clob_json(obj) -> str:
 
91
  "health_db": "/health/db (DB μ—°κ²° 체크)",
92
  "docs": "/docs",
93
  "upload_state": "/upload_state",
94
+ "upload_batch_dataset": "/upload_batch_dataset (배치 데이터)",
95
  "user_dataset": "/user_dataset/{user_id}"
96
  }
97
  }
 
163
  except Exception as e:
164
  raise HTTPException(500, f"upload_state failed: {e}")
165
 
166
+ @app.on_event("startup")
167
+ async def startup_event():
168
+ """μ„œλ²„ μ‹œμž‘ μ‹œ μ΄ˆκΈ°ν™”"""
169
+ print("πŸš€ MuscleCare API μ„œλ²„ μ‹œμž‘ 쀑...")
170
+
171
+ # 둜그 디렉토리 생성 (둜컬/배포 ν™˜κ²½ ꡬ뢄)
172
+ log_dir = "/app/logs" if os.path.exists("/app") else "./logs"
173
+ os.makedirs(log_dir, exist_ok=True)
174
+ print(f"πŸ“ 둜그 디렉토리 생성: {log_dir}")
175
+
176
+ # Oracle DB μ΄ˆκΈ°ν™”
177
  try:
178
+ db_initialized = init_db_from_env()
179
+ if db_initialized:
180
+ print("βœ… Oracle DB μ—°κ²° μ™„λ£Œ")
181
+ else:
182
+ print("⚠️ Oracle DB μ—°κ²° μ‹€νŒ¨ - DB κ΄€λ ¨ κΈ°λŠ₯이 λΉ„ν™œμ„±ν™”λ©λ‹ˆλ‹€")
183
+ except Exception as e:
184
+ print(f"❌ Oracle DB μ΄ˆκΈ°ν™” 였λ₯˜: {e}")
185
+
186
+ print("βœ… μ„œλ²„ μ‹œμž‘ μ™„λ£Œ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
+ @app.on_event("shutdown")
189
+ async def shutdown_event():
190
+ """μ„œλ²„ μ’…λ£Œ μ‹œ 정리"""
191
+ print("πŸ›‘ μ„œλ²„ μ’…λ£Œ 쀑...")
192
+ try:
193
+ db_manager = get_db_manager()
194
+ db_manager.close()
195
+ print("βœ… Oracle DB μ—°κ²° μ’…λ£Œ μ™„λ£Œ")
196
  except Exception as e:
197
+ print(f"❌ μ’…λ£Œ 처리 였λ₯˜: {e}")
198
+
199
 
200
  @app.get("/user_dataset/{user_id}")
201
  async def read_user_dataset(user_id: str):
202
+ """Hugging Face Hubμ—μ„œ μ‚¬μš©μž 데이터 쑰회"""
203
  try:
204
+ # Hugging Face ν™˜κ²½λ³€μˆ˜ 확인
205
+ hf_repo_id = os.getenv("HF_DATA_REPO_ID")
206
+ hf_token = os.getenv("HF_DATA_TOKEN")
207
+
208
+ if not hf_repo_id or not hf_token:
209
+ raise HTTPException(status_code=500, detail="Hugging Face 섀정이 ν•„μš”ν•©λ‹ˆλ‹€ (HF_DATA_REPO_ID, HF_DATA_TOKEN)")
210
 
211
+ # Hugging Face Hubμ—μ„œ μ‚¬μš©μž 데이터 λ‘œλ“œ
212
+ try:
213
+ dataset = load_dataset(hf_repo_id, split=user_id, token=hf_token)
214
+ data = dataset.to_pandas().to_dict(orient="records")
215
+
216
+ # 졜근 5개 λ ˆμ½”λ“œ λ°˜ν™˜
217
+ recent_data = data[-5:] if len(data) > 5 else data
218
+
219
+ return {
220
+ "user_id": user_id,
221
+ "count": len(data),
222
+ "recent_data": recent_data,
223
+ "filename": f"{user_id}.parquet",
224
+ "source": "huggingface_hub",
225
+ "repo_id": hf_repo_id
226
+ }
227
+
228
+ except Exception as e:
229
+ # 데이터가 μ—†λŠ” 경우
230
  return {
231
  "user_id": user_id,
232
  "count": 0,
233
  "recent_data": [],
234
+ "source": "huggingface_hub",
235
+ "repo_id": hf_repo_id,
236
  "message": "No data found"
237
  }
238
 
239
+ except HTTPException:
240
+ raise
241
+ except Exception as e:
242
+ print(f"❌ Hugging Face Hub 쑰회 μ‹€νŒ¨: {e}")
243
+ raise HTTPException(status_code=500, detail=f"Hugging Face Hub 쑰회 μ‹€νŒ¨: {str(e)}")
244
+
245
+ @app.post("/upload_batch_dataset")
246
+ async def upload_batch_dataset(payload: BatchUploadPayload):
247
+ """배치 λ‹¨μœ„λ‘œ μ‚¬μš©μž 데이터λ₯Ό Hugging Face Hub둜 ν‘Έμ‹œ"""
248
+ try:
249
+ # Hugging Face ν™˜κ²½λ³€μˆ˜ 확인
250
+ hf_repo_id = os.getenv("HF_DATA_REPO_ID")
251
+ hf_token = os.getenv("HF_DATA_TOKEN")
252
 
253
+ if not hf_repo_id or not hf_token:
254
+ raise HTTPException(status_code=500, detail="Hugging Face 섀정이 ν•„μš”ν•©λ‹ˆλ‹€ (HF_DATA_REPO_ID, HF_DATA_TOKEN)")
255
+
256
+ # μ‚¬μš©μžλ³„λ‘œ 데이터 κ·Έλ£Ήν™”
257
+ user_data_groups = {}
258
+ for item in payload.batch_data:
259
+ user_id = item.user_id
260
+ if user_id not in user_data_groups:
261
+ user_data_groups[user_id] = []
262
+
263
+ # 데이터 λ³€ν™˜
264
+ record = {
265
+ "session_id": item.session_id,
266
+ "measure_date": item.measure_date,
267
+ "rms": item.rms,
268
+ "freq": item.freq,
269
+ "fatigue": item.fatigue,
270
+ "mode": item.mode,
271
+ "window_count": item.window_count,
272
+ "windows": item.windows,
273
+ "measurement_count": item.measurement_count,
274
+ "batch_date": payload.batch_date,
275
+ "batch_size": payload.batch_size,
276
+ "timestamp": datetime.now().isoformat()
277
+ }
278
+ user_data_groups[user_id].append(record)
279
+
280
+ results = {}
281
 
282
+ # ν˜„μž¬ repo에 μžˆλŠ” λͺ¨λ“  split 뢈러였기
283
+ try:
284
+ existing = load_dataset(hf_repo_id, token=hf_token)
285
+ all_splits = list(existing.keys())
286
+ print(f"πŸ“‚ κΈ°μ‘΄ splits: {all_splits}")
287
+ except Exception:
288
+ existing = DatasetDict()
289
+ print("πŸ“‚ κΈ°μ‘΄ repo μ—†μŒ β†’ μƒˆλ‘œ 생성")
290
+
291
+ # ν˜„μž¬ μ‚¬μš©μžλ§Œ μ—…λ°μ΄νŠΈ
292
+ for user_id, records in user_data_groups.items():
293
+ try:
294
+ df = pd.DataFrame(records)
295
+ new_dataset = Dataset.from_pandas(df)
296
+
297
+ if user_id in existing:
298
+ # κΈ°μ‘΄ λ°μ΄ν„°ν”„λ ˆμž„κ³Ό 병합
299
+ old_df = existing[user_id].to_pandas()
300
+ merged = pd.concat([old_df, df], ignore_index=True)
301
+ existing[user_id] = Dataset.from_pandas(merged)
302
+ print(f"πŸ“Š {user_id}: κΈ°μ‘΄ 데이터와 병합 ({len(old_df)} + {len(df)} = {len(merged)}개 λ ˆμ½”λ“œ)")
303
+ else:
304
+ existing[user_id] = new_dataset
305
+ print(f"πŸ“Š {user_id}: μ‹ κ·œ 데이터 μΆ”κ°€ ({len(df)}개 λ ˆμ½”λ“œ)")
306
+
307
+ results[user_id] = {
308
+ "status": "success",
309
+ "new_rows": len(records),
310
+ "filename": f"{user_id}.parquet"
311
+ }
312
+
313
+ except Exception as e:
314
+ print(f"❌ {user_id} 처리 μ‹€νŒ¨: {e}")
315
+ results[user_id] = {
316
+ "status": "failed",
317
+ "error": str(e)
318
+ }
319
+
320
+ # λͺ¨λ“  split ν†΅μ§Έλ‘œ λ‹€μ‹œ push
321
+ try:
322
+ existing.push_to_hub(hf_repo_id, token=hf_token, private=True)
323
+ print(f"βœ… 전체 DatasetDict ν‘Έμ‹œ μ™„λ£Œ: {len(existing)}개 μ‚¬μš©μž")
324
+ except Exception as e:
325
+ print(f"❌ 전체 ν‘Έμ‹œ μ‹€νŒ¨: {e}")
326
+ raise HTTPException(status_code=500, detail=f"전체 ν‘Έμ‹œ μ‹€νŒ¨: {str(e)}")
327
+
328
  return {
329
+ "batch_date": payload.batch_date,
330
+ "batch_size": payload.batch_size,
331
+ "processed_users": len(user_data_groups),
332
+ "results": results,
333
+ "repo_id": hf_repo_id,
334
+ "message": f"Batch upload completed for {len(user_data_groups)} users"
335
  }
336
+
337
+ except HTTPException:
338
+ raise
339
  except Exception as e:
340
+ print(f"❌ 배치 ν‘Έμ‹œ μ‹€νŒ¨: {e}")
341
+ raise HTTPException(status_code=500, detail=f"배치 ν‘Έμ‹œ μ‹€νŒ¨: {str(e)}")
342
 
crontab DELETED
@@ -1,4 +0,0 @@
1
- # 맀일 μžμ •μ— 배치 ν‘Έμ‹œ μ‹€ν–‰
2
- 0 0 * * * cd /app && python upload_hF_dataset.py >> /var/log/batch_push.log 2>&1
3
-
4
- # 빈 쀄 ν•„μš” (cron μš”κ΅¬μ‚¬ν•­)
 
 
 
 
 
last_push_date.txt DELETED
@@ -1 +0,0 @@
1
- 2025-10-23
 
 
start_with_cron.sh DELETED
@@ -1,25 +0,0 @@
1
- #!/bin/bash
2
-
3
- # 둜그 디렉토리 생성
4
- mkdir -p /var/log
5
-
6
- # cron μ„œλΉ„μŠ€ μ‹œμž‘
7
- service cron start
8
-
9
- # cron μƒνƒœ 확인
10
- echo "πŸ“… Cron μ„œλΉ„μŠ€ μ‹œμž‘λ¨"
11
- crontab -l
12
-
13
- # FastAPI μ„œλ²„ μ‹œμž‘ (λ°±κ·ΈλΌμš΄λ“œ)
14
- echo "πŸš€ FastAPI μ„œλ²„ μ‹œμž‘..."
15
- python start.py &
16
-
17
- # μ„œλ²„κ°€ μ‹œμž‘λ  λ•ŒκΉŒμ§€ λŒ€κΈ°
18
- sleep 5
19
-
20
- # μ„œλ²„ μ‹œμž‘ μ™„λ£Œ
21
- echo "βœ… FastAPI μ„œλ²„ μ‹œμž‘ μ™„λ£Œ"
22
-
23
- # 둜그 λͺ¨λ‹ˆν„°λ§ (선택사항)
24
- echo "πŸ“Š 배치 ν‘Έμ‹œ 둜그 λͺ¨λ‹ˆν„°λ§ μ‹œμž‘..."
25
- tail -f /var/log/batch_push.log &
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
upload_hf_dataset.py DELETED
@@ -1,108 +0,0 @@
1
- from datasets import Dataset, DatasetDict
2
- from datetime import datetime, date
3
- import pandas as pd, glob, json, os, shutil
4
- from dotenv import load_dotenv
5
-
6
- load_dotenv()
7
-
8
- HF_DATA_REPO_ID = os.getenv("HF_DATA_REPO_ID")
9
- HF_DATA_TOKEN = os.getenv("HF_DATA_TOKEN")
10
- CACHE_DIR = "./user_data"
11
- BACKUP_DIR = "./backup"
12
- LAST_PUSH_FILE = "./last_push_date.txt"
13
-
14
- def now_str():
15
- return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
16
-
17
- def get_last_push_date():
18
- """λ§ˆμ§€λ§‰ ν‘Έμ‹œ λ‚ μ§œ λ°˜ν™˜"""
19
- if os.path.exists(LAST_PUSH_FILE):
20
- with open(LAST_PUSH_FILE, "r") as f:
21
- return f.read().strip()
22
- return None
23
-
24
- def update_last_push_date():
25
- """λ§ˆμ§€λ§‰ ν‘Έμ‹œ λ‚ μ§œ 기둝"""
26
- with open(LAST_PUSH_FILE, "w") as f:
27
- f.write(str(date.today()))
28
-
29
- def should_push_today():
30
- """였늘 ν‘Έμ‹œ μ—¬λΆ€ 확인"""
31
- last_push = get_last_push_date()
32
- today = str(date.today())
33
- return last_push != today
34
-
35
- def batch_push_to_huggingface():
36
- """ν•˜λ£¨ 1회 Hugging Face Dataset μ—…λ‘œλ“œ"""
37
- # ν•„μˆ˜ ν™˜κ²½λ³€μˆ˜ 확인
38
- if not HF_DATA_REPO_ID or not HF_DATA_TOKEN:
39
- print(f"❌ {now_str()} - ν™˜κ²½λ³€μˆ˜ HF_DATA_REPO_ID λ˜λŠ” HF_DATA_TOKEN이 μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
40
- return
41
-
42
- # ν‘Έμ‹œ μ—¬λΆ€ 체크
43
- if not should_push_today():
44
- print(f"πŸ“… {now_str()} - 이미 였늘 ν‘Έμ‹œ μ™„λ£Œλ¨. μ’…λ£Œ.")
45
- return
46
-
47
- files = glob.glob(os.path.join(CACHE_DIR, "*.json"))
48
- if not files:
49
- print(f"πŸ“ {now_str()} - μΊμ‹œλœ 파일이 μ—†μŠ΅λ‹ˆλ‹€. μ’…λ£Œ.")
50
- return
51
-
52
- print(f"πŸš€ {now_str()} - 배치 ν‘Έμ‹œ μ‹œμž‘ ({len(files)}개 파일)")
53
-
54
- user_splits = {}
55
- for path in files:
56
- user_id = os.path.basename(path).split(".")[0]
57
- try:
58
- with open(path, "r", encoding="utf-8") as f:
59
- records = json.load(f)
60
- if not records:
61
- print(f"⚠️ {user_id}: λΉ„μ–΄μžˆλŠ” 파일, κ±΄λ„ˆλœ€")
62
- continue
63
-
64
- df = pd.DataFrame(records)
65
- user_splits[user_id] = Dataset.from_pandas(df)
66
- print(f"πŸ“Š {user_id}: {len(records)}개 λ ˆμ½”λ“œ λ³€ν™˜ μ™„λ£Œ")
67
-
68
- except Exception as e:
69
- print(f"❌ {user_id}: 파일 λ‘œλ“œ μ‹€νŒ¨ β†’ {e}")
70
- continue
71
-
72
- if not user_splits:
73
- print(f"❌ {now_str()} - μ²˜λ¦¬ν•  데이터가 μ—†μŠ΅λ‹ˆλ‹€. μ’…λ£Œ.")
74
- return
75
-
76
- # λ°±μ—… 디렉토리 생성
77
- os.makedirs(BACKUP_DIR, exist_ok=True)
78
- backup_path = os.path.join(BACKUP_DIR, date.today().isoformat())
79
- shutil.copytree(CACHE_DIR, backup_path, dirs_exist_ok=True)
80
- print(f"πŸ—‚οΈ {now_str()} - 데이터 λ°±μ—… μ™„λ£Œ β†’ {backup_path}")
81
-
82
- try:
83
- dataset_dict = DatasetDict(user_splits)
84
- dataset_dict.push_to_hub(HF_DATA_REPO_ID, token=HF_DATA_TOKEN, private=True)
85
- print(f"βœ… {now_str()} - Hugging Face Hub ν‘Έμ‹œ 성곡 ({len(user_splits)}λͺ…) β†’ {HF_DATA_REPO_ID}")
86
-
87
- # ν‘Έμ‹œ 성곡 μ‹œ μΊμ‹œ 정리
88
- shutil.rmtree(CACHE_DIR, ignore_errors=True)
89
- print(f"πŸ—‘οΈ {now_str()} - user_data 디렉토리 μ‚­μ œ μ™„λ£Œ")
90
-
91
- update_last_push_date()
92
- print(f"πŸ“… {now_str()} - λ§ˆμ§€λ§‰ ν‘Έμ‹œ λ‚ μ§œ μ—…λ°μ΄νŠΈ μ™„λ£Œ")
93
-
94
- except Exception as e:
95
- print(f"❌ {now_str()} - ν‘Έμ‹œ μ‹€νŒ¨: {e}")
96
- print(f"⚠️ {now_str()} - μΊμ‹œ μœ μ§€ (데이터 μœ μ‹€ λ°©μ§€)")
97
- # μ‹€νŒ¨ μ‹œ μΊμ‹œ μœ μ§€
98
- return
99
-
100
- def main():
101
- """CLI/cron μ§„μž…μ """
102
- try:
103
- batch_push_to_huggingface()
104
- except Exception as e:
105
- print(f"πŸ’₯ {now_str()} - 예기치 λͺ»ν•œ 였λ₯˜ λ°œμƒ: {e}")
106
-
107
- if __name__ == "__main__":
108
- main()