batch insert

- .gitignore            +0 -4
- Dockerfile            +3 -14
- app.py                +175 -69
- crontab               +0 -4
- last_push_date.txt    +0 -1
- start_with_cron.sh    +0 -25
- upload_hf_dataset.py  +0 -108
.gitignore
CHANGED

@@ -34,7 +34,3 @@ Thumbs.db
 
 # Logs
 *.log
-
-# Uploaded data
-uploads/
-training_data/
Dockerfile
CHANGED

@@ -6,7 +6,6 @@ WORKDIR /app
 # Update system packages and install the required packages
 RUN apt-get update && apt-get install -y \
     build-essential \
-    cron \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy and install requirements

@@ -16,22 +15,12 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy the application code
 COPY . .
 
-# Copy the cron config file
-COPY crontab /etc/cron.d/batch-push-cron
-
-# Set cron permissions
-RUN chmod 0644 /etc/cron.d/batch-push-cron
-RUN crontab /etc/cron.d/batch-push-cron
-
 # Create the log directory
-RUN mkdir -p /
+RUN mkdir -p /app/logs
 
 # Hugging Face Spaces use port 7860
 EXPOSE 7860
 
-#
-
-RUN chmod +x /start_with_cron.sh
-
-CMD ["/start_with_cron.sh"]
+# Run the FastAPI server directly (APScheduler included)
+CMD ["python", "start.py"]
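The new `CMD` drops the cron bootstrap entirely; per the added comment, scheduling now runs in-process via APScheduler inside `start.py`. That file is not part of this diff, so the following is only a sketch of what such an entry point could look like — the job name `push_batch_to_hub` and the midnight schedule (mirroring the deleted crontab below) are assumptions, not code from the repo:

```python
# Hypothetical start.py: serve the FastAPI app and schedule the daily batch
# push in-process with APScheduler, replacing the cron daemon in the old image.
import uvicorn
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

from app import app  # the FastAPI instance defined in app.py


def push_batch_to_hub():
    # Placeholder daily job (assumed name); in this commit the real push
    # logic lives behind the /upload_batch_dataset endpoint instead.
    pass


if __name__ == "__main__":
    scheduler = BackgroundScheduler()
    # Mirrors the deleted crontab entry: once a day at midnight.
    scheduler.add_job(push_batch_to_hub, CronTrigger(hour=0, minute=0))
    scheduler.start()
    # Port 7860 matches the EXPOSE above (required by Hugging Face Spaces).
    uvicorn.run(app, host="0.0.0.0", port=7860)
```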
app.py
CHANGED

@@ -3,7 +3,6 @@ import json
 from typing import List, Optional
 from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel, Field, ConfigDict
-from typing import List
 import oracledb
 from dotenv import load_dotenv
 import json

@@ -56,6 +55,24 @@ class StatePayload(BaseModel):
     user_emb: Optional[List[float]] = Field(default=None, description="length=12")
     model_version: Optional[str] = None
 
+# Schema for batch data
+class BatchDataItem(BaseModel):
+    user_id: str
+    session_id: str
+    measure_date: str
+    rms: float
+    freq: float
+    fatigue: float
+    mode: str
+    window_count: int
+    windows: List[dict] = Field(default_factory=list)
+    measurement_count: int
+
+class BatchUploadPayload(BaseModel):
+    batch_data: List[BatchDataItem]
+    batch_size: int
+    batch_date: str
+
 # ----- Utilities -----
 def clob_json(obj) -> str:

@@ -74,7 +91,7 @@ def root():
         "health_db": "/health/db (DB connection check)",
         "docs": "/docs",
         "upload_state": "/upload_state",
-        "
+        "upload_batch_dataset": "/upload_batch_dataset (batch data)",
         "user_dataset": "/user_dataset/{user_id}"
     }
 }

@@ -146,91 +163,180 @@ def upload_state(p: StatePayload):
     except Exception as e:
         raise HTTPException(500, f"upload_state failed: {e}")
 
-
-
-
-"
+@app.on_event("startup")
+async def startup_event():
+    """Initialize on server startup"""
+    print("🚀 Starting the MuscleCare API server...")
+
+    # Create the log directory (local vs. deployed environment)
+    log_dir = "/app/logs" if os.path.exists("/app") else "./logs"
+    os.makedirs(log_dir, exist_ok=True)
+    print(f"📁 Log directory created: {log_dir}")
+
+    # Initialize the Oracle DB
     try:
-        # Create the local data directory
-        data_dir = "user_data"
-        os.makedirs(data_dir, exist_ok=True)
-
-        # Per-user JSON file path
-        user_file = os.path.join(data_dir, f"{user_id}.json")
-
-        # Load the existing data
-        existing_data = []
-        if os.path.exists(user_file):
-            try:
-                with open(user_file, 'r', encoding='utf-8') as f:
-                    existing_data = json.load(f)
-                print(f"📂 Existing data loaded: {user_id} ({len(existing_data)} records)")
-            except:
-                existing_data = []
-
-        # Append the new data
-        existing_data.append(data)
-
-        # Save the file
-        with open(user_file, 'w', encoding='utf-8') as f:
-            json.dump(existing_data, f, ensure_ascii=False, indent=2)
-
-        print(f"✅ Local file saved: {user_id} ({len(existing_data)} records)")
-        return {
-            "user_id": user_id,
-            "rows": len(existing_data),
-            "status": "success",
-            "filename": f"{user_id}.json",
-            "file_path": user_file,
-            "message": f"Data saved to local file: {user_file}"
-        }
+        db_initialized = init_db_from_env()
+        if db_initialized:
+            print("✅ Oracle DB connected")
+        else:
+            print("⚠️ Oracle DB connection failed - DB-dependent features are disabled")
+    except Exception as e:
+        print(f"❌ Oracle DB initialization error: {e}")
+
+    print("✅ Server startup complete")
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Clean up on server shutdown"""
+    print("🛑 Shutting down the server...")
+    try:
+        db_manager = get_db_manager()
+        db_manager.close()
+        print("✅ Oracle DB connection closed")
     except Exception as e:
-        print(f"❌
+        print(f"❌ Shutdown cleanup error: {e}")
+
 
 @app.get("/user_dataset/{user_id}")
 async def read_user_dataset(user_id: str):
-    """
+    """Fetch user data from the Hugging Face Hub"""
     try:
-        #
+        # Check the Hugging Face environment variables
+        hf_repo_id = os.getenv("HF_DATA_REPO_ID")
+        hf_token = os.getenv("HF_DATA_TOKEN")
+
+        if not hf_repo_id or not hf_token:
+            raise HTTPException(status_code=500, detail="Hugging Face configuration required (HF_DATA_REPO_ID, HF_DATA_TOKEN)")
 
+        # Load the user's data from the Hugging Face Hub
+        try:
+            dataset = load_dataset(hf_repo_id, split=user_id, token=hf_token)
+            data = dataset.to_pandas().to_dict(orient="records")
+
+            # Return the 5 most recent records
+            recent_data = data[-5:] if len(data) > 5 else data
+
+            return {
+                "user_id": user_id,
+                "count": len(data),
+                "recent_data": recent_data,
+                "filename": f"{user_id}.parquet",
+                "source": "huggingface_hub",
+                "repo_id": hf_repo_id
+            }
+
+        except Exception as e:
+            # No data for this user
             return {
                 "user_id": user_id,
                 "count": 0,
                 "recent_data": [],
-                "source": "
+                "source": "huggingface_hub",
+                "repo_id": hf_repo_id,
                 "message": "No data found"
             }
 
-
-
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"❌ Hugging Face Hub lookup failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Hugging Face Hub lookup failed: {str(e)}")
+
+@app.post("/upload_batch_dataset")
+async def upload_batch_dataset(payload: BatchUploadPayload):
+    """Push user data to the Hugging Face Hub in batches"""
+    try:
+        # Check the Hugging Face environment variables
+        hf_repo_id = os.getenv("HF_DATA_REPO_ID")
+        hf_token = os.getenv("HF_DATA_TOKEN")
+
+        if not hf_repo_id or not hf_token:
+            raise HTTPException(status_code=500, detail="Hugging Face configuration required (HF_DATA_REPO_ID, HF_DATA_TOKEN)")
+
+        # Group the data by user
+        user_data_groups = {}
+        for item in payload.batch_data:
+            user_id = item.user_id
+            if user_id not in user_data_groups:
+                user_data_groups[user_id] = []
+
+            # Convert the item into a record
+            record = {
+                "session_id": item.session_id,
+                "measure_date": item.measure_date,
+                "rms": item.rms,
+                "freq": item.freq,
+                "fatigue": item.fatigue,
+                "mode": item.mode,
+                "window_count": item.window_count,
+                "windows": item.windows,
+                "measurement_count": item.measurement_count,
+                "batch_date": payload.batch_date,
+                "batch_size": payload.batch_size,
+                "timestamp": datetime.now().isoformat()
+            }
+            user_data_groups[user_id].append(record)
+
+        results = {}
+
+        # Load every split currently in the repo
+        try:
+            existing = load_dataset(hf_repo_id, token=hf_token)
+            all_splits = list(existing.keys())
+            print(f"📂 Existing splits: {all_splits}")
+        except Exception:
+            existing = DatasetDict()
+            print("📂 No existing repo → creating a new one")
+
+        # Update only the users in this batch
+        for user_id, records in user_data_groups.items():
+            try:
+                df = pd.DataFrame(records)
+                new_dataset = Dataset.from_pandas(df)
+
+                if user_id in existing:
+                    # Merge with the existing dataframe
+                    old_df = existing[user_id].to_pandas()
+                    merged = pd.concat([old_df, df], ignore_index=True)
+                    existing[user_id] = Dataset.from_pandas(merged)
+                    print(f"📊 {user_id}: merged with existing data ({len(old_df)} + {len(df)} = {len(merged)} records)")
+                else:
+                    existing[user_id] = new_dataset
+                    print(f"📊 {user_id}: added as new data ({len(df)} records)")
+
+                results[user_id] = {
+                    "status": "success",
+                    "new_rows": len(records),
+                    "filename": f"{user_id}.parquet"
+                }
+
+            except Exception as e:
+                print(f"❌ {user_id} processing failed: {e}")
+                results[user_id] = {
+                    "status": "failed",
+                    "error": str(e)
+                }
+
+        # Push every split back in one shot
+        try:
+            existing.push_to_hub(hf_repo_id, token=hf_token, private=True)
+            print(f"✅ Full DatasetDict push complete: {len(existing)} users")
+        except Exception as e:
+            print(f"❌ Full push failed: {e}")
+            raise HTTPException(status_code=500, detail=f"Full push failed: {str(e)}")
+
         return {
-            "
-            "
-            "
-            "
-            "
+            "batch_date": payload.batch_date,
+            "batch_size": payload.batch_size,
+            "processed_users": len(user_data_groups),
+            "results": results,
+            "repo_id": hf_repo_id,
+            "message": f"Batch upload completed for {len(user_data_groups)} users"
         }
+
+    except HTTPException:
+        raise
     except Exception as e:
-        print(f"❌
-        raise HTTPException(status_code=500, detail=f"
+        print(f"❌ Batch push failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Batch push failed: {str(e)}")
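The new flow: `/upload_batch_dataset` groups incoming items by `user_id`, pulls every existing split from the Hub, merges the new records per user, and re-pushes the whole `DatasetDict` in a single `push_to_hub` call, so untouched users' splits survive each push. A hypothetical client call matching the `BatchUploadPayload` schema above (the base URL and all field values are illustrative):

```python
# Illustrative request against the new /upload_batch_dataset endpoint.
# BASE_URL and every field value are made up; only the shape follows the
# BatchDataItem / BatchUploadPayload models in the diff above.
import requests

BASE_URL = "http://localhost:7860"  # assumed local run; Spaces serve on 7860

payload = {
    "batch_data": [
        {
            "user_id": "user_001",
            "session_id": "sess_42",
            "measure_date": "2025-10-24",
            "rms": 0.31,
            "freq": 57.2,
            "fatigue": 0.12,
            "mode": "training",
            "window_count": 2,
            "windows": [{"rms": 0.30}, {"rms": 0.32}],
            "measurement_count": 120,
        }
    ],
    "batch_size": 1,
    "batch_date": "2025-10-24",
}

resp = requests.post(f"{BASE_URL}/upload_batch_dataset", json=payload)
resp.raise_for_status()
# Per-user outcome, e.g. {"user_001": {"status": "success", "new_rows": 1, ...}}
print(resp.json()["results"])
```

The trade-off of this read-modify-write design is that every batch re-downloads and re-uploads the full repo; in exchange it keeps the per-user parquet splits consistent without needing append support on the Hub.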
crontab
DELETED

@@ -1,4 +0,0 @@
-# Run the batch push every day at midnight
-0 0 * * * cd /app && python upload_hF_dataset.py >> /var/log/batch_push.log 2>&1
-
-# Blank line required (cron requirement)
last_push_date.txt
DELETED

@@ -1 +0,0 @@
-2025-10-23
start_with_cron.sh
DELETED

@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Create the log directory
-mkdir -p /var/log
-
-# Start the cron service
-service cron start
-
-# Check the cron status
-echo "📅 Cron service started"
-crontab -l
-
-# Start the FastAPI server (in the background)
-echo "🚀 Starting the FastAPI server..."
-python start.py &
-
-# Wait for the server to come up
-sleep 5
-
-# Server startup complete
-echo "✅ FastAPI server started"
-
-# Log monitoring (optional)
-echo "📊 Starting batch-push log monitoring..."
-tail -f /var/log/batch_push.log &
upload_hf_dataset.py
DELETED

@@ -1,108 +0,0 @@
-from datasets import Dataset, DatasetDict
-from datetime import datetime, date
-import pandas as pd, glob, json, os, shutil
-from dotenv import load_dotenv
-
-load_dotenv()
-
-HF_DATA_REPO_ID = os.getenv("HF_DATA_REPO_ID")
-HF_DATA_TOKEN = os.getenv("HF_DATA_TOKEN")
-CACHE_DIR = "./user_data"
-BACKUP_DIR = "./backup"
-LAST_PUSH_FILE = "./last_push_date.txt"
-
-def now_str():
-    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-def get_last_push_date():
-    """Return the date of the last push"""
-    if os.path.exists(LAST_PUSH_FILE):
-        with open(LAST_PUSH_FILE, "r") as f:
-            return f.read().strip()
-    return None
-
-def update_last_push_date():
-    """Record the date of the last push"""
-    with open(LAST_PUSH_FILE, "w") as f:
-        f.write(str(date.today()))
-
-def should_push_today():
-    """Check whether a push is due today"""
-    last_push = get_last_push_date()
-    today = str(date.today())
-    return last_push != today
-
-def batch_push_to_huggingface():
-    """Upload to the Hugging Face dataset once a day"""
-    # Check the required environment variables
-    if not HF_DATA_REPO_ID or not HF_DATA_TOKEN:
-        print(f"❌ {now_str()} - HF_DATA_REPO_ID or HF_DATA_TOKEN is not set.")
-        return
-
-    # Check whether a push is due
-    if not should_push_today():
-        print(f"📅 {now_str()} - Already pushed today. Exiting.")
-        return
-
-    files = glob.glob(os.path.join(CACHE_DIR, "*.json"))
-    if not files:
-        print(f"📂 {now_str()} - No cached files. Exiting.")
-        return
-
-    print(f"🚀 {now_str()} - Starting batch push ({len(files)} files)")
-
-    user_splits = {}
-    for path in files:
-        user_id = os.path.basename(path).split(".")[0]
-        try:
-            with open(path, "r", encoding="utf-8") as f:
-                records = json.load(f)
-            if not records:
-                print(f"⚠️ {user_id}: empty file, skipping")
-                continue
-
-            df = pd.DataFrame(records)
-            user_splits[user_id] = Dataset.from_pandas(df)
-            print(f"📊 {user_id}: converted {len(records)} records")
-
-        except Exception as e:
-            print(f"❌ {user_id}: failed to load file → {e}")
-            continue
-
-    if not user_splits:
-        print(f"❌ {now_str()} - No data to process. Exiting.")
-        return
-
-    # Create the backup directory
-    os.makedirs(BACKUP_DIR, exist_ok=True)
-    backup_path = os.path.join(BACKUP_DIR, date.today().isoformat())
-    shutil.copytree(CACHE_DIR, backup_path, dirs_exist_ok=True)
-    print(f"🗂️ {now_str()} - Data backup complete → {backup_path}")
-
-    try:
-        dataset_dict = DatasetDict(user_splits)
-        dataset_dict.push_to_hub(HF_DATA_REPO_ID, token=HF_DATA_TOKEN, private=True)
-        print(f"✅ {now_str()} - Pushed to the Hugging Face Hub ({len(user_splits)} users) → {HF_DATA_REPO_ID}")
-
-        # Clean the cache after a successful push
-        shutil.rmtree(CACHE_DIR, ignore_errors=True)
-        print(f"🗑️ {now_str()} - user_data directory removed")
-
-        update_last_push_date()
-        print(f"📅 {now_str()} - Last push date updated")
-
-    except Exception as e:
-        print(f"❌ {now_str()} - Push failed: {e}")
-        print(f"⚠️ {now_str()} - Keeping the cache (to prevent data loss)")
-        # Keep the cache on failure
-        return
-
-def main():
-    """CLI/cron entry point"""
-    try:
-        batch_push_to_huggingface()
-    except Exception as e:
-        print(f"💥 {now_str()} - Unexpected error: {e}")
-
-if __name__ == "__main__":
-    main()