seawolf2357 commited on
Commit
adf61ba
·
verified ·
1 Parent(s): 0a217e8

Upload cache_db.py

Browse files
Files changed (1) hide show
  1. cache_db.py +510 -0
cache_db.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 과제 공고 벡터 DB 캐시 시스템
3
+ - ChromaDB를 사용한 로컬 캐시
4
+ - 매일 KST 10:00, 22:00 자동 동기화
5
+ - Hugging Face Space 영구 스토리지 활용 (/data)
6
+ """
7
+ import os
8
+ import json
9
+ import hashlib
10
+ import threading
11
+ import logging
12
+ from datetime import datetime, timedelta
13
+ from typing import List, Dict, Tuple, Optional
14
+ from pathlib import Path
15
+ import pytz
16
+
17
+ # 로깅 설정
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # 영구 스토리지 경로 (HF Space)
22
+ PERSISTENT_DIR = Path("/data") if os.path.exists("/data") else Path("./data")
23
+ CACHE_DIR = PERSISTENT_DIR / "announcement_cache"
24
+ DB_PATH = CACHE_DIR / "chroma_db"
25
+ METADATA_FILE = CACHE_DIR / "sync_metadata.json"
26
+
27
+ # 디렉토리 생성
28
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
29
+
30
+ # ChromaDB 사용 가능 여부
31
+ try:
32
+ import chromadb
33
+ from chromadb.config import Settings
34
+ CHROMADB_AVAILABLE = True
35
+ except ImportError:
36
+ CHROMADB_AVAILABLE = False
37
+ logger.warning("ChromaDB not available. Using JSON fallback.")
38
+
39
+ # APScheduler 사용 가능 여부
40
+ try:
41
+ from apscheduler.schedulers.background import BackgroundScheduler
42
+ from apscheduler.triggers.cron import CronTrigger
43
+ SCHEDULER_AVAILABLE = True
44
+ except ImportError:
45
+ SCHEDULER_AVAILABLE = False
46
+ logger.warning("APScheduler not available. Auto-sync disabled.")
47
+
48
+ # 한국 시간대
49
+ KST = pytz.timezone('Asia/Seoul')
50
+
51
+
52
+ class AnnouncementCache:
53
+ """공고 캐시 관리 클래스"""
54
+
55
+ def __init__(self):
56
+ self.collection = None
57
+ self.client = None
58
+ self._init_db()
59
+ self._load_metadata()
60
+
61
+ def _init_db(self):
62
+ """ChromaDB 초기화"""
63
+ if CHROMADB_AVAILABLE:
64
+ try:
65
+ self.client = chromadb.PersistentClient(
66
+ path=str(DB_PATH),
67
+ settings=Settings(anonymized_telemetry=False)
68
+ )
69
+ self.collection = self.client.get_or_create_collection(
70
+ name="announcements",
71
+ metadata={"description": "기업마당 과제 공고 캐시"}
72
+ )
73
+ logger.info(f"ChromaDB initialized at {DB_PATH}")
74
+ except Exception as e:
75
+ logger.error(f"ChromaDB init error: {e}")
76
+ self.collection = None
77
+ else:
78
+ logger.info("Using JSON fallback storage")
79
+
80
+ def _load_metadata(self):
81
+ """동기화 메타데이터 로드"""
82
+ self.metadata = {
83
+ "last_sync": None,
84
+ "total_count": 0,
85
+ "sync_history": []
86
+ }
87
+ if METADATA_FILE.exists():
88
+ try:
89
+ with open(METADATA_FILE, 'r', encoding='utf-8') as f:
90
+ self.metadata = json.load(f)
91
+ except Exception as e:
92
+ logger.error(f"Metadata load error: {e}")
93
+
94
+ def _save_metadata(self):
95
+ """동기화 메타데이터 저장"""
96
+ try:
97
+ with open(METADATA_FILE, 'w', encoding='utf-8') as f:
98
+ json.dump(self.metadata, f, ensure_ascii=False, indent=2)
99
+ except Exception as e:
100
+ logger.error(f"Metadata save error: {e}")
101
+
102
+ def _generate_id(self, item: Dict) -> str:
103
+ """공고 고유 ID 생성"""
104
+ unique_str = f"{item.get('pblancId', '')}-{item.get('title', '')}-{item.get('pubDate', '')}"
105
+ return hashlib.md5(unique_str.encode()).hexdigest()
106
+
107
+ def _item_to_document(self, item: Dict) -> Tuple[str, str, Dict]:
108
+ """API 아이템을 ChromaDB 문서로 변환"""
109
+ doc_id = self._generate_id(item)
110
+
111
+ # 검색 가능한 텍스트 생성
112
+ searchable_text = " ".join(filter(None, [
113
+ item.get("title", ""),
114
+ item.get("author", ""),
115
+ item.get("description", ""),
116
+ item.get("hashTags", ""),
117
+ item.get("lcategory", ""),
118
+ item.get("trgetNm", ""),
119
+ item.get("excInsttNm", ""),
120
+ ]))
121
+
122
+ # 메타데이터
123
+ metadata = {
124
+ "pblancId": str(item.get("pblancId", item.get("seq", ""))),
125
+ "title": item.get("title", item.get("pblancNm", ""))[:500],
126
+ "author": item.get("author", item.get("jrsdInsttNm", ""))[:200],
127
+ "reqstDt": item.get("reqstDt", item.get("reqstBeginEndDe", ""))[:100],
128
+ "pubDate": item.get("pubDate", item.get("creatPnttm", ""))[:20],
129
+ "link": item.get("link", item.get("pblancUrl", ""))[:500],
130
+ "lcategory": item.get("lcategory", item.get("pldirSportRealmLclasCodeNm", ""))[:50],
131
+ "hashTags": item.get("hashTags", "")[:200],
132
+ "description": item.get("description", item.get("bsnsSumryCn", ""))[:1000],
133
+ "trgetNm": item.get("trgetNm", "")[:200],
134
+ "excInsttNm": item.get("excInsttNm", "")[:200],
135
+ "inqireCo": str(item.get("inqireCo", "0")),
136
+ "flpthNm": item.get("flpthNm", "")[:500],
137
+ "fileNm": item.get("fileNm", "")[:200],
138
+ "refrncNm": item.get("refrncNm", "")[:200],
139
+ "rceptEngnHmpgUrl": item.get("rceptEngnHmpgUrl", "")[:500],
140
+ "cached_at": datetime.now(KST).isoformat(),
141
+ }
142
+
143
+ return doc_id, searchable_text, metadata
144
+
145
+ def get_count(self) -> int:
146
+ """캐시된 공고 수 반환"""
147
+ if self.collection:
148
+ try:
149
+ return self.collection.count()
150
+ except:
151
+ return 0
152
+ return self.metadata.get("total_count", 0)
153
+
154
+ def get_all(self) -> List[Dict]:
155
+ """모든 캐시된 공고 반환"""
156
+ if not self.collection:
157
+ return self._get_all_from_json()
158
+
159
+ try:
160
+ result = self.collection.get(include=["metadatas", "documents"])
161
+ items = []
162
+ for i, meta in enumerate(result.get("metadatas", [])):
163
+ item = self._metadata_to_item(meta)
164
+ items.append(item)
165
+ return items
166
+ except Exception as e:
167
+ logger.error(f"Get all error: {e}")
168
+ return []
169
+
170
+ def _metadata_to_item(self, meta: Dict) -> Dict:
171
+ """메타데이터를 원본 아이템 형식으로 변환"""
172
+ return {
173
+ "pblancId": meta.get("pblancId", ""),
174
+ "seq": meta.get("pblancId", ""),
175
+ "title": meta.get("title", ""),
176
+ "pblancNm": meta.get("title", ""),
177
+ "author": meta.get("author", ""),
178
+ "jrsdInsttNm": meta.get("author", ""),
179
+ "reqstDt": meta.get("reqstDt", ""),
180
+ "reqstBeginEndDe": meta.get("reqstDt", ""),
181
+ "pubDate": meta.get("pubDate", ""),
182
+ "creatPnttm": meta.get("pubDate", ""),
183
+ "link": meta.get("link", ""),
184
+ "pblancUrl": meta.get("link", ""),
185
+ "lcategory": meta.get("lcategory", ""),
186
+ "pldirSportRealmLclasCodeNm": meta.get("lcategory", ""),
187
+ "hashTags": meta.get("hashTags", ""),
188
+ "description": meta.get("description", ""),
189
+ "bsnsSumryCn": meta.get("description", ""),
190
+ "trgetNm": meta.get("trgetNm", ""),
191
+ "excInsttNm": meta.get("excInsttNm", ""),
192
+ "inqireCo": meta.get("inqireCo", "0"),
193
+ "flpthNm": meta.get("flpthNm", ""),
194
+ "fileNm": meta.get("fileNm", ""),
195
+ "refrncNm": meta.get("refrncNm", ""),
196
+ "rceptEngnHmpgUrl": meta.get("rceptEngnHmpgUrl", ""),
197
+ }
198
+
199
+ def _get_all_from_json(self) -> List[Dict]:
200
+ """JSON 폴백에서 모든 공고 로드"""
201
+ json_file = CACHE_DIR / "announcements.json"
202
+ if json_file.exists():
203
+ try:
204
+ with open(json_file, 'r', encoding='utf-8') as f:
205
+ return json.load(f)
206
+ except:
207
+ return []
208
+ return []
209
+
210
+ def _save_to_json(self, items: List[Dict]):
211
+ """JSON 폴백으로 저장"""
212
+ json_file = CACHE_DIR / "announcements.json"
213
+ try:
214
+ with open(json_file, 'w', encoding='utf-8') as f:
215
+ json.dump(items, f, ensure_ascii=False, indent=2)
216
+ except Exception as e:
217
+ logger.error(f"JSON save error: {e}")
218
+
219
+ def bulk_upsert(self, items: List[Dict]) -> Tuple[int, int]:
220
+ """대량 삽입/업데이트"""
221
+ if not items:
222
+ return 0, 0
223
+
224
+ added = 0
225
+ updated = 0
226
+
227
+ if self.collection:
228
+ try:
229
+ # 기존 ID 목록 가져오기
230
+ existing = set()
231
+ try:
232
+ result = self.collection.get()
233
+ existing = set(result.get("ids", []))
234
+ except:
235
+ pass
236
+
237
+ ids = []
238
+ documents = []
239
+ metadatas = []
240
+
241
+ for item in items:
242
+ doc_id, doc_text, meta = self._item_to_document(item)
243
+ ids.append(doc_id)
244
+ documents.append(doc_text)
245
+ metadatas.append(meta)
246
+
247
+ if doc_id in existing:
248
+ updated += 1
249
+ else:
250
+ added += 1
251
+
252
+ # upsert
253
+ self.collection.upsert(
254
+ ids=ids,
255
+ documents=documents,
256
+ metadatas=metadatas
257
+ )
258
+
259
+ logger.info(f"Bulk upsert: {added} added, {updated} updated")
260
+
261
+ except Exception as e:
262
+ logger.error(f"Bulk upsert error: {e}")
263
+ # JSON 폴백
264
+ self._save_to_json(items)
265
+ added = len(items)
266
+ else:
267
+ # JSON 폴백
268
+ self._save_to_json(items)
269
+ added = len(items)
270
+
271
+ # 메타데이터 업데이트
272
+ self.metadata["total_count"] = self.get_count()
273
+ self.metadata["last_sync"] = datetime.now(KST).isoformat()
274
+ self._save_metadata()
275
+
276
+ return added, updated
277
+
278
+ def search(self, query: str, n_results: int = 20) -> List[Dict]:
279
+ """텍스트 검색"""
280
+ if not self.collection or not query.strip():
281
+ return self.get_all()[:n_results]
282
+
283
+ try:
284
+ result = self.collection.query(
285
+ query_texts=[query],
286
+ n_results=n_results,
287
+ include=["metadatas", "documents", "distances"]
288
+ )
289
+
290
+ items = []
291
+ for meta in result.get("metadatas", [[]])[0]:
292
+ items.append(self._metadata_to_item(meta))
293
+ return items
294
+
295
+ except Exception as e:
296
+ logger.error(f"Search error: {e}")
297
+ return []
298
+
299
+ def get_existing_ids(self) -> set:
300
+ """기존 공고 ID 집합 반환"""
301
+ if self.collection:
302
+ try:
303
+ result = self.collection.get(include=["metadatas"])
304
+ return {meta.get("pblancId", "") for meta in result.get("metadatas", [])}
305
+ except:
306
+ return set()
307
+
308
+ items = self._get_all_from_json()
309
+ return {item.get("pblancId", item.get("seq", "")) for item in items}
310
+
311
+ def remove_expired(self, days: int = 90) -> int:
312
+ """만료된 공고 삭제 (선택적)"""
313
+ # TODO: 필요시 구현
314
+ return 0
315
+
316
+
317
+ # 글로벌 캐시 인스턴스
318
+ _cache_instance = None
319
+
320
+
321
+ def get_cache() -> AnnouncementCache:
322
+ """싱글톤 캐시 인스턴스 반환"""
323
+ global _cache_instance
324
+ if _cache_instance is None:
325
+ _cache_instance = AnnouncementCache()
326
+ return _cache_instance
327
+
328
+
329
+ # ============================================================
330
+ # 동기화 함수
331
+ # ============================================================
332
+ def sync_from_api() -> Tuple[int, int, str]:
333
+ """
334
+ API에서 공고를 가져와 캐시에 동기화
335
+ Returns: (added_count, updated_count, status_message)
336
+ """
337
+ from file_api import fetch_all_from_api
338
+
339
+ cache = get_cache()
340
+ sync_time = datetime.now(KST)
341
+
342
+ logger.info(f"Starting sync at {sync_time.strftime('%Y-%m-%d %H:%M:%S')} KST")
343
+
344
+ try:
345
+ # API에서 전체 데이터 가져오기
346
+ items, error = fetch_all_from_api(category="전체", region="전체(지역)", keyword="")
347
+
348
+ if error and not items:
349
+ msg = f"❌ API 오류: {error}"
350
+ logger.error(msg)
351
+ return 0, 0, msg
352
+
353
+ if not items:
354
+ msg = "⚠️ API에서 데이터를 가져올 수 없습니다."
355
+ logger.warning(msg)
356
+ return 0, 0, msg
357
+
358
+ # 캐시에 저장
359
+ added, updated = cache.bulk_upsert(items)
360
+
361
+ # 동기화 이력 저장
362
+ sync_record = {
363
+ "timestamp": sync_time.isoformat(),
364
+ "api_count": len(items),
365
+ "added": added,
366
+ "updated": updated,
367
+ "total_cached": cache.get_count()
368
+ }
369
+
370
+ cache.metadata.setdefault("sync_history", []).append(sync_record)
371
+ # 최근 100개 이력만 유지
372
+ cache.metadata["sync_history"] = cache.metadata["sync_history"][-100:]
373
+ cache._save_metadata()
374
+
375
+ msg = f"✅ 동기화 완료: API {len(items)}건 → 신규 {added}건, 업데이트 {updated}건 (총 {cache.get_count()}건)"
376
+ logger.info(msg)
377
+ return added, updated, msg
378
+
379
+ except Exception as e:
380
+ msg = f"❌ 동기화 오류: {str(e)}"
381
+ logger.error(msg)
382
+ return 0, 0, msg
383
+
384
+
385
+ def get_cached_announcements() -> Tuple[List[Dict], str]:
386
+ """
387
+ 캐시에서 공고 목록 반환 (캐시가 비어있으면 API에서 로드)
388
+ Returns: (items, status_message)
389
+ """
390
+ cache = get_cache()
391
+ count = cache.get_count()
392
+
393
+ if count == 0:
394
+ # 초기 로드
395
+ logger.info("Cache empty, performing initial sync...")
396
+ added, updated, msg = sync_from_api()
397
+ if added == 0 and updated == 0:
398
+ return [], msg
399
+
400
+ items = cache.get_all()
401
+ last_sync = cache.metadata.get("last_sync", "알 수 없음")
402
+
403
+ status = f"📦 캐시에서 {len(items)}건 로드 (마지막 동기화: {last_sync})"
404
+ return items, status
405
+
406
+
407
+ def get_sync_status() -> Dict:
408
+ """동기화 상태 정보 반환"""
409
+ cache = get_cache()
410
+ return {
411
+ "total_count": cache.get_count(),
412
+ "last_sync": cache.metadata.get("last_sync"),
413
+ "sync_history": cache.metadata.get("sync_history", [])[-5:],
414
+ "db_path": str(DB_PATH),
415
+ "chromadb_available": CHROMADB_AVAILABLE,
416
+ "scheduler_available": SCHEDULER_AVAILABLE,
417
+ }
418
+
419
+
420
+ # ============================================================
421
+ # 스케줄러
422
+ # ============================================================
423
+ _scheduler = None
424
+
425
+
426
+ def start_scheduler():
427
+ """백그라운드 스케줄러 시작 (KST 10:00, 22:00)"""
428
+ global _scheduler
429
+
430
+ if not SCHEDULER_AVAILABLE:
431
+ logger.warning("Scheduler not available")
432
+ return False
433
+
434
+ if _scheduler is not None:
435
+ logger.info("Scheduler already running")
436
+ return True
437
+
438
+ try:
439
+ _scheduler = BackgroundScheduler(timezone=KST)
440
+
441
+ # 매일 오전 10시 (KST)
442
+ _scheduler.add_job(
443
+ sync_from_api,
444
+ CronTrigger(hour=10, minute=0, timezone=KST),
445
+ id='sync_10am',
446
+ name='Daily sync at 10:00 KST',
447
+ replace_existing=True
448
+ )
449
+
450
+ # 매일 오후 10시 (KST)
451
+ _scheduler.add_job(
452
+ sync_from_api,
453
+ CronTrigger(hour=22, minute=0, timezone=KST),
454
+ id='sync_10pm',
455
+ name='Daily sync at 22:00 KST',
456
+ replace_existing=True
457
+ )
458
+
459
+ _scheduler.start()
460
+ logger.info("Scheduler started: sync at 10:00 and 22:00 KST")
461
+ return True
462
+
463
+ except Exception as e:
464
+ logger.error(f"Scheduler start error: {e}")
465
+ return False
466
+
467
+
468
+ def stop_scheduler():
469
+ """스케줄러 중지"""
470
+ global _scheduler
471
+ if _scheduler:
472
+ _scheduler.shutdown()
473
+ _scheduler = None
474
+ logger.info("Scheduler stopped")
475
+
476
+
477
+ def manual_sync() -> str:
478
+ """수동 동기화 실행"""
479
+ added, updated, msg = sync_from_api()
480
+ return msg
481
+
482
+
483
+ # ============================================================
484
+ # 앱 시작 시 초기화
485
+ # ============================================================
486
+ def initialize_cache_system():
487
+ """캐시 시스템 초기화 (앱 시작 시 호출)"""
488
+ logger.info("Initializing cache system...")
489
+
490
+ # 캐시 초기화
491
+ cache = get_cache()
492
+ count = cache.get_count()
493
+
494
+ if count == 0:
495
+ logger.info("Cache is empty, performing initial sync...")
496
+ sync_from_api()
497
+ else:
498
+ logger.info(f"Cache loaded with {count} announcements")
499
+
500
+ # 스케줄러 시작
501
+ start_scheduler()
502
+
503
+ return get_sync_status()
504
+
505
+
506
+ if __name__ == "__main__":
507
+ # 테스트
508
+ print("Testing cache system...")
509
+ status = initialize_cache_system()
510
+ print(f"Status: {json.dumps(status, ensure_ascii=False, indent=2)}")