seawolf2357 committed on
Commit
41a6799
·
verified ·
1 Parent(s): 7e0d8ff

Update cache_db.py

Browse files
Files changed (1) hide show
  1. cache_db.py +315 -139
cache_db.py CHANGED
@@ -1,7 +1,9 @@
1
  """
2
  과제 공고 벡터 DB 캐시 시스템
3
  - ChromaDB를 사용한 로컬 캐시
 
4
  - 매일 KST 10:00, 22:00 자동 동기화
 
5
  - Hugging Face Space 영구 스토리지 활용 (/data)
6
  """
7
  import os
@@ -9,25 +11,21 @@ import json
9
  import hashlib
10
  import threading
11
  import logging
 
 
12
  from datetime import datetime, timedelta
13
- from typing import List, Dict, Tuple, Optional
14
  from pathlib import Path
15
  import pytz
16
-
17
- # 로깅 설정
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
-
21
- # 영구 스토리지 경로 (HF Space)
22
  PERSISTENT_DIR = Path("/data") if os.path.exists("/data") else Path("./data")
23
  CACHE_DIR = PERSISTENT_DIR / "announcement_cache"
24
  DB_PATH = CACHE_DIR / "chroma_db"
 
25
  METADATA_FILE = CACHE_DIR / "sync_metadata.json"
26
-
27
- # 디렉토리 생성
28
  CACHE_DIR.mkdir(parents=True, exist_ok=True)
29
-
30
- # ChromaDB 사용 가능 여부
31
  try:
32
  import chromadb
33
  from chromadb.config import Settings
@@ -35,8 +33,6 @@ try:
35
  except ImportError:
36
  CHROMADB_AVAILABLE = False
37
  logger.warning("ChromaDB not available. Using JSON fallback.")
38
-
39
- # APScheduler 사용 가능 여부
40
  try:
41
  from apscheduler.schedulers.background import BackgroundScheduler
42
  from apscheduler.triggers.cron import CronTrigger
@@ -44,30 +40,39 @@ try:
44
  except ImportError:
45
  SCHEDULER_AVAILABLE = False
46
  logger.warning("APScheduler not available. Auto-sync disabled.")
47
-
48
- # 한국 시간대
 
 
 
 
49
  KST = pytz.timezone('Asia/Seoul')
50
-
51
-
 
 
 
 
 
 
 
 
 
52
def safe_str(value, max_len=None) -> str:
    """Convert *value* to str, mapping None to "" and optionally truncating.

    Args:
        value: Any object; None yields the empty string.
        max_len: When truthy, cap the result at this many characters
            (None or 0 means no limit).

    Returns:
        The (possibly truncated) string form of *value*.
    """
    if value is None:
        return ""
    text = str(value)
    return text[:max_len] if max_len and len(text) > max_len else text
60
-
61
-
62
  class AnnouncementCache:
63
- """공고 캐시 관리 클래스"""
64
-
65
    def __init__(self):
        """Set up the ChromaDB-backed metadata cache and load sync metadata."""
        # ChromaDB handles; both stay None when ChromaDB is unavailable
        # or initialization fails (JSON fallback is used instead).
        self.collection = None
        self.client = None
        self._init_db()
        self._load_metadata()
70
-
71
  def _init_db(self):
72
  """ChromaDB 초기화"""
73
  if CHROMADB_AVAILABLE:
@@ -78,7 +83,7 @@ class AnnouncementCache:
78
  )
79
  self.collection = self.client.get_or_create_collection(
80
  name="announcements",
81
- metadata={"description": "기업마당 과제 공고 캐시"}
82
  )
83
  logger.info(f"ChromaDB initialized at {DB_PATH}")
84
  except Exception as e:
@@ -86,7 +91,6 @@ class AnnouncementCache:
86
  self.collection = None
87
  else:
88
  logger.info("Using JSON fallback storage")
89
-
90
  def _load_metadata(self):
91
  """동기화 메타데이터 로드"""
92
  self.metadata = {
@@ -100,7 +104,6 @@ class AnnouncementCache:
100
  self.metadata = json.load(f)
101
  except Exception as e:
102
  logger.error(f"Metadata load error: {e}")
103
-
104
  def _save_metadata(self):
105
  """동기화 메타데이터 저장"""
106
  try:
@@ -108,7 +111,6 @@ class AnnouncementCache:
108
  json.dump(self.metadata, f, ensure_ascii=False, indent=2)
109
  except Exception as e:
110
  logger.error(f"Metadata save error: {e}")
111
-
112
  def _generate_id(self, item: Dict) -> str:
113
  """공고 고유 ID 생성"""
114
  pblanc_id = safe_str(item.get("pblancId") or item.get("seq", ""))
@@ -116,12 +118,9 @@ class AnnouncementCache:
116
  pub_date = safe_str(item.get("pubDate") or item.get("creatPnttm", ""))
117
  unique_str = f"{pblanc_id}-{title}-{pub_date}"
118
  return hashlib.md5(unique_str.encode()).hexdigest()
119
-
120
  def _item_to_document(self, item: Dict) -> Tuple[str, str, Dict]:
121
  """API 아이템을 ChromaDB 문서로 변환"""
122
  doc_id = self._generate_id(item)
123
-
124
- # 검색 가능한 텍스트 생성
125
  text_parts = [
126
  safe_str(item.get("title")),
127
  safe_str(item.get("pblancNm")),
@@ -135,12 +134,8 @@ class AnnouncementCache:
135
  safe_str(item.get("excInsttNm")),
136
  ]
137
  searchable_text = " ".join([t for t in text_parts if t])
138
-
139
- # 빈 텍스트 방지
140
  if not searchable_text.strip():
141
  searchable_text = "공고"
142
-
143
- # 메타데이터 (모든 값에 safe_str 적용)
144
  metadata = {
145
  "pblancId": safe_str(item.get("pblancId") or item.get("seq")),
146
  "title": safe_str(item.get("title") or item.get("pblancNm"), 500),
@@ -156,13 +151,13 @@ class AnnouncementCache:
156
  "inqireCo": safe_str(item.get("inqireCo") or "0"),
157
  "flpthNm": safe_str(item.get("flpthNm"), 500),
158
  "fileNm": safe_str(item.get("fileNm"), 200),
 
 
159
  "refrncNm": safe_str(item.get("refrncNm"), 200),
160
  "rceptEngnHmpgUrl": safe_str(item.get("rceptEngnHmpgUrl"), 500),
161
  "cached_at": datetime.now(KST).isoformat(),
162
  }
163
-
164
  return doc_id, searchable_text, metadata
165
-
166
  def get_count(self) -> int:
167
  """캐시된 공고 수 반환"""
168
  if self.collection:
@@ -171,17 +166,14 @@ class AnnouncementCache:
171
  except:
172
  return 0
173
  return self.metadata.get("total_count", 0)
174
-
175
  def get_all(self) -> List[Dict]:
176
  """모든 캐시된 공고 반환"""
177
  if not self.collection:
178
  return self._get_all_from_json()
179
-
180
  try:
181
  count = self.collection.count()
182
  if count == 0:
183
  return []
184
-
185
  result = self.collection.get(include=["metadatas"])
186
  items = []
187
  for meta in result.get("metadatas", []):
@@ -191,7 +183,6 @@ class AnnouncementCache:
191
  except Exception as e:
192
  logger.error(f"Get all error: {e}")
193
  return self._get_all_from_json()
194
-
195
  def _metadata_to_item(self, meta: Dict) -> Dict:
196
  """메타데이터를 원본 아이템 형식으로 변환"""
197
  return {
@@ -217,10 +208,11 @@ class AnnouncementCache:
217
  "inqireCo": safe_str(meta.get("inqireCo")),
218
  "flpthNm": safe_str(meta.get("flpthNm")),
219
  "fileNm": safe_str(meta.get("fileNm")),
 
 
220
  "refrncNm": safe_str(meta.get("refrncNm")),
221
  "rceptEngnHmpgUrl": safe_str(meta.get("rceptEngnHmpgUrl")),
222
  }
223
-
224
  def _get_all_from_json(self) -> List[Dict]:
225
  """JSON 폴백에서 모든 공고 로드"""
226
  json_file = CACHE_DIR / "announcements.json"
@@ -231,7 +223,6 @@ class AnnouncementCache:
231
  except:
232
  return []
233
  return []
234
-
235
  def _save_to_json(self, items: List[Dict]):
236
  """JSON 폴백으로 저장"""
237
  json_file = CACHE_DIR / "announcements.json"
@@ -241,18 +232,14 @@ class AnnouncementCache:
241
  logger.info(f"Saved {len(items)} items to JSON fallback")
242
  except Exception as e:
243
  logger.error(f"JSON save error: {e}")
244
-
245
  def bulk_upsert(self, items: List[Dict]) -> Tuple[int, int]:
246
  """대량 삽입/업데이트"""
247
  if not items:
248
  return 0, 0
249
-
250
  added = 0
251
  updated = 0
252
-
253
  if self.collection:
254
  try:
255
- # 기존 ID 목록 가져오기
256
  existing = set()
257
  try:
258
  count = self.collection.count()
@@ -263,18 +250,15 @@ class AnnouncementCache:
263
  except Exception as e:
264
  logger.warning(f"Failed to get existing IDs: {e}")
265
  existing = set()
266
-
267
  ids = []
268
  documents = []
269
  metadatas = []
270
-
271
  for item in items:
272
  try:
273
  doc_id, doc_text, meta = self._item_to_document(item)
274
  ids.append(doc_id)
275
  documents.append(doc_text)
276
  metadatas.append(meta)
277
-
278
  if doc_id in existing:
279
  updated += 1
280
  else:
@@ -282,67 +266,51 @@ class AnnouncementCache:
282
  except Exception as e:
283
  logger.warning(f"Failed to process item: {e}")
284
  continue
285
-
286
  if not ids:
287
  logger.warning("No valid items to upsert")
288
  return 0, 0
289
-
290
- # 배치 단위로 upsert (ChromaDB 제한 고려)
291
  batch_size = 100
292
  for i in range(0, len(ids), batch_size):
293
  batch_ids = ids[i:i+batch_size]
294
  batch_docs = documents[i:i+batch_size]
295
  batch_metas = metadatas[i:i+batch_size]
296
-
297
  self.collection.upsert(
298
  ids=batch_ids,
299
  documents=batch_docs,
300
  metadatas=batch_metas
301
  )
302
  logger.info(f"Upserted batch {i//batch_size + 1}: {len(batch_ids)} items")
303
-
304
  logger.info(f"Bulk upsert complete: {added} added, {updated} updated")
305
-
306
  except Exception as e:
307
  logger.error(f"Bulk upsert error: {e}")
308
  import traceback
309
  logger.error(traceback.format_exc())
310
- # JSON 폴백
311
  self._save_to_json(items)
312
  added = len(items)
313
  else:
314
- # JSON 폴백
315
  self._save_to_json(items)
316
  added = len(items)
317
-
318
- # 메타데이터 업데이트
319
  self.metadata["total_count"] = self.get_count()
320
  self.metadata["last_sync"] = datetime.now(KST).isoformat()
321
  self._save_metadata()
322
-
323
  return added, updated
324
-
325
  def search(self, query: str, n_results: int = 20) -> List[Dict]:
326
  """텍스트 검색"""
327
  if not self.collection or not query.strip():
328
  return self.get_all()[:n_results]
329
-
330
  try:
331
  result = self.collection.query(
332
  query_texts=[query],
333
  n_results=n_results,
334
  include=["metadatas"]
335
  )
336
-
337
  items = []
338
  for meta in result.get("metadatas", [[]])[0]:
339
  items.append(self._metadata_to_item(meta))
340
  return items
341
-
342
  except Exception as e:
343
  logger.error(f"Search error: {e}")
344
  return []
345
-
346
  def get_existing_ids(self) -> set:
347
  """기존 공고 ID 집합 반환"""
348
  if self.collection:
@@ -354,56 +322,300 @@ class AnnouncementCache:
354
  return {safe_str(meta.get("pblancId")) for meta in result.get("metadatas", [])}
355
  except:
356
  return set()
357
-
358
  items = self._get_all_from_json()
359
  return {safe_str(item.get("pblancId") or item.get("seq")) for item in items}
360
-
361
-
362
- # 글로벌 캐시 인스턴스
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  _cache_instance = None
364
-
365
-
366
  def get_cache() -> AnnouncementCache:
367
- """싱글톤 캐시 인스턴스 반환"""
368
  global _cache_instance
369
  if _cache_instance is None:
370
  _cache_instance = AnnouncementCache()
371
  return _cache_instance
372
-
373
-
374
- # ============================================================
375
- # 동기화 함수
376
- # ============================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  def sync_from_api() -> Tuple[int, int, str]:
378
- """
379
- API에서 공고를 가져와 캐시에 동기화
380
- Returns: (added_count, updated_count, status_message)
381
- """
382
  from file_api import fetch_all_from_api
383
-
384
  cache = get_cache()
385
  sync_time = datetime.now(KST)
386
-
387
  logger.info(f"Starting sync at {sync_time.strftime('%Y-%m-%d %H:%M:%S')} KST")
388
-
389
  try:
390
- # API에서 전체 데이터 가져오기
391
  items, error = fetch_all_from_api(category="전체", region="전체(지역)", keyword="")
392
-
393
  if error and not items:
394
  msg = f"❌ API 오류: {error}"
395
  logger.error(msg)
396
  return 0, 0, msg
397
-
398
  if not items:
399
  msg = "⚠️ API에서 데이터를 가져올 수 없습니다."
400
  logger.warning(msg)
401
  return 0, 0, msg
402
-
403
- # 캐시에 저장
404
  added, updated = cache.bulk_upsert(items)
405
-
406
- # 동기화 이력 저장
407
  sync_record = {
408
  "timestamp": sync_time.isoformat(),
409
  "api_count": len(items),
@@ -411,49 +623,38 @@ def sync_from_api() -> Tuple[int, int, str]:
411
  "updated": updated,
412
  "total_cached": cache.get_count()
413
  }
414
-
415
  cache.metadata.setdefault("sync_history", []).append(sync_record)
416
- # 최근 100개 이력만 유지
417
  cache.metadata["sync_history"] = cache.metadata["sync_history"][-100:]
418
  cache._save_metadata()
419
-
420
  msg = f"✅ 동기화 완료: API {len(items)}건 → 신규 {added}건, 업데이트 {updated}건 (총 {cache.get_count()}건)"
421
  logger.info(msg)
 
 
422
  return added, updated, msg
423
-
424
  except Exception as e:
425
  import traceback
426
  logger.error(f"Sync error: {e}")
427
  logger.error(traceback.format_exc())
428
  msg = f"❌ 동기화 오류: {str(e)}"
429
  return 0, 0, msg
430
-
431
-
432
def get_cached_announcements() -> Tuple[List[Dict], str]:
    """Return announcements from the cache, syncing first when it is empty.

    Returns:
        (items, status_message) — *items* is empty when the cache was empty
        and the initial sync produced nothing.
    """
    cache = get_cache()
    if cache.get_count() == 0:
        # Cold start: populate the cache from the API before serving.
        logger.info("Cache empty, performing initial sync...")
        added, updated, msg = sync_from_api()
        if added == 0 and updated == 0:
            return [], msg
    items = cache.get_all()
    last_sync = cache.metadata.get("last_sync", "알 수 없음")
    return items, f"📦 캐시에서 {len(items)}건 로드 (마지막 동기화: {last_sync})"
452
-
453
-
454
  def get_sync_status() -> Dict:
455
  """동기화 상태 정보 반환"""
456
  cache = get_cache()
 
 
457
  return {
458
  "total_count": cache.get_count(),
459
  "last_sync": cache.metadata.get("last_sync"),
@@ -461,31 +662,24 @@ def get_sync_status() -> Dict:
461
  "db_path": str(DB_PATH),
462
  "chromadb_available": CHROMADB_AVAILABLE,
463
  "scheduler_available": SCHEDULER_AVAILABLE,
 
 
 
 
 
464
  }
465
-
466
-
467
- # ============================================================
468
- # 스케줄러
469
- # ============================================================
470
  _scheduler = None
471
-
472
-
473
  def start_scheduler():
474
  """백그라운드 스케줄러 시작 (KST 10:00, 22:00)"""
475
  global _scheduler
476
-
477
  if not SCHEDULER_AVAILABLE:
478
  logger.warning("Scheduler not available")
479
  return False
480
-
481
  if _scheduler is not None:
482
  logger.info("Scheduler already running")
483
  return True
484
-
485
  try:
486
  _scheduler = BackgroundScheduler(timezone=KST)
487
-
488
- # 매일 오전 10시 (KST)
489
  _scheduler.add_job(
490
  sync_from_api,
491
  CronTrigger(hour=10, minute=0, timezone=KST),
@@ -493,8 +687,6 @@ def start_scheduler():
493
  name='Daily sync at 10:00 KST',
494
  replace_existing=True
495
  )
496
-
497
- # 매일 오후 10시 (KST)
498
  _scheduler.add_job(
499
  sync_from_api,
500
  CronTrigger(hour=22, minute=0, timezone=KST),
@@ -502,16 +694,12 @@ def start_scheduler():
502
  name='Daily sync at 22:00 KST',
503
  replace_existing=True
504
  )
505
-
506
  _scheduler.start()
507
  logger.info("Scheduler started: sync at 10:00 and 22:00 KST")
508
  return True
509
-
510
  except Exception as e:
511
  logger.error(f"Scheduler start error: {e}")
512
  return False
513
-
514
-
515
  def stop_scheduler():
516
  """스케줄러 중지"""
517
  global _scheduler
@@ -519,39 +707,27 @@ def stop_scheduler():
519
  _scheduler.shutdown()
520
  _scheduler = None
521
  logger.info("Scheduler stopped")
522
-
523
-
524
def manual_sync() -> str:
    """Run one synchronization immediately and return its status message."""
    # Only the human-readable message is needed here; the counts are
    # intentionally discarded (underscore-prefixed to mark them unused).
    _added, _updated, msg = sync_from_api()
    return msg
528
-
529
-
530
- # ============================================================
531
- # 앱 시작 시 초기화
532
- # ============================================================
533
def initialize_cache_system():
    """Initialize the cache system (call once at app startup).

    Performs an initial sync when the cache is empty, starts the background
    scheduler, and returns the current sync status dict.
    """
    logger.info("Initializing cache system...")
    cached = get_cache().get_count()
    if cached:
        logger.info(f"Cache loaded with {cached} announcements")
    else:
        logger.info("Cache is empty, performing initial sync...")
        sync_from_api()
    start_scheduler()
    return get_sync_status()
551
-
552
-
553
if __name__ == "__main__":
    # Smoke test: initialize the cache system and print its sync status.
    print("Testing cache system...")
    status = initialize_cache_system()
    print(f"Status: {json.dumps(status, ensure_ascii=False, indent=2)}")
 
1
  """
2
  과제 공고 벡터 DB 캐시 시스템
3
  - ChromaDB를 사용한 로컬 캐시
4
+ - 메타 캐시 + 본문 벡터 캐시 (2개 컬렉션)
5
  - 매일 KST 10:00, 22:00 자동 동기화
6
+ - 백그라운드 본문 인덱싱 (서비스 무중단)
7
  - Hugging Face Space 영구 스토리지 활용 (/data)
8
  """
9
  import os
 
11
  import hashlib
12
  import threading
13
  import logging
14
+ import tempfile
15
+ import time
16
  from datetime import datetime, timedelta
17
+ from typing import List, Dict, Tuple, Optional, Generator
18
  from pathlib import Path
19
  import pytz
 
 
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
 
 
22
  PERSISTENT_DIR = Path("/data") if os.path.exists("/data") else Path("./data")
23
  CACHE_DIR = PERSISTENT_DIR / "announcement_cache"
24
  DB_PATH = CACHE_DIR / "chroma_db"
25
+ CONTENT_DB_PATH = CACHE_DIR / "content_db"
26
  METADATA_FILE = CACHE_DIR / "sync_metadata.json"
27
+ CONTENT_INDEX_FILE = CACHE_DIR / "content_index_status.json"
 
28
  CACHE_DIR.mkdir(parents=True, exist_ok=True)
 
 
29
  try:
30
  import chromadb
31
  from chromadb.config import Settings
 
33
  except ImportError:
34
  CHROMADB_AVAILABLE = False
35
  logger.warning("ChromaDB not available. Using JSON fallback.")
 
 
36
  try:
37
  from apscheduler.schedulers.background import BackgroundScheduler
38
  from apscheduler.triggers.cron import CronTrigger
 
40
  except ImportError:
41
  SCHEDULER_AVAILABLE = False
42
  logger.warning("APScheduler not available. Auto-sync disabled.")
43
+ try:
44
+ from sentence_transformers import SentenceTransformer
45
+ EMBEDDING_AVAILABLE = True
46
+ except ImportError:
47
+ EMBEDDING_AVAILABLE = False
48
+ logger.warning("sentence-transformers not available. Using ChromaDB default embedding.")
49
  KST = pytz.timezone('Asia/Seoul')
50
_embedding_model = None


def get_embedding_model():
    """Lazily load and return the shared sentence-embedding model.

    Returns None when sentence-transformers is unavailable or loading fails;
    a failed load is retried on the next call.
    """
    global _embedding_model
    if _embedding_model is not None or not EMBEDDING_AVAILABLE:
        return _embedding_model
    try:
        _embedding_model = SentenceTransformer('jhgan/ko-sroberta-multitask')
        logger.info("Loaded Korean embedding model: jhgan/ko-sroberta-multitask")
    except Exception as e:
        logger.error(f"Failed to load embedding model: {e}")
    return _embedding_model
61
def safe_str(value, max_len=None) -> str:
    """None-safe string conversion; truncates to max_len characters when given."""
    if value is None:
        return ""
    s = str(value)
    # Truthiness check: max_len=None and max_len=0 both mean "no limit".
    if max_len and len(s) > max_len:
        return s[:max_len]
    return s
 
 
69
  class AnnouncementCache:
70
+ """공고 메타정보 캐시 관리 클래스"""
 
71
  def __init__(self):
72
  self.collection = None
73
  self.client = None
74
  self._init_db()
75
  self._load_metadata()
 
76
  def _init_db(self):
77
  """ChromaDB 초기화"""
78
  if CHROMADB_AVAILABLE:
 
83
  )
84
  self.collection = self.client.get_or_create_collection(
85
  name="announcements",
86
+ metadata={"description": "기업마당 과제 공고 메타 캐시"}
87
  )
88
  logger.info(f"ChromaDB initialized at {DB_PATH}")
89
  except Exception as e:
 
91
  self.collection = None
92
  else:
93
  logger.info("Using JSON fallback storage")
 
94
  def _load_metadata(self):
95
  """동기화 메타데이터 로드"""
96
  self.metadata = {
 
104
  self.metadata = json.load(f)
105
  except Exception as e:
106
  logger.error(f"Metadata load error: {e}")
 
107
  def _save_metadata(self):
108
  """동기화 메타데이터 저장"""
109
  try:
 
111
  json.dump(self.metadata, f, ensure_ascii=False, indent=2)
112
  except Exception as e:
113
  logger.error(f"Metadata save error: {e}")
 
114
  def _generate_id(self, item: Dict) -> str:
115
  """공고 고유 ID 생성"""
116
  pblanc_id = safe_str(item.get("pblancId") or item.get("seq", ""))
 
118
  pub_date = safe_str(item.get("pubDate") or item.get("creatPnttm", ""))
119
  unique_str = f"{pblanc_id}-{title}-{pub_date}"
120
  return hashlib.md5(unique_str.encode()).hexdigest()
 
121
  def _item_to_document(self, item: Dict) -> Tuple[str, str, Dict]:
122
  """API 아이템을 ChromaDB 문서로 변환"""
123
  doc_id = self._generate_id(item)
 
 
124
  text_parts = [
125
  safe_str(item.get("title")),
126
  safe_str(item.get("pblancNm")),
 
134
  safe_str(item.get("excInsttNm")),
135
  ]
136
  searchable_text = " ".join([t for t in text_parts if t])
 
 
137
  if not searchable_text.strip():
138
  searchable_text = "공고"
 
 
139
  metadata = {
140
  "pblancId": safe_str(item.get("pblancId") or item.get("seq")),
141
  "title": safe_str(item.get("title") or item.get("pblancNm"), 500),
 
151
  "inqireCo": safe_str(item.get("inqireCo") or "0"),
152
  "flpthNm": safe_str(item.get("flpthNm"), 500),
153
  "fileNm": safe_str(item.get("fileNm"), 200),
154
+ "printFlpthNm": safe_str(item.get("printFlpthNm"), 500),
155
+ "printFileNm": safe_str(item.get("printFileNm"), 200),
156
  "refrncNm": safe_str(item.get("refrncNm"), 200),
157
  "rceptEngnHmpgUrl": safe_str(item.get("rceptEngnHmpgUrl"), 500),
158
  "cached_at": datetime.now(KST).isoformat(),
159
  }
 
160
  return doc_id, searchable_text, metadata
 
161
  def get_count(self) -> int:
162
  """캐시된 공고 수 반환"""
163
  if self.collection:
 
166
  except:
167
  return 0
168
  return self.metadata.get("total_count", 0)
 
169
  def get_all(self) -> List[Dict]:
170
  """모든 캐시된 공고 반환"""
171
  if not self.collection:
172
  return self._get_all_from_json()
 
173
  try:
174
  count = self.collection.count()
175
  if count == 0:
176
  return []
 
177
  result = self.collection.get(include=["metadatas"])
178
  items = []
179
  for meta in result.get("metadatas", []):
 
183
  except Exception as e:
184
  logger.error(f"Get all error: {e}")
185
  return self._get_all_from_json()
 
186
  def _metadata_to_item(self, meta: Dict) -> Dict:
187
  """메타데이터를 원본 아이템 형식으로 변환"""
188
  return {
 
208
  "inqireCo": safe_str(meta.get("inqireCo")),
209
  "flpthNm": safe_str(meta.get("flpthNm")),
210
  "fileNm": safe_str(meta.get("fileNm")),
211
+ "printFlpthNm": safe_str(meta.get("printFlpthNm")),
212
+ "printFileNm": safe_str(meta.get("printFileNm")),
213
  "refrncNm": safe_str(meta.get("refrncNm")),
214
  "rceptEngnHmpgUrl": safe_str(meta.get("rceptEngnHmpgUrl")),
215
  }
 
216
  def _get_all_from_json(self) -> List[Dict]:
217
  """JSON 폴백에서 모든 공고 로드"""
218
  json_file = CACHE_DIR / "announcements.json"
 
223
  except:
224
  return []
225
  return []
 
226
  def _save_to_json(self, items: List[Dict]):
227
  """JSON 폴백으로 저장"""
228
  json_file = CACHE_DIR / "announcements.json"
 
232
  logger.info(f"Saved {len(items)} items to JSON fallback")
233
  except Exception as e:
234
  logger.error(f"JSON save error: {e}")
 
235
  def bulk_upsert(self, items: List[Dict]) -> Tuple[int, int]:
236
  """대량 삽입/업데이트"""
237
  if not items:
238
  return 0, 0
 
239
  added = 0
240
  updated = 0
 
241
  if self.collection:
242
  try:
 
243
  existing = set()
244
  try:
245
  count = self.collection.count()
 
250
  except Exception as e:
251
  logger.warning(f"Failed to get existing IDs: {e}")
252
  existing = set()
 
253
  ids = []
254
  documents = []
255
  metadatas = []
 
256
  for item in items:
257
  try:
258
  doc_id, doc_text, meta = self._item_to_document(item)
259
  ids.append(doc_id)
260
  documents.append(doc_text)
261
  metadatas.append(meta)
 
262
  if doc_id in existing:
263
  updated += 1
264
  else:
 
266
  except Exception as e:
267
  logger.warning(f"Failed to process item: {e}")
268
  continue
 
269
  if not ids:
270
  logger.warning("No valid items to upsert")
271
  return 0, 0
 
 
272
  batch_size = 100
273
  for i in range(0, len(ids), batch_size):
274
  batch_ids = ids[i:i+batch_size]
275
  batch_docs = documents[i:i+batch_size]
276
  batch_metas = metadatas[i:i+batch_size]
 
277
  self.collection.upsert(
278
  ids=batch_ids,
279
  documents=batch_docs,
280
  metadatas=batch_metas
281
  )
282
  logger.info(f"Upserted batch {i//batch_size + 1}: {len(batch_ids)} items")
 
283
  logger.info(f"Bulk upsert complete: {added} added, {updated} updated")
 
284
  except Exception as e:
285
  logger.error(f"Bulk upsert error: {e}")
286
  import traceback
287
  logger.error(traceback.format_exc())
 
288
  self._save_to_json(items)
289
  added = len(items)
290
  else:
 
291
  self._save_to_json(items)
292
  added = len(items)
 
 
293
  self.metadata["total_count"] = self.get_count()
294
  self.metadata["last_sync"] = datetime.now(KST).isoformat()
295
  self._save_metadata()
 
296
  return added, updated
 
297
  def search(self, query: str, n_results: int = 20) -> List[Dict]:
298
  """텍스트 검색"""
299
  if not self.collection or not query.strip():
300
  return self.get_all()[:n_results]
 
301
  try:
302
  result = self.collection.query(
303
  query_texts=[query],
304
  n_results=n_results,
305
  include=["metadatas"]
306
  )
 
307
  items = []
308
  for meta in result.get("metadatas", [[]])[0]:
309
  items.append(self._metadata_to_item(meta))
310
  return items
 
311
  except Exception as e:
312
  logger.error(f"Search error: {e}")
313
  return []
 
314
  def get_existing_ids(self) -> set:
315
  """기존 공고 ID 집합 반환"""
316
  if self.collection:
 
322
  return {safe_str(meta.get("pblancId")) for meta in result.get("metadatas", [])}
323
  except:
324
  return set()
 
325
  items = self._get_all_from_json()
326
  return {safe_str(item.get("pblancId") or item.get("seq")) for item in items}
327
class ContentVectorCache:
    """Vector cache of full announcement bodies, used for matching.

    Maintains a ChromaDB collection of extracted announcement texts (with
    optional precomputed embeddings) plus a JSON status file that tracks
    which announcements have been indexed, which failed, and progress of a
    running background indexing pass.
    """

    def __init__(self):
        # ChromaDB handles; stay None when ChromaDB is unavailable or init fails.
        self.collection = None
        self.client = None
        self.index_status = {}
        self._init_db()
        self._load_index_status()

    def _init_db(self):
        """Initialize the content vector DB (no-op without ChromaDB)."""
        if CHROMADB_AVAILABLE:
            try:
                self.client = chromadb.PersistentClient(
                    path=str(CONTENT_DB_PATH),
                    settings=Settings(anonymized_telemetry=False)
                )
                self.collection = self.client.get_or_create_collection(
                    name="announcement_contents",
                    metadata={"description": "공고 본문 벡터 캐시 (매칭용)"}
                )
                logger.info(f"Content VectorDB initialized at {CONTENT_DB_PATH}")
            except Exception as e:
                logger.error(f"Content VectorDB init error: {e}")
                self.collection = None

    def _load_index_status(self):
        """Load indexing status from disk, falling back to defaults."""
        self.index_status = {
            "total_indexed": 0,
            "last_index_time": None,
            "indexed_ids": [],
            "failed_ids": [],
            "in_progress": False,
            "progress_current": 0,
            "progress_total": 0
        }
        if CONTENT_INDEX_FILE.exists():
            try:
                with open(CONTENT_INDEX_FILE, 'r', encoding='utf-8') as f:
                    self.index_status = json.load(f)
            except Exception as e:
                logger.error(f"Index status load error: {e}")

    def _save_index_status(self):
        """Persist indexing status to disk (best-effort)."""
        try:
            with open(CONTENT_INDEX_FILE, 'w', encoding='utf-8') as f:
                json.dump(self.index_status, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error(f"Index status save error: {e}")

    def get_indexed_count(self) -> int:
        """Return the number of indexed announcement bodies."""
        if self.collection:
            try:
                return self.collection.count()
            # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
            # are not swallowed.
            except Exception:
                return 0
        return len(self.index_status.get("indexed_ids", []))

    def get_indexed_ids(self) -> set:
        """Return the set of announcement IDs that are already indexed."""
        if self.collection:
            try:
                count = self.collection.count()
                if count == 0:
                    return set()
                result = self.collection.get(include=["metadatas"])
                return {safe_str(meta.get("pblancId")) for meta in result.get("metadatas", [])}
            except Exception:  # was bare except
                return set()
        return set(self.index_status.get("indexed_ids", []))

    def add_content(self, pblanc_id: str, title: str, content_text: str,
                    structured_info: Optional[Dict] = None) -> bool:
        """Add (or update) one announcement body in the vector cache.

        Args:
            pblanc_id: Announcement ID; hashed (md5) into the document ID.
            title: Announcement title (truncated to 500 chars in metadata).
            content_text: Extracted body text; first 10000 chars are stored.
            structured_info: Optional dict whose "eligibility",
                "support_details" and "evaluation_criteria" entries are
                JSON-encoded into metadata (truncated to 2000 chars each).

        Returns:
            True on success, False when the collection is unavailable,
            the text is blank, or the upsert fails (ID is then recorded
            in failed_ids).
        """
        if not self.collection or not content_text.strip():
            return False
        try:
            doc_id = hashlib.md5(pblanc_id.encode()).hexdigest()
            metadata = {
                "pblancId": safe_str(pblanc_id),
                "title": safe_str(title, 500),
                "content_length": len(content_text),
                "indexed_at": datetime.now(KST).isoformat(),
            }
            if structured_info:
                metadata["eligibility"] = safe_str(json.dumps(structured_info.get("eligibility", {}), ensure_ascii=False), 2000)
                metadata["support_details"] = safe_str(json.dumps(structured_info.get("support_details", {}), ensure_ascii=False), 2000)
                metadata["evaluation_criteria"] = safe_str(json.dumps(structured_info.get("evaluation_criteria", {}), ensure_ascii=False), 2000)
            # Prefer an explicit embedding from the Korean model; fall back to
            # ChromaDB's default embedding when the model is unavailable.
            embedding = None
            model = get_embedding_model()
            if model:
                try:
                    summary_text = content_text[:2000]
                    embedding = model.encode(summary_text).tolist()
                except Exception as e:
                    logger.warning(f"Embedding generation failed: {e}")
            if embedding:
                self.collection.upsert(
                    ids=[doc_id],
                    embeddings=[embedding],
                    documents=[content_text[:10000]],
                    metadatas=[metadata]
                )
            else:
                self.collection.upsert(
                    ids=[doc_id],
                    documents=[content_text[:10000]],
                    metadatas=[metadata]
                )
            if pblanc_id not in self.index_status.get("indexed_ids", []):
                self.index_status.setdefault("indexed_ids", []).append(pblanc_id)
            self.index_status["total_indexed"] = self.get_indexed_count()
            self.index_status["last_index_time"] = datetime.now(KST).isoformat()
            return True
        except Exception as e:
            logger.error(f"Add content error for {pblanc_id}: {e}")
            if pblanc_id not in self.index_status.get("failed_ids", []):
                self.index_status.setdefault("failed_ids", []).append(pblanc_id)
            return False

    def search_similar(self, query_text: str, n_results: int = 20) -> List[Dict]:
        """Vector-search announcement bodies similar to *query_text*.

        Returns a list of dicts with pblancId, title, a 500-char content
        preview, similarity_score (1 - distance), and any decodable
        eligibility / support_details metadata; [] on error or when the
        collection is unavailable.
        """
        if not self.collection:
            return []
        try:
            model = get_embedding_model()
            if model:
                query_embedding = model.encode(query_text[:1000]).tolist()
                result = self.collection.query(
                    query_embeddings=[query_embedding],
                    n_results=n_results,
                    include=["metadatas", "documents", "distances"]
                )
            else:
                result = self.collection.query(
                    query_texts=[query_text],
                    n_results=n_results,
                    include=["metadatas", "documents", "distances"]
                )
            items = []
            metadatas = result.get("metadatas", [[]])[0]
            documents = result.get("documents", [[]])[0]
            distances = result.get("distances", [[]])[0]
            for i, meta in enumerate(metadatas):
                item = {
                    "pblancId": safe_str(meta.get("pblancId")),
                    "title": safe_str(meta.get("title")),
                    "content_preview": documents[i][:500] if i < len(documents) else "",
                    # NOTE(review): assumes distances are in [0, 2] (cosine-like);
                    # scores can go negative for other distance metrics.
                    "similarity_score": 1 - (distances[i] if i < len(distances) else 0),
                }
                if meta.get("eligibility"):
                    try:
                        item["eligibility"] = json.loads(meta["eligibility"])
                    except Exception:  # was bare except; truncated JSON is expected
                        pass
                if meta.get("support_details"):
                    try:
                        item["support_details"] = json.loads(meta["support_details"])
                    except Exception:  # was bare except
                        pass
                items.append(item)
            return items
        except Exception as e:
            logger.error(f"Vector search error: {e}")
            return []

    def get_status(self) -> Dict:
        """Return a snapshot of the indexing status for UI/monitoring."""
        return {
            "total_indexed": self.get_indexed_count(),
            "last_index_time": self.index_status.get("last_index_time"),
            "in_progress": self.index_status.get("in_progress", False),
            "progress_current": self.index_status.get("progress_current", 0),
            "progress_total": self.index_status.get("progress_total", 0),
            "failed_count": len(self.index_status.get("failed_ids", []))
        }
498
  _cache_instance = None
499
+ _content_cache_instance = None
 
500
def get_cache() -> AnnouncementCache:
    """Singleton accessor for the announcement metadata cache (lazy init)."""
    global _cache_instance
    if _cache_instance is None:
        _cache_instance = AnnouncementCache()
    return _cache_instance
506
def get_content_cache() -> ContentVectorCache:
    """Singleton accessor for the body-text vector cache (lazy init)."""
    global _content_cache_instance
    if _content_cache_instance is None:
        _content_cache_instance = ContentVectorCache()
    return _content_cache_instance
512
+ _indexing_thread = None
513
+ _indexing_stop_flag = False
514
def background_content_indexer():
    """Download, extract and index announcement attachment text in the background.

    Walks every cached announcement that has a printable attachment, skips
    the ones already in the content vector cache, and indexes the rest one
    by one so the web service keeps serving while this runs. Progress and
    failures are persisted via the content cache's index status.

    Fix vs. previous version: failed ids are recorded at most once (the old
    code appended duplicates on every run, inflating ``failed_count``
    without bound), and an id that later succeeds is removed from the
    failure list.
    """
    global _indexing_stop_flag
    # Local import to avoid a circular dependency at module load time.
    from file_api import download_file, extract_text_from_file

    content_cache = get_content_cache()
    meta_cache = get_cache()

    if content_cache.index_status.get("in_progress"):
        logger.info("Content indexing already in progress")
        return

    content_cache.index_status["in_progress"] = True
    content_cache._save_index_status()

    def _mark_failed(pblanc_id):
        # Record a failed id exactly once; duplicates would skew failed_count.
        failed = content_cache.index_status.setdefault("failed_ids", [])
        if pblanc_id not in failed:
            failed.append(pblanc_id)

    try:
        all_items = meta_cache.get_all()
        indexed_ids = content_cache.get_indexed_ids()

        # Collect items that have a downloadable attachment and no index entry yet.
        items_to_index = []
        for item in all_items:
            pblanc_id = safe_str(item.get("pblancId") or item.get("seq"))
            print_url = safe_str(item.get("printFlpthNm"))
            print_name = safe_str(item.get("printFileNm"))
            if pblanc_id and print_url and print_name and pblanc_id not in indexed_ids:
                items_to_index.append({
                    "pblancId": pblanc_id,
                    "title": safe_str(item.get("title") or item.get("pblancNm")),
                    "print_url": print_url,
                    "print_name": print_name,
                })

        total = len(items_to_index)
        content_cache.index_status["progress_total"] = total
        content_cache.index_status["progress_current"] = 0
        content_cache._save_index_status()
        logger.info(f"Starting background content indexing: {total} items to process")

        for i, item in enumerate(items_to_index):
            if _indexing_stop_flag:
                logger.info("Content indexing stopped by flag")
                break

            pblanc_id = item["pblancId"]
            title = item["title"]
            try:
                with tempfile.TemporaryDirectory() as tmp_dir:
                    file_path, error = download_file(item["print_url"], tmp_dir, item["print_name"])
                    if error or not file_path:
                        logger.warning(f"Download failed for {pblanc_id}: {error}")
                        _mark_failed(pblanc_id)
                        continue

                    text, err = extract_text_from_file(file_path)
                    # Require a minimum amount of text; tiny extracts are noise.
                    if not text or len(text) < 100:
                        logger.warning(f"Text extraction failed for {pblanc_id}: {err}")
                        _mark_failed(pblanc_id)
                        continue

                    if content_cache.add_content(pblanc_id, title, text):
                        logger.info(f"Indexed [{i+1}/{total}]: {title[:30]}...")
                        # A later success clears any earlier failure record.
                        failed = content_cache.index_status.get("failed_ids", [])
                        if pblanc_id in failed:
                            failed.remove(pblanc_id)
                    else:
                        logger.warning(f"Failed to add content for {pblanc_id}")
            except Exception as e:
                logger.error(f"Error indexing {pblanc_id}: {e}")
                _mark_failed(pblanc_id)

            content_cache.index_status["progress_current"] = i + 1
            if (i + 1) % 10 == 0:
                # Persist progress periodically so a restart can resume cleanly.
                content_cache._save_index_status()
            time.sleep(0.5)  # throttle: be gentle to the remote file server

        logger.info(f"Content indexing complete: {content_cache.get_indexed_count()} total indexed")
    except Exception as e:
        logger.error(f"Background indexing error: {e}")
        import traceback
        logger.error(traceback.format_exc())
    finally:
        content_cache.index_status["in_progress"] = False
        content_cache._save_index_status()
586
def start_background_indexing():
    """Launch the content indexer in a daemon thread.

    Returns False if an indexing thread is already alive, True otherwise.
    """
    global _indexing_thread, _indexing_stop_flag
    if _indexing_thread is not None and _indexing_thread.is_alive():
        logger.info("Background indexing already running")
        return False
    _indexing_stop_flag = False
    worker = threading.Thread(target=background_content_indexer, daemon=True)
    _indexing_thread = worker
    worker.start()
    logger.info("Background content indexing thread started")
    return True
597
def stop_background_indexing():
    """Request that the background indexing loop stop at its next check."""
    global _indexing_stop_flag
    _indexing_stop_flag = True
    logger.info("Background indexing stop requested")
602
def sync_from_api() -> Tuple[int, int, str]:
    """Fetch announcements from the API and upsert them into the cache.

    Returns (added, updated, status_message). On success a sync record is
    appended to the metadata history (capped at the last 100 entries), and
    background content indexing is kicked off when new items arrived.
    """
    from file_api import fetch_all_from_api

    cache = get_cache()
    sync_time = datetime.now(KST)
    logger.info(f"Starting sync at {sync_time.strftime('%Y-%m-%d %H:%M:%S')} KST")

    try:
        items, error = fetch_all_from_api(category="전체", region="전체(지역)", keyword="")

        if error and not items:
            msg = f"❌ API 오류: {error}"
            logger.error(msg)
            return 0, 0, msg
        if not items:
            msg = "⚠️ API에서 데이터를 가져올 수 없습니다."
            logger.warning(msg)
            return 0, 0, msg

        added, updated = cache.bulk_upsert(items)

        sync_record = {
            "timestamp": sync_time.isoformat(),
            "api_count": len(items),
            "added": added,
            "updated": updated,
            "total_cached": cache.get_count(),
        }
        history = cache.metadata.setdefault("sync_history", [])
        history.append(sync_record)
        cache.metadata["sync_history"] = history[-100:]  # keep last 100 runs only
        cache._save_metadata()

        msg = f"✅ 동기화 완료: API {len(items)}건 → 신규 {added}건, 업데이트 {updated}건 (총 {cache.get_count()}건)"
        logger.info(msg)
        if added > 0:
            # New announcements may carry attachments → refresh the content index.
            start_background_indexing()
        return added, updated, msg

    except Exception as e:
        import traceback
        logger.error(f"Sync error: {e}")
        logger.error(traceback.format_exc())
        msg = f"❌ 동기화 오류: {str(e)}"
        return 0, 0, msg
 
 
640
def get_cached_announcements() -> Tuple[List[Dict], str]:
    """Return (items, status_message) from the cache, syncing first if empty."""
    cache = get_cache()
    if cache.get_count() == 0:
        # Cold start: populate the cache before serving anything.
        logger.info("Cache empty, performing initial sync...")
        added, updated, msg = sync_from_api()
        if added == 0 and updated == 0:
            return [], msg
    items = cache.get_all()
    last_sync = cache.metadata.get("last_sync", "알 수 없음")
    status = f"📦 캐시에서 {len(items)}건 로드 (마지막 동기화: {last_sync})"
    return items, status
 
 
653
def get_sync_status() -> Dict:
    """Return combined metadata-cache and content-index status for display."""
    cache = get_cache()
    content_status = get_content_cache().get_status()
    return {
        "total_count": cache.get_count(),
        "last_sync": cache.metadata.get("last_sync"),
        # NOTE(review): one context line was elided from the diff view here
        # (an additional status field) — confirm against the original file.
        "db_path": str(DB_PATH),
        "chromadb_available": CHROMADB_AVAILABLE,
        "scheduler_available": SCHEDULER_AVAILABLE,
        "content_indexed": content_status["total_indexed"],
        "content_indexing_in_progress": content_status["in_progress"],
        "content_progress": f"{content_status['progress_current']}/{content_status['progress_total']}",
        "content_failed_count": content_status["failed_count"],
        "embedding_available": EMBEDDING_AVAILABLE,
    }
 
 
 
 
 
671
# Module-level BackgroundScheduler instance; None until start_scheduler() runs.
_scheduler = None
 
 
672
def start_scheduler():
    """Start the background scheduler with daily syncs at 10:00 and 22:00 KST.

    Returns True when the scheduler is running afterwards, False when
    APScheduler is unavailable or startup failed. Idempotent: a second call
    while running is a no-op that returns True.
    """
    global _scheduler
    if not SCHEDULER_AVAILABLE:
        logger.warning("Scheduler not available")
        return False
    if _scheduler is not None:
        logger.info("Scheduler already running")
        return True
    try:
        _scheduler = BackgroundScheduler(timezone=KST)
        # NOTE(review): the job-id kwargs were elided from the diff view;
        # confirm the exact ids against the original file.
        _scheduler.add_job(
            sync_from_api,
            CronTrigger(hour=10, minute=0, timezone=KST),
            id='sync_morning',
            name='Daily sync at 10:00 KST',
            replace_existing=True,
        )
        _scheduler.add_job(
            sync_from_api,
            CronTrigger(hour=22, minute=0, timezone=KST),
            id='sync_evening',
            name='Daily sync at 22:00 KST',
            replace_existing=True,
        )
        _scheduler.start()
        logger.info("Scheduler started: sync at 10:00 and 22:00 KST")
        return True
    except Exception as e:
        logger.error(f"Scheduler start error: {e}")
        return False
 
 
703
def stop_scheduler():
    """Shut down the background scheduler if one is running."""
    global _scheduler
    if _scheduler is not None:
        _scheduler.shutdown()
        _scheduler = None
        logger.info("Scheduler stopped")
 
 
710
def manual_sync() -> str:
    """Run a sync right now and return its human-readable status message."""
    _, _, msg = sync_from_api()
    return msg
 
 
 
 
 
714
def initialize_cache_system():
    """Initialize caches and background jobs at app startup.

    Performs an initial sync when the metadata cache is empty, then starts
    the cron scheduler and the background content indexer. Returns the
    combined status dict from get_sync_status().
    """
    logger.info("Initializing cache system...")

    cache = get_cache()
    cached = cache.get_count()
    if cached == 0:
        logger.info("Cache is empty, performing initial sync...")
        sync_from_api()
    else:
        logger.info(f"Cache loaded with {cached} announcements")

    content_count = get_content_cache().get_indexed_count()
    logger.info(f"Content cache: {content_count} indexed")

    start_scheduler()
    start_background_indexing()
    return get_sync_status()
 
 
730
  if __name__ == "__main__":
 
731
  print("Testing cache system...")
732
  status = initialize_cache_system()
733
  print(f"Status: {json.dumps(status, ensure_ascii=False, indent=2)}")