"""
๊ณผ์ œ ๊ณต๊ณ  ๋ฒกํ„ฐ DB ์บ์‹œ ์‹œ์Šคํ…œ
- ChromaDB๋ฅผ ์‚ฌ์šฉํ•œ ๋กœ์ปฌ ์บ์‹œ
- ๋ฉ”ํƒ€ ์บ์‹œ + ๋ณธ๋ฌธ ๋ฒกํ„ฐ ์บ์‹œ (2๊ฐœ ์ปฌ๋ ‰์…˜)
- ์‚ฌ์šฉ์ž ํ”„๋กœํ•„ ์ €์žฅ (์ด๋ฉ”์ผ ๊ธฐ๋ฐ˜)
- ๋งค์ผ KST 10:00, 22:00 ์ž๋™ ๋™๊ธฐํ™”
- ๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ณธ๋ฌธ ์ธ๋ฑ์‹ฑ (์„œ๋น„์Šค ๋ฌด์ค‘๋‹จ)
- Hugging Face Space ์˜๊ตฌ ์Šคํ† ๋ฆฌ์ง€ ํ™œ์šฉ (/data)
"""
import os
import json
import hashlib
import threading
import logging
import tempfile
import time
from datetime import datetime, timedelta
from typing import List, Dict, Tuple, Optional, Generator
from pathlib import Path
import pytz
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
PERSISTENT_DIR = Path("/data") if os.path.exists("/data") else Path("./data")
CACHE_DIR = PERSISTENT_DIR / "announcement_cache"
DB_PATH = CACHE_DIR / "chroma_db"
METADATA_FILE = CACHE_DIR / "sync_metadata.json"
CONTENT_INDEX_FILE = CACHE_DIR / "content_index_status.json"
PROFILE_DIR = CACHE_DIR / "user_profiles"
CACHE_DIR.mkdir(parents=True, exist_ok=True)
PROFILE_DIR.mkdir(parents=True, exist_ok=True)
try:
import chromadb
from chromadb.config import Settings
CHROMADB_AVAILABLE = True
except ImportError:
CHROMADB_AVAILABLE = False
logger.warning("ChromaDB not available. Using JSON fallback.")
try:
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
SCHEDULER_AVAILABLE = True
except ImportError:
SCHEDULER_AVAILABLE = False
logger.warning("APScheduler not available. Auto-sync disabled.")
try:
from sentence_transformers import SentenceTransformer
EMBEDDING_AVAILABLE = True
except ImportError:
EMBEDDING_AVAILABLE = False
logger.warning("sentence-transformers not available. Using ChromaDB default embedding.")
KST = pytz.timezone('Asia/Seoul')
_embedding_model = None
def get_embedding_model():
"""ํ•œ๊ตญ์–ด ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ (์‹ฑ๊ธ€ํ†ค)"""
global _embedding_model
if _embedding_model is None and EMBEDDING_AVAILABLE:
try:
_embedding_model = SentenceTransformer('jhgan/ko-sroberta-multitask')
logger.info("Loaded Korean embedding model: jhgan/ko-sroberta-multitask")
except Exception as e:
logger.error(f"Failed to load embedding model: {e}")
return _embedding_model
class AnnouncementCache:
"""๋ฉ”ํƒ€ ์ •๋ณด ์บ์‹œ (ChromaDB ๊ธฐ๋ฐ˜)"""
def __init__(self):
self.client = None
self.collection = None
self._init_db()
def _init_db(self):
"""ChromaDB ์ดˆ๊ธฐํ™”"""
if not CHROMADB_AVAILABLE:
return
try:
self.client = chromadb.PersistentClient(path=str(DB_PATH))
self.collection = self.client.get_or_create_collection(
name="announcements",
metadata={"hnsw:space": "cosine"}
)
logger.info(f"ChromaDB initialized at {DB_PATH}")
except Exception as e:
logger.error(f"ChromaDB init error: {e}")
self.client = None
self.collection = None
def _generate_id(self, item: dict) -> str:
"""๊ณต๊ณ  ๊ณ ์œ  ID ์ƒ์„ฑ"""
pblanc_id = item.get("pblancId") or item.get("seq") or ""
title = item.get("title") or item.get("pblancNm") or ""
unique_str = f"{pblanc_id}_{title}"
return hashlib.md5(unique_str.encode()).hexdigest()
def _item_to_document(self, item: dict) -> str:
"""๊ณต๊ณ  ์ •๋ณด๋ฅผ ๊ฒ€์ƒ‰์šฉ ๋ฌธ์„œ๋กœ ๋ณ€ํ™˜"""
parts = []
title = item.get("title") or item.get("pblancNm") or ""
if title:
parts.append(title)
description = item.get("description") or item.get("bsnsSumryCn") or ""
if description:
import re
description = re.sub(r'<[^>]+>', '', description).strip()
parts.append(description[:500])
author = item.get("author") or item.get("jrsdInsttNm") or ""
if author:
parts.append(author)
category = item.get("lcategory") or item.get("pldirSportRealmLclasCodeNm") or ""
if category:
parts.append(category)
hash_tags = item.get("hashTags") or ""
if hash_tags:
parts.append(hash_tags)
target = item.get("trgetNm") or ""
if target:
parts.append(target)
return " ".join(parts)
def _item_to_metadata(self, item: dict) -> dict:
"""๊ณต๊ณ  ์ •๋ณด๋ฅผ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ๋กœ ๋ณ€ํ™˜"""
def safe_str(val):
if val is None:
return ""
return str(val)[:500]
return {
"pblancId": safe_str(item.get("pblancId") or item.get("seq")),
"title": safe_str(item.get("title") or item.get("pblancNm")),
"author": safe_str(item.get("author") or item.get("jrsdInsttNm")),
"category": safe_str(item.get("lcategory") or item.get("pldirSportRealmLclasCodeNm")),
"reqstDt": safe_str(item.get("reqstDt") or item.get("reqstBeginEndDe")),
"pubDate": safe_str(item.get("pubDate") or item.get("creatPnttm")),
"link": safe_str(item.get("link") or item.get("pblancUrl")),
"hashTags": safe_str(item.get("hashTags")),
"target": safe_str(item.get("trgetNm")),
"excInsttNm": safe_str(item.get("excInsttNm")),
"fileUrl": safe_str(item.get("flpthNm")),
"fileName": safe_str(item.get("fileNm")),
"printFileUrl": safe_str(item.get("printFlpthNm")),
"printFileName": safe_str(item.get("printFileNm")),
"description": safe_str(item.get("description") or item.get("bsnsSumryCn")),
}
def bulk_upsert(self, items: List[dict]) -> Tuple[int, int]:
"""๋Œ€๋Ÿ‰ ๋ฐ์ดํ„ฐ ์‚ฝ์ž…/์—…๋ฐ์ดํŠธ"""
if not self.collection or not items:
return 0, 0
added = 0
updated = 0
batch_size = 100
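        # Each batch of 100 gets its own upsert call and try/except below, so one
        # failing ChromaDB call only loses that batch instead of the whole sync.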
for i in range(0, len(items), batch_size):
batch = items[i:i+batch_size]
ids = []
documents = []
metadatas = []
for item in batch:
doc_id = self._generate_id(item)
ids.append(doc_id)
documents.append(self._item_to_document(item))
metadatas.append(self._item_to_metadata(item))
try:
existing = self.collection.get(ids=ids)
existing_ids = set(existing['ids']) if existing['ids'] else set()
self.collection.upsert(
ids=ids,
documents=documents,
metadatas=metadatas
)
for doc_id in ids:
if doc_id in existing_ids:
updated += 1
else:
added += 1
logger.info(f"Upserted batch {i//batch_size + 1}: {len(batch)} items")
except Exception as e:
logger.error(f"Batch upsert error: {e}")
logger.info(f"Bulk upsert complete: {added} added, {updated} updated")
return added, updated
def search(self, query: str, n_results: int = 20, filters: dict = None) -> List[dict]:
"""์‹œ๋งจํ‹ฑ ๊ฒ€์ƒ‰"""
if not self.collection:
return []
try:
where_filter = None
if filters:
conditions = []
if filters.get("category") and filters["category"] != "์ „์ฒด":
conditions.append({"category": {"$eq": filters["category"]}})
if filters.get("author"):
conditions.append({"author": {"$contains": filters["author"]}})
if conditions:
where_filter = {"$and": conditions} if len(conditions) > 1 else conditions[0]
results = self.collection.query(
query_texts=[query],
n_results=n_results,
where=where_filter,
include=["documents", "metadatas", "distances"]
)
items = []
if results and results['ids'] and results['ids'][0]:
for i, doc_id in enumerate(results['ids'][0]):
metadata = results['metadatas'][0][i] if results['metadatas'] else {}
distance = results['distances'][0][i] if results['distances'] else 0
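                    # The collection uses cosine distance, so 1 - distance gives a similarity score.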
item = {
"id": doc_id,
"similarity_score": 1 - distance,
**metadata
}
items.append(item)
return items
except Exception as e:
logger.error(f"Search error: {e}")
return []
def get_all(self, limit: int = 1000) -> List[dict]:
"""์ „์ฒด ๋ฐ์ดํ„ฐ ์กฐํšŒ"""
if not self.collection:
return []
try:
results = self.collection.get(
limit=limit,
include=["metadatas"]
)
items = []
if results and results['ids']:
for i, doc_id in enumerate(results['ids']):
metadata = results['metadatas'][i] if results['metadatas'] else {}
items.append({"id": doc_id, **metadata})
return items
except Exception as e:
logger.error(f"Get all error: {e}")
return []
def get_count(self) -> int:
"""์ด ๋ฐ์ดํ„ฐ ์ˆ˜"""
if not self.collection:
return 0
try:
return self.collection.count()
except:
return 0
def clear(self):
"""์บ์‹œ ์ดˆ๊ธฐํ™”"""
if self.client:
try:
self.client.delete_collection("announcements")
self._init_db()
logger.info("Cache cleared")
except Exception as e:
logger.error(f"Clear error: {e}")
class ContentVectorCache:
"""๋ณธ๋ฌธ ๋ฒกํ„ฐ ์บ์‹œ (ํ•œ๊ตญ์–ด ์ž„๋ฒ ๋”ฉ)"""
def __init__(self, shared_client=None):
self.client = shared_client
self.collection = None
self.embedding_model = None
self._index_status = {"in_progress": False, "current": 0, "total": 0, "failed": []}
self._load_index_status()
def _load_index_status(self):
"""์ธ๋ฑ์‹ฑ ์ƒํƒœ ๋กœ๋“œ"""
if CONTENT_INDEX_FILE.exists():
try:
with open(CONTENT_INDEX_FILE, 'r', encoding='utf-8') as f:
saved = json.load(f)
self._index_status["failed"] = saved.get("failed", [])
except:
pass
def _save_index_status(self):
"""์ธ๋ฑ์‹ฑ ์ƒํƒœ ์ €์žฅ"""
try:
with open(CONTENT_INDEX_FILE, 'w', encoding='utf-8') as f:
json.dump({
"failed": self._index_status["failed"][-100:],
"last_updated": datetime.now(KST).isoformat()
}, f, ensure_ascii=False)
except:
pass
def _init_db(self):
"""๋ณธ๋ฌธ ๋ฒกํ„ฐ DB ์ดˆ๊ธฐํ™” (๋ฉ”ํƒ€ ์บ์‹œ์™€ ํด๋ผ์ด์–ธํŠธ ๊ณต์œ )"""
if not CHROMADB_AVAILABLE:
return
try:
if self.client is None:
meta_cache = get_cache()
self.client = meta_cache.client
if self.client is None:
logger.error("No ChromaDB client available for content cache")
return
self.collection = self.client.get_or_create_collection(
name="announcement_contents",
metadata={"hnsw:space": "cosine"}
)
logger.info("Content VectorDB initialized (shared client)")
except Exception as e:
logger.error(f"Content VectorDB init error: {e}")
def _get_embedding(self, text: str) -> Optional[List[float]]:
"""ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ"""
if self.embedding_model is None:
self.embedding_model = get_embedding_model()
if self.embedding_model is None:
return None
try:
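            # Truncate very long texts up front; the sentence-transformers model also
            # truncates to its own max sequence length, so this mainly bounds preprocessing.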
text = text[:8000]
embedding = self.embedding_model.encode(text, show_progress_bar=False)
return embedding.tolist()
except Exception as e:
logger.error(f"Embedding error: {e}")
return None
def add_content(self, pblanc_id: str, title: str, content: str) -> bool:
"""๋ณธ๋ฌธ ์ฝ˜ํ…์ธ  ์ถ”๊ฐ€"""
if not self.collection:
self._init_db()
if not self.collection:
return False
try:
embedding = self._get_embedding(content)
if embedding is None:
return False
doc_id = f"content_{pblanc_id}"
self.collection.upsert(
ids=[doc_id],
embeddings=[embedding],
documents=[content[:5000]],
metadatas=[{
"pblancId": pblanc_id,
"title": title[:200],
"content_preview": content[:500],
"indexed_at": datetime.now(KST).isoformat()
}]
)
return True
except Exception as e:
logger.error(f"Add content error: {e}")
return False
def search_similar(self, query: str, n_results: int = 20) -> List[dict]:
"""์œ ์‚ฌ ๋ณธ๋ฌธ ๊ฒ€์ƒ‰"""
if not self.collection:
self._init_db()
if not self.collection:
return []
try:
embedding = self._get_embedding(query)
if embedding is None:
return []
results = self.collection.query(
query_embeddings=[embedding],
n_results=n_results,
include=["documents", "metadatas", "distances"]
)
items = []
if results and results['ids'] and results['ids'][0]:
for i, doc_id in enumerate(results['ids'][0]):
metadata = results['metadatas'][0][i] if results['metadatas'] else {}
distance = results['distances'][0][i] if results['distances'] else 0
items.append({
"id": doc_id,
"similarity_score": 1 - distance,
"pblancId": metadata.get("pblancId", ""),
"title": metadata.get("title", ""),
"content_preview": metadata.get("content_preview", ""),
})
return items
except Exception as e:
logger.error(f"Content search error: {e}")
return []
def get_indexed_ids(self) -> set:
"""์ธ๋ฑ์‹ฑ๋œ ๊ณต๊ณ  ID ๋ชฉ๋ก"""
if not self.collection:
self._init_db()
if not self.collection:
return set()
try:
results = self.collection.get(include=["metadatas"])
ids = set()
if results:
                # 1. Extract pblancId from the metadata
if results.get('metadatas'):
for meta in results['metadatas']:
if meta and meta.get("pblancId"):
ids.add(meta["pblancId"])
                # 2. Also extract from the doc_id (content_XXX format)
if results.get('ids'):
for doc_id in results['ids']:
if doc_id and doc_id.startswith("content_"):
pblanc_id = doc_id.replace("content_", "", 1)
if pblanc_id:
ids.add(pblanc_id)
logger.info(f"Found {len(ids)} indexed content IDs")
return ids
except Exception as e:
logger.error(f"Get indexed IDs error: {e}")
return set()
def get_indexed_count(self) -> int:
"""์ธ๋ฑ์‹ฑ๋œ ์ฝ˜ํ…์ธ  ์ˆ˜"""
if not self.collection:
self._init_db()
if not self.collection:
return 0
try:
return self.collection.count()
except:
return 0
def get_status(self) -> dict:
"""์ธ๋ฑ์‹ฑ ์ƒํƒœ"""
return {
"total_indexed": self.get_indexed_count(),
"in_progress": self._index_status["in_progress"],
"progress_current": self._index_status["current"],
"progress_total": self._index_status["total"],
"failed_count": len(self._index_status["failed"])
}
class UserProfileCache:
"""์‚ฌ์šฉ์ž ํ”„๋กœํ•„ ์ €์žฅ/๋กœ๋“œ ํด๋ž˜์Šค (์ด๋ฉ”์ผ ๊ธฐ๋ฐ˜)"""
def __init__(self):
self.profiles_file = PROFILE_DIR / "profiles.json"
self.profiles = {}
self._load_profiles()
def _load_profiles(self):
"""ํ”„๋กœํ•„ ๋ฐ์ดํ„ฐ ๋กœ๋“œ"""
if self.profiles_file.exists():
try:
with open(self.profiles_file, 'r', encoding='utf-8') as f:
self.profiles = json.load(f)
logger.info(f"Loaded {len(self.profiles)} user profiles")
except Exception as e:
logger.error(f"Profile load error: {e}")
self.profiles = {}
def _save_profiles(self):
"""ํ”„๋กœํ•„ ๋ฐ์ดํ„ฐ ์ €์žฅ"""
try:
with open(self.profiles_file, 'w', encoding='utf-8') as f:
json.dump(self.profiles, f, ensure_ascii=False, indent=2)
except Exception as e:
logger.error(f"Profile save error: {e}")
def _normalize_email(self, email: str) -> str:
"""์ด๋ฉ”์ผ ์ •๊ทœํ™” (์†Œ๋ฌธ์ž ๋ณ€ํ™˜, ๊ณต๋ฐฑ ์ œ๊ฑฐ)"""
return email.strip().lower()
def save_profile(self, email: str, profile_data: dict) -> Tuple[bool, str]:
"""ํ”„๋กœํ•„ ์ €์žฅ"""
if not email or '@' not in email:
            return False, "Please enter a valid email address."
email_key = self._normalize_email(email)
self.profiles[email_key] = {
"email": email_key,
"profile": profile_data,
"updated_at": datetime.now(KST).isoformat(),
"created_at": self.profiles.get(email_key, {}).get("created_at", datetime.now(KST).isoformat())
}
self._save_profiles()
logger.info(f"Profile saved for: {email_key}")
        return True, f"✅ Profile saved. ({email_key})"
def load_profile(self, email: str) -> Tuple[Optional[dict], str]:
"""ํ”„๋กœํ•„ ๋กœ๋“œ"""
if not email or '@' not in email:
            return None, "Please enter a valid email address."
email_key = self._normalize_email(email)
if email_key in self.profiles:
data = self.profiles[email_key]
logger.info(f"Profile loaded for: {email_key}")
            return data.get("profile", {}), f"✅ Profile loaded. (last updated: {data.get('updated_at', 'unknown')[:10]})"
        return None, f"ℹ️ No profile saved for '{email_key}'. Please enter your information."
def delete_profile(self, email: str) -> Tuple[bool, str]:
"""ํ”„๋กœํ•„ ์‚ญ์ œ"""
email_key = self._normalize_email(email)
if email_key in self.profiles:
del self.profiles[email_key]
self._save_profiles()
            return True, f"✅ Profile deleted. ({email_key})"
        return False, "⚠️ No profile found to delete."
def get_profile_count(self) -> int:
"""์ €์žฅ๋œ ํ”„๋กœํ•„ ์ˆ˜"""
return len(self.profiles)
_cache_instance = None
_content_cache_instance = None
_profile_cache_instance = None
_scheduler = None
def get_cache() -> AnnouncementCache:
"""๋ฉ”ํƒ€ ์บ์‹œ ์‹ฑ๊ธ€ํ†ค"""
global _cache_instance
if _cache_instance is None:
_cache_instance = AnnouncementCache()
return _cache_instance
def get_content_cache() -> ContentVectorCache:
"""๋ณธ๋ฌธ ์บ์‹œ ์‹ฑ๊ธ€ํ†ค (๋ฉ”ํƒ€ ์บ์‹œ์™€ ํด๋ผ์ด์–ธํŠธ ๊ณต์œ )"""
global _content_cache_instance
if _content_cache_instance is None:
meta_cache = get_cache()
_content_cache_instance = ContentVectorCache(shared_client=meta_cache.client)
return _content_cache_instance
def get_profile_cache() -> UserProfileCache:
"""ํ”„๋กœํ•„ ์บ์‹œ ์‹ฑ๊ธ€ํ†ค"""
global _profile_cache_instance
if _profile_cache_instance is None:
_profile_cache_instance = UserProfileCache()
return _profile_cache_instance
def save_sync_metadata(api_count: int, added: int, updated: int, total: int):
"""๋™๊ธฐํ™” ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ"""
try:
existing = {}
if METADATA_FILE.exists():
with open(METADATA_FILE, 'r', encoding='utf-8') as f:
existing = json.load(f)
history = existing.get("sync_history", [])
history.append({
"timestamp": datetime.now(KST).isoformat(),
"api_count": api_count,
"added": added,
"updated": updated,
"total_cached": total
})
history = history[-50:]
metadata = {
"last_sync": datetime.now(KST).isoformat(),
"total_count": total,
"sync_history": history
}
with open(METADATA_FILE, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)
except Exception as e:
logger.error(f"Save metadata error: {e}")
def get_sync_status() -> dict:
"""๋™๊ธฐํ™” ์ƒํƒœ ์กฐํšŒ"""
cache = get_cache()
content_cache = get_content_cache()
content_status = content_cache.get_status()
status = {
"total_count": cache.get_count(),
"last_sync": None,
"sync_history": [],
"db_path": str(DB_PATH),
"chromadb_available": CHROMADB_AVAILABLE,
"scheduler_available": SCHEDULER_AVAILABLE,
"content_indexed": content_status["total_indexed"],
"content_indexing_in_progress": content_status["in_progress"],
"content_progress": f"{content_status['progress_current']}/{content_status['progress_total']}",
"content_failed_count": content_status["failed_count"],
"embedding_available": EMBEDDING_AVAILABLE
}
if METADATA_FILE.exists():
try:
with open(METADATA_FILE, 'r', encoding='utf-8') as f:
metadata = json.load(f)
status["last_sync"] = metadata.get("last_sync")
status["sync_history"] = metadata.get("sync_history", [])[-5:]
except:
pass
return status
def sync_from_api() -> Tuple[int, int, str]:
"""API์—์„œ ๋ฐ์ดํ„ฐ ๋™๊ธฐํ™”"""
try:
from file_api import fetch_all_from_api
logger.info(f"Starting sync at {datetime.now(KST).strftime('%Y-%m-%d %H:%M:%S %Z')}")
result = fetch_all_from_api()
        # fetch_all_from_api returns an (items, error_msg) tuple
if isinstance(result, tuple):
items, error_msg = result
if error_msg and not items:
return 0, 0, f"API ์˜ค๋ฅ˜: {error_msg}"
else:
items = result
if not items:
return 0, 0, "API์—์„œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ€์ ธ์˜ค์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
cache = get_cache()
added, updated = cache.bulk_upsert(items)
total = cache.get_count()
save_sync_metadata(len(items), added, updated, total)
        msg = f"✅ Sync complete: {len(items)} from API → {added} new, {updated} updated ({total} total cached)"
logger.info(msg)
start_background_indexing()
return added, updated, msg
except Exception as e:
logger.error(f"Sync error: {e}")
        return 0, 0, f"Sync error: {str(e)}"
def get_cached_announcements(category: str = "전체", region: str = "전체(지역)", keyword: str = "") -> Tuple[List[dict], str]:
    """Fetch announcements from the cache - returns an (items, status_msg) tuple."""
cache = get_cache()
if keyword:
items = cache.search(keyword, n_results=100)
        return items, f"🔍 Search results for '{keyword}'"
else:
items = cache.get_all(limit=1000)
        return items, f"⚡ Loaded {len(items)} items from cache"
_indexing_thread = None
_indexing_lock = threading.Lock()
def background_index_contents():
"""๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ณธ๋ฌธ ์ธ๋ฑ์‹ฑ"""
content_cache = get_content_cache()
if content_cache._index_status["in_progress"]:
logger.info("Background indexing already running")
return
content_cache._index_status["in_progress"] = True
try:
from file_api import fetch_announcement_detail, download_file, extract_text_from_file
cache = get_cache()
all_items = cache.get_all(limit=2000)
indexed_ids = content_cache.get_indexed_ids()
def safe_str(val):
return str(val) if val else ""
def is_ongoing_check(req_dt: str) -> bool:
"""๋งˆ๊ฐ์ผ ํ™•์ธ"""
if not req_dt:
return True
try:
import re
date_patterns = [
r'(\d{4})[.\-/](\d{1,2})[.\-/](\d{1,2})',
                    r'(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일'
]
dates_found = []
for pattern in date_patterns:
matches = re.findall(pattern, req_dt)
for m in matches:
try:
d = datetime(int(m[0]), int(m[1]), int(m[2]))
dates_found.append(d)
except:
continue
if dates_found:
end_date = max(dates_found)
today = datetime.now()
return end_date >= today
return True
except:
return True
items_to_index = []
skipped_expired = 0
skipped_indexed = 0
for item in all_items:
pblanc_id = safe_str(item.get("pblancId") or item.get("seq"))
if not pblanc_id:
continue
if pblanc_id in indexed_ids:
skipped_indexed += 1
continue
req_dt = safe_str(item.get("reqstDt") or item.get("reqstBeginEndDe"))
if not is_ongoing_check(req_dt):
skipped_expired += 1
continue
items_to_index.append(item)
total = len(items_to_index)
content_cache._index_status["total"] = total
content_cache._index_status["current"] = 0
logger.info(f"Starting background content indexing: {total} items (skipped {skipped_indexed} already indexed, {skipped_expired} expired)")
for i, item in enumerate(items_to_index):
if not content_cache._index_status["in_progress"]:
break
content_cache._index_status["current"] = i + 1
pblanc_id = safe_str(item.get("pblancId") or item.get("seq"))
title = safe_str(item.get("title") or item.get("pblancNm"))
print_url = safe_str(item.get("printFileUrl") or item.get("printFlpthNm"))
print_name = safe_str(item.get("printFileName") or item.get("printFileNm"))
content_text = ""
if print_url and print_name:
try:
with tempfile.TemporaryDirectory() as tmp_dir:
file_path, err = download_file(print_url, tmp_dir, print_name)
if file_path and not err:
text, ext_err = extract_text_from_file(file_path)
if text:
content_text = text
except Exception as e:
logger.warning(f"Text extraction failed for {pblanc_id}: {e}")
if not content_text:
link = safe_str(item.get("link") or item.get("pblancUrl"))
if link:
try:
detail_content, _, scraped_print = fetch_announcement_detail(link)
if detail_content:
content_text = detail_content
elif scraped_print and scraped_print.get("url"):
with tempfile.TemporaryDirectory() as tmp_dir:
file_path, err = download_file(
scraped_print["url"], tmp_dir,
scraped_print.get("filename", "file")
)
if file_path and not err:
text, _ = extract_text_from_file(file_path)
if text:
content_text = text
except Exception as e:
pass
if content_text and len(content_text) > 100:
success = content_cache.add_content(pblanc_id, title, content_text)
if success:
logger.info(f"Indexed [{i+1}/{total}]: {title[:50]}...")
else:
content_cache._index_status["failed"].append(pblanc_id)
else:
content_cache._index_status["failed"].append(pblanc_id)
if (i + 1) % 50 == 0:
content_cache._save_index_status()
content_cache._save_index_status()
logger.info(f"Background indexing complete: {content_cache.get_indexed_count()} indexed")
except Exception as e:
logger.error(f"Background indexing error: {e}")
finally:
content_cache._index_status["in_progress"] = False
def start_background_indexing() -> bool:
"""๋ฐฑ๊ทธ๋ผ์šด๋“œ ์ธ๋ฑ์‹ฑ ์‹œ์ž‘"""
global _indexing_thread
with _indexing_lock:
if _indexing_thread and _indexing_thread.is_alive():
logger.info("Background indexing already running")
return False
_indexing_thread = threading.Thread(target=background_index_contents, daemon=True)
_indexing_thread.start()
logger.info("Background content indexing thread started")
return True
def stop_scheduler():
"""์Šค์ผ€์ค„๋Ÿฌ ์ค‘์ง€"""
global _scheduler
if _scheduler:
_scheduler.shutdown()
_scheduler = None
logger.info("Scheduler stopped")
def start_scheduler():
"""์Šค์ผ€์ค„๋Ÿฌ ์‹œ์ž‘"""
global _scheduler
if not SCHEDULER_AVAILABLE:
logger.warning("Scheduler not available")
return False
if _scheduler:
return True
try:
_scheduler = BackgroundScheduler(timezone=KST)
_scheduler.add_job(
sync_from_api,
CronTrigger(hour=10, minute=0, timezone=KST),
id='sync_10am',
name='Daily sync at 10:00 KST',
replace_existing=True
)
_scheduler.add_job(
sync_from_api,
CronTrigger(hour=22, minute=0, timezone=KST),
id='sync_10pm',
name='Daily sync at 22:00 KST',
replace_existing=True
)
_scheduler.start()
logger.info("Scheduler started: sync at 10:00 and 22:00 KST")
return True
except Exception as e:
logger.error(f"Scheduler start error: {e}")
return False
def manual_sync() -> str:
"""์ˆ˜๋™ ๋™๊ธฐํ™” ์‹คํ–‰"""
added, updated, msg = sync_from_api()
return msg
def initialize_cache_system():
"""์บ์‹œ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (์•ฑ ์‹œ์ž‘ ์‹œ ํ˜ธ์ถœ)"""
logger.info("Initializing cache system...")
cache = get_cache()
count = cache.get_count()
if count == 0:
logger.info("Cache is empty, performing initial sync...")
sync_from_api()
else:
logger.info(f"Cache loaded with {count} announcements")
content_cache = get_content_cache()
content_count = content_cache.get_indexed_count()
logger.info(f"Content cache: {content_count} indexed")
profile_cache = get_profile_cache()
logger.info(f"Profile cache: {profile_cache.get_profile_count()} profiles")
start_scheduler()
start_background_indexing()
return get_sync_status()
if __name__ == "__main__":
print("Testing cache system...")
status = initialize_cache_system()
print(f"Status: {json.dumps(status, ensure_ascii=False, indent=2)}")