|
|
""" |
|
|
๊ณผ์ ๊ณต๊ณ ๋ฒกํฐ DB ์บ์ ์์คํ
|
|
|
- ChromaDB๋ฅผ ์ฌ์ฉํ ๋ก์ปฌ ์บ์ |
|
|
- ๋ฉํ ์บ์ + ๋ณธ๋ฌธ ๋ฒกํฐ ์บ์ (2๊ฐ ์ปฌ๋ ์
) |
|
|
- ์ฌ์ฉ์ ํ๋กํ ์ ์ฅ (์ด๋ฉ์ผ ๊ธฐ๋ฐ) |
|
|
- ๋งค์ผ KST 10:00, 22:00 ์๋ ๋๊ธฐํ |
|
|
- ๋ฐฑ๊ทธ๋ผ์ด๋ ๋ณธ๋ฌธ ์ธ๋ฑ์ฑ (์๋น์ค ๋ฌด์ค๋จ) |
|
|
- Hugging Face Space ์๊ตฌ ์คํ ๋ฆฌ์ง ํ์ฉ (/data) |
|
|
""" |
|
|
import os
import json
import hashlib
import threading
import logging
import tempfile
import time
from datetime import datetime, timedelta
from typing import List, Dict, Tuple, Optional, Generator
from pathlib import Path

import pytz

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

PERSISTENT_DIR = Path("/data") if os.path.exists("/data") else Path("./data")
CACHE_DIR = PERSISTENT_DIR / "announcement_cache"
DB_PATH = CACHE_DIR / "chroma_db"
METADATA_FILE = CACHE_DIR / "sync_metadata.json"
CONTENT_INDEX_FILE = CACHE_DIR / "content_index_status.json"
PROFILE_DIR = CACHE_DIR / "user_profiles"

CACHE_DIR.mkdir(parents=True, exist_ok=True)
PROFILE_DIR.mkdir(parents=True, exist_ok=True)
|
|
try:
    import chromadb
    from chromadb.config import Settings
    CHROMADB_AVAILABLE = True
except ImportError:
    CHROMADB_AVAILABLE = False
    logger.warning("ChromaDB not available. Using JSON fallback.")

try:
    from apscheduler.schedulers.background import BackgroundScheduler
    from apscheduler.triggers.cron import CronTrigger
    SCHEDULER_AVAILABLE = True
except ImportError:
    SCHEDULER_AVAILABLE = False
    logger.warning("APScheduler not available. Auto-sync disabled.")

try:
    from sentence_transformers import SentenceTransformer
    EMBEDDING_AVAILABLE = True
except ImportError:
    EMBEDDING_AVAILABLE = False
    logger.warning("sentence-transformers not available. Using ChromaDB default embedding.")

KST = pytz.timezone('Asia/Seoul')
|
|
_embedding_model = None


def get_embedding_model():
    """Load the Korean sentence-embedding model (singleton)."""
    global _embedding_model
    if _embedding_model is None and EMBEDDING_AVAILABLE:
        try:
            _embedding_model = SentenceTransformer('jhgan/ko-sroberta-multitask')
            logger.info("Loaded Korean embedding model: jhgan/ko-sroberta-multitask")
        except Exception as e:
            logger.error(f"Failed to load embedding model: {e}")
    return _embedding_model
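
# Note: the first call downloads jhgan/ko-sroberta-multitask from the Hugging Face Hub;
# later calls reuse the in-process singleton, so the load cost is paid once per process.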
|
|
class AnnouncementCache:
    """Metadata cache for announcements (ChromaDB-backed)."""

    def __init__(self):
        self.client = None
        self.collection = None
        self._init_db()

    def _init_db(self):
        """Initialize ChromaDB."""
        if not CHROMADB_AVAILABLE:
            return
        try:
            self.client = chromadb.PersistentClient(path=str(DB_PATH))
            self.collection = self.client.get_or_create_collection(
                name="announcements",
                metadata={"hnsw:space": "cosine"}
            )
            logger.info(f"ChromaDB initialized at {DB_PATH}")
        except Exception as e:
            logger.error(f"ChromaDB init error: {e}")
            self.client = None
            self.collection = None

    def _generate_id(self, item: dict) -> str:
        """Generate a stable unique ID for an announcement."""
        pblanc_id = item.get("pblancId") or item.get("seq") or ""
        title = item.get("title") or item.get("pblancNm") or ""
        unique_str = f"{pblanc_id}_{title}"
        return hashlib.md5(unique_str.encode()).hexdigest()

    def _item_to_document(self, item: dict) -> str:
        """Convert an announcement into a searchable document string."""
        parts = []

        title = item.get("title") or item.get("pblancNm") or ""
        if title:
            parts.append(title)

        description = item.get("description") or item.get("bsnsSumryCn") or ""
        if description:
            import re
            description = re.sub(r'<[^>]+>', '', description).strip()
            parts.append(description[:500])

        author = item.get("author") or item.get("jrsdInsttNm") or ""
        if author:
            parts.append(author)

        category = item.get("lcategory") or item.get("pldirSportRealmLclasCodeNm") or ""
        if category:
            parts.append(category)

        hash_tags = item.get("hashTags") or ""
        if hash_tags:
            parts.append(hash_tags)

        target = item.get("trgetNm") or ""
        if target:
            parts.append(target)

        return " ".join(parts)
|
|
    def _item_to_metadata(self, item: dict) -> dict:
        """Convert an announcement into a ChromaDB metadata dict."""
        def safe_str(val):
            if val is None:
                return ""
            return str(val)[:500]

        return {
            "pblancId": safe_str(item.get("pblancId") or item.get("seq")),
            "title": safe_str(item.get("title") or item.get("pblancNm")),
            "author": safe_str(item.get("author") or item.get("jrsdInsttNm")),
            "category": safe_str(item.get("lcategory") or item.get("pldirSportRealmLclasCodeNm")),
            "reqstDt": safe_str(item.get("reqstDt") or item.get("reqstBeginEndDe")),
            "pubDate": safe_str(item.get("pubDate") or item.get("creatPnttm")),
            "link": safe_str(item.get("link") or item.get("pblancUrl")),
            "hashTags": safe_str(item.get("hashTags")),
            "target": safe_str(item.get("trgetNm")),
            "excInsttNm": safe_str(item.get("excInsttNm")),
            "fileUrl": safe_str(item.get("flpthNm")),
            "fileName": safe_str(item.get("fileNm")),
            "printFileUrl": safe_str(item.get("printFlpthNm")),
            "printFileName": safe_str(item.get("printFileNm")),
            "description": safe_str(item.get("description") or item.get("bsnsSumryCn")),
        }
|
|
    def bulk_upsert(self, items: List[dict]) -> Tuple[int, int]:
        """Bulk insert/update announcements; returns (added, updated) counts."""
        if not self.collection or not items:
            return 0, 0

        added = 0
        updated = 0
        batch_size = 100

        for i in range(0, len(items), batch_size):
            batch = items[i:i + batch_size]
            ids = []
            documents = []
            metadatas = []

            for item in batch:
                doc_id = self._generate_id(item)
                ids.append(doc_id)
                documents.append(self._item_to_document(item))
                metadatas.append(self._item_to_metadata(item))

            try:
                # collection.get(ids=...) returns only ids that already exist,
                # which lets us split the upsert result into added vs. updated.
                existing = self.collection.get(ids=ids)
                existing_ids = set(existing['ids']) if existing['ids'] else set()

                self.collection.upsert(
                    ids=ids,
                    documents=documents,
                    metadatas=metadatas
                )

                for doc_id in ids:
                    if doc_id in existing_ids:
                        updated += 1
                    else:
                        added += 1

                logger.info(f"Upserted batch {i // batch_size + 1}: {len(batch)} items")
            except Exception as e:
                logger.error(f"Batch upsert error: {e}")

        logger.info(f"Bulk upsert complete: {added} added, {updated} updated")
        return added, updated
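
    # Usage sketch: `added, updated = get_cache().bulk_upsert(items)`, where `items`
    # is the list of announcement dicts returned by file_api.fetch_all_from_api().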
|
|
    def search(self, query: str, n_results: int = 20, filters: dict = None) -> List[dict]:
        """Semantic search over the metadata collection."""
        if not self.collection:
            return []
        try:
            where_filter = None
            if filters:
                conditions = []
                if filters.get("category") and filters["category"] != "전체":
                    conditions.append({"category": {"$eq": filters["category"]}})
                if filters.get("author"):
                    # NOTE: ChromaDB documents `$contains` as a where_document operator;
                    # for metadata filters an exact `$eq` match may be required instead.
                    conditions.append({"author": {"$contains": filters["author"]}})
                if conditions:
                    where_filter = {"$and": conditions} if len(conditions) > 1 else conditions[0]

            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                where=where_filter,
                include=["documents", "metadatas", "distances"]
            )

            items = []
            if results and results['ids'] and results['ids'][0]:
                for i, doc_id in enumerate(results['ids'][0]):
                    metadata = results['metadatas'][0][i] if results['metadatas'] else {}
                    distance = results['distances'][0][i] if results['distances'] else 0
                    item = {
                        "id": doc_id,
                        # Convert cosine distance back to a similarity score.
                        "similarity_score": 1 - distance,
                        **metadata
                    }
                    items.append(item)
            return items
        except Exception as e:
            logger.error(f"Search error: {e}")
            return []
|
|
    def get_all(self, limit: int = 1000) -> List[dict]:
        """Return all cached items (metadata only)."""
        if not self.collection:
            return []
        try:
            results = self.collection.get(
                limit=limit,
                include=["metadatas"]
            )
            items = []
            if results and results['ids']:
                for i, doc_id in enumerate(results['ids']):
                    metadata = results['metadatas'][i] if results['metadatas'] else {}
                    items.append({"id": doc_id, **metadata})
            return items
        except Exception as e:
            logger.error(f"Get all error: {e}")
            return []

    def get_count(self) -> int:
        """Total number of cached announcements."""
        if not self.collection:
            return 0
        try:
            return self.collection.count()
        except Exception:
            return 0

    def clear(self):
        """Drop and recreate the announcements collection."""
        if self.client:
            try:
                self.client.delete_collection("announcements")
                self._init_db()
                logger.info("Cache cleared")
            except Exception as e:
                logger.error(f"Clear error: {e}")
|
|
class ContentVectorCache:
    """Content (full-text) vector cache using Korean embeddings."""

    def __init__(self, shared_client=None):
        self.client = shared_client
        self.collection = None
        self.embedding_model = None
        self._index_status = {"in_progress": False, "current": 0, "total": 0, "failed": []}
        self._load_index_status()

    def _load_index_status(self):
        """Load the persisted indexing status."""
        if CONTENT_INDEX_FILE.exists():
            try:
                with open(CONTENT_INDEX_FILE, 'r', encoding='utf-8') as f:
                    saved = json.load(f)
                    self._index_status["failed"] = saved.get("failed", [])
            except Exception:
                pass

    def _save_index_status(self):
        """Persist the indexing status (keeps only the last 100 failures)."""
        try:
            with open(CONTENT_INDEX_FILE, 'w', encoding='utf-8') as f:
                json.dump({
                    "failed": self._index_status["failed"][-100:],
                    "last_updated": datetime.now(KST).isoformat()
                }, f, ensure_ascii=False)
        except Exception:
            pass
|
|
    def _init_db(self):
        """Initialize the content vector DB (shares the ChromaDB client with the metadata cache)."""
        if not CHROMADB_AVAILABLE:
            return
        try:
            if self.client is None:
                meta_cache = get_cache()
                self.client = meta_cache.client
            if self.client is None:
                logger.error("No ChromaDB client available for content cache")
                return
            self.collection = self.client.get_or_create_collection(
                name="announcement_contents",
                metadata={"hnsw:space": "cosine"}
            )
            logger.info("Content VectorDB initialized (shared client)")
        except Exception as e:
            logger.error(f"Content VectorDB init error: {e}")
|
|
    def _get_embedding(self, text: str) -> Optional[List[float]]:
        """Generate a sentence embedding for the given text."""
        if self.embedding_model is None:
            self.embedding_model = get_embedding_model()
        if self.embedding_model is None:
            return None
        try:
            text = text[:8000]
            embedding = self.embedding_model.encode(text, show_progress_bar=False)
            return embedding.tolist()
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            return None

    def add_content(self, pblanc_id: str, title: str, content: str) -> bool:
        """Add (or update) the full-text content of an announcement."""
        if not self.collection:
            self._init_db()
        if not self.collection:
            return False
        try:
            embedding = self._get_embedding(content)
            if embedding is None:
                return False

            doc_id = f"content_{pblanc_id}"
            self.collection.upsert(
                ids=[doc_id],
                embeddings=[embedding],
                documents=[content[:5000]],
                metadatas=[{
                    "pblancId": pblanc_id,
                    "title": title[:200],
                    "content_preview": content[:500],
                    "indexed_at": datetime.now(KST).isoformat()
                }]
            )
            return True
        except Exception as e:
            logger.error(f"Add content error: {e}")
            return False
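
    # Example (sketch; the ID value below is illustrative, not a real announcement):
    #   get_content_cache().add_content("PBLN_000000000012345", title, extracted_text)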
|
|
    def search_similar(self, query: str, n_results: int = 20) -> List[dict]:
        """Search for announcements with similar full-text content."""
        if not self.collection:
            self._init_db()
        if not self.collection:
            return []
        try:
            embedding = self._get_embedding(query)
            if embedding is None:
                return []

            results = self.collection.query(
                query_embeddings=[embedding],
                n_results=n_results,
                include=["documents", "metadatas", "distances"]
            )

            items = []
            if results and results['ids'] and results['ids'][0]:
                for i, doc_id in enumerate(results['ids'][0]):
                    metadata = results['metadatas'][0][i] if results['metadatas'] else {}
                    distance = results['distances'][0][i] if results['distances'] else 0
                    items.append({
                        "id": doc_id,
                        "similarity_score": 1 - distance,
                        "pblancId": metadata.get("pblancId", ""),
                        "title": metadata.get("title", ""),
                        "content_preview": metadata.get("content_preview", ""),
                    })
            return items
        except Exception as e:
            logger.error(f"Content search error: {e}")
            return []
|
|
    def get_indexed_ids(self) -> set:
        """Return the set of announcement IDs whose content has been indexed."""
        if not self.collection:
            self._init_db()
        if not self.collection:
            return set()
        try:
            results = self.collection.get(include=["metadatas"])
            ids = set()
            if results:
                # Prefer the pblancId stored in the metadata.
                if results.get('metadatas'):
                    for meta in results['metadatas']:
                        if meta and meta.get("pblancId"):
                            ids.add(meta["pblancId"])
                # Also recover IDs from the "content_<pblancId>" document ids.
                if results.get('ids'):
                    for doc_id in results['ids']:
                        if doc_id and doc_id.startswith("content_"):
                            pblanc_id = doc_id.replace("content_", "", 1)
                            if pblanc_id:
                                ids.add(pblanc_id)
            logger.info(f"Found {len(ids)} indexed content IDs")
            return ids
        except Exception as e:
            logger.error(f"Get indexed IDs error: {e}")
            return set()
|
|
    def get_indexed_count(self) -> int:
        """Number of indexed content entries."""
        if not self.collection:
            self._init_db()
        if not self.collection:
            return 0
        try:
            return self.collection.count()
        except Exception:
            return 0

    def get_status(self) -> dict:
        """Snapshot of the indexing status."""
        return {
            "total_indexed": self.get_indexed_count(),
            "in_progress": self._index_status["in_progress"],
            "progress_current": self._index_status["current"],
            "progress_total": self._index_status["total"],
            "failed_count": len(self._index_status["failed"])
        }
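
# Sketch of how the two collections can be used together (query string is illustrative):
#   meta_hits = get_cache().search("청년 창업 지원")                     # title/summary level
#   content_hits = get_content_cache().search_similar("청년 창업 지원")  # full-text level
# Callers may merge the two result lists on "pblancId" to rank announcements.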
|
|
class UserProfileCache:
    """User profile storage (keyed by email address)."""

    def __init__(self):
        self.profiles_file = PROFILE_DIR / "profiles.json"
        self.profiles = {}
        self._load_profiles()

    def _load_profiles(self):
        """Load profile data from disk."""
        if self.profiles_file.exists():
            try:
                with open(self.profiles_file, 'r', encoding='utf-8') as f:
                    self.profiles = json.load(f)
                logger.info(f"Loaded {len(self.profiles)} user profiles")
            except Exception as e:
                logger.error(f"Profile load error: {e}")
                self.profiles = {}

    def _save_profiles(self):
        """Write profile data to disk."""
        try:
            with open(self.profiles_file, 'w', encoding='utf-8') as f:
                json.dump(self.profiles, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error(f"Profile save error: {e}")

    def _normalize_email(self, email: str) -> str:
        """Normalize an email address (lowercase, strip whitespace)."""
        return email.strip().lower()
|
|
    def save_profile(self, email: str, profile_data: dict) -> Tuple[bool, str]:
        """Save a profile."""
        if not email or '@' not in email:
            return False, "올바른 이메일 주소를 입력해주세요."

        email_key = self._normalize_email(email)
        self.profiles[email_key] = {
            "email": email_key,
            "profile": profile_data,
            "updated_at": datetime.now(KST).isoformat(),
            "created_at": self.profiles.get(email_key, {}).get("created_at", datetime.now(KST).isoformat())
        }
        self._save_profiles()
        logger.info(f"Profile saved for: {email_key}")
        return True, f"✅ 프로필이 저장되었습니다. ({email_key})"

    def load_profile(self, email: str) -> Tuple[Optional[dict], str]:
        """Load a profile."""
        if not email or '@' not in email:
            return None, "올바른 이메일 주소를 입력해주세요."

        email_key = self._normalize_email(email)
        if email_key in self.profiles:
            data = self.profiles[email_key]
            logger.info(f"Profile loaded for: {email_key}")
            return data.get("profile", {}), f"✅ 프로필을 불러왔습니다. (마지막 수정: {data.get('updated_at', '알 수 없음')[:10]})"

        return None, f"ℹ️ '{email_key}'의 저장된 프로필이 없습니다. 새로 입력해주세요."

    def delete_profile(self, email: str) -> Tuple[bool, str]:
        """Delete a profile."""
        email_key = self._normalize_email(email)
        if email_key in self.profiles:
            del self.profiles[email_key]
            self._save_profiles()
            return True, f"✅ 프로필이 삭제되었습니다. ({email_key})"
        return False, "⚠️ 삭제할 프로필이 없습니다."

    def get_profile_count(self) -> int:
        """Number of stored profiles."""
        return len(self.profiles)
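
# Profiles are kept as plain JSON under <PERSISTENT_DIR>/announcement_cache/user_profiles/profiles.json,
# so on a Hugging Face Space with a /data volume they survive restarts.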
|
|
_cache_instance = None
_content_cache_instance = None
_profile_cache_instance = None
_scheduler = None


def get_cache() -> AnnouncementCache:
    """Metadata cache singleton."""
    global _cache_instance
    if _cache_instance is None:
        _cache_instance = AnnouncementCache()
    return _cache_instance


def get_content_cache() -> ContentVectorCache:
    """Content cache singleton (shares the ChromaDB client with the metadata cache)."""
    global _content_cache_instance
    if _content_cache_instance is None:
        meta_cache = get_cache()
        _content_cache_instance = ContentVectorCache(shared_client=meta_cache.client)
    return _content_cache_instance


def get_profile_cache() -> UserProfileCache:
    """Profile cache singleton."""
    global _profile_cache_instance
    if _profile_cache_instance is None:
        _profile_cache_instance = UserProfileCache()
    return _profile_cache_instance
|
|
def save_sync_metadata(api_count: int, added: int, updated: int, total: int):
    """Append a sync record to the metadata file (keeps the last 50 entries)."""
    try:
        existing = {}
        if METADATA_FILE.exists():
            with open(METADATA_FILE, 'r', encoding='utf-8') as f:
                existing = json.load(f)

        history = existing.get("sync_history", [])
        history.append({
            "timestamp": datetime.now(KST).isoformat(),
            "api_count": api_count,
            "added": added,
            "updated": updated,
            "total_cached": total
        })
        history = history[-50:]

        metadata = {
            "last_sync": datetime.now(KST).isoformat(),
            "total_count": total,
            "sync_history": history
        }
        with open(METADATA_FILE, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error(f"Save metadata error: {e}")
|
|
def get_sync_status() -> dict:
    """Return the current sync and indexing status."""
    cache = get_cache()
    content_cache = get_content_cache()
    content_status = content_cache.get_status()

    status = {
        "total_count": cache.get_count(),
        "last_sync": None,
        "sync_history": [],
        "db_path": str(DB_PATH),
        "chromadb_available": CHROMADB_AVAILABLE,
        "scheduler_available": SCHEDULER_AVAILABLE,
        "content_indexed": content_status["total_indexed"],
        "content_indexing_in_progress": content_status["in_progress"],
        "content_progress": f"{content_status['progress_current']}/{content_status['progress_total']}",
        "content_failed_count": content_status["failed_count"],
        "embedding_available": EMBEDDING_AVAILABLE
    }

    if METADATA_FILE.exists():
        try:
            with open(METADATA_FILE, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            status["last_sync"] = metadata.get("last_sync")
            status["sync_history"] = metadata.get("sync_history", [])[-5:]
        except Exception:
            pass

    return status
|
|
def sync_from_api() -> Tuple[int, int, str]:
    """Sync announcements from the API into the local cache."""
    try:
        from file_api import fetch_all_from_api

        logger.info(f"Starting sync at {datetime.now(KST).strftime('%Y-%m-%d %H:%M:%S %Z')}")
        result = fetch_all_from_api()

        # fetch_all_from_api may return either a plain list or an (items, error_msg) tuple.
        if isinstance(result, tuple):
            items, error_msg = result
            if error_msg and not items:
                return 0, 0, f"API 오류: {error_msg}"
        else:
            items = result

        if not items:
            return 0, 0, "API에서 데이터를 가져오지 못했습니다."

        cache = get_cache()
        added, updated = cache.bulk_upsert(items)
        total = cache.get_count()
        save_sync_metadata(len(items), added, updated, total)

        msg = f"✅ 동기화 완료: API {len(items)}건 → 신규 {added}건, 업데이트 {updated}건 (총 {total}건)"
        logger.info(msg)

        start_background_indexing()
        return added, updated, msg
    except Exception as e:
        logger.error(f"Sync error: {e}")
        return 0, 0, f"동기화 오류: {str(e)}"
|
|
def get_cached_announcements(category: str = "전체", region: str = "전체(지역)", keyword: str = "") -> Tuple[List[dict], str]:
    """Fetch announcements from the cache; returns (items, status_msg)."""
    # Note: `category` and `region` are not currently applied inside this function.
    cache = get_cache()
    if keyword:
        items = cache.search(keyword, n_results=100)
        return items, f"🔍 '{keyword}' 검색"
    else:
        items = cache.get_all(limit=1000)
        return items, f"⚡ 캐시에서 {len(items)}건 로드"
|
|
_indexing_thread = None
_indexing_lock = threading.Lock()


def background_index_contents():
    """Index announcement contents in the background."""
    content_cache = get_content_cache()
    if content_cache._index_status["in_progress"]:
        logger.info("Background indexing already running")
        return

    content_cache._index_status["in_progress"] = True
    try:
        from file_api import fetch_announcement_detail, download_file, extract_text_from_file

        cache = get_cache()
        all_items = cache.get_all(limit=2000)
        indexed_ids = content_cache.get_indexed_ids()

        def safe_str(val):
            return str(val) if val else ""
|
|
        def is_ongoing_check(req_dt: str) -> bool:
            """Return True if the application period has not ended (or cannot be determined)."""
            if not req_dt:
                return True
            try:
                import re
                date_patterns = [
                    r'(\d{4})[.\-/](\d{1,2})[.\-/](\d{1,2})',
                    r'(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일'
                ]
                dates_found = []
                for pattern in date_patterns:
                    matches = re.findall(pattern, req_dt)
                    for m in matches:
                        try:
                            d = datetime(int(m[0]), int(m[1]), int(m[2]))
                            dates_found.append(d)
                        except (ValueError, TypeError):
                            continue
                if dates_found:
                    # Treat the latest date found in the string as the application deadline.
                    end_date = max(dates_found)
                    today = datetime.now()
                    return end_date >= today
                return True
            except Exception:
                return True
|
|
        items_to_index = []
        skipped_expired = 0
        skipped_indexed = 0

        for item in all_items:
            pblanc_id = safe_str(item.get("pblancId") or item.get("seq"))
            if not pblanc_id:
                continue
            if pblanc_id in indexed_ids:
                skipped_indexed += 1
                continue
            req_dt = safe_str(item.get("reqstDt") or item.get("reqstBeginEndDe"))
            if not is_ongoing_check(req_dt):
                skipped_expired += 1
                continue
            items_to_index.append(item)

        total = len(items_to_index)
        content_cache._index_status["total"] = total
        content_cache._index_status["current"] = 0
        logger.info(f"Starting background content indexing: {total} items (skipped {skipped_indexed} already indexed, {skipped_expired} expired)")
|
|
        for i, item in enumerate(items_to_index):
            if not content_cache._index_status["in_progress"]:
                break

            content_cache._index_status["current"] = i + 1
            pblanc_id = safe_str(item.get("pblancId") or item.get("seq"))
            title = safe_str(item.get("title") or item.get("pblancNm"))
            print_url = safe_str(item.get("printFileUrl") or item.get("printFlpthNm"))
            print_name = safe_str(item.get("printFileName") or item.get("printFileNm"))

            content_text = ""

            # First try the attached print file (downloaded to a temp dir, then text-extracted).
            if print_url and print_name:
                try:
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        file_path, err = download_file(print_url, tmp_dir, print_name)
                        if file_path and not err:
                            text, ext_err = extract_text_from_file(file_path)
                            if text:
                                content_text = text
                except Exception as e:
                    logger.warning(f"Text extraction failed for {pblanc_id}: {e}")

            # Fall back to scraping the announcement detail page.
            if not content_text:
                link = safe_str(item.get("link") or item.get("pblancUrl"))
                if link:
                    try:
                        detail_content, _, scraped_print = fetch_announcement_detail(link)
                        if detail_content:
                            content_text = detail_content
                        elif scraped_print and scraped_print.get("url"):
                            with tempfile.TemporaryDirectory() as tmp_dir:
                                file_path, err = download_file(
                                    scraped_print["url"], tmp_dir,
                                    scraped_print.get("filename", "file")
                                )
                                if file_path and not err:
                                    text, _ = extract_text_from_file(file_path)
                                    if text:
                                        content_text = text
                    except Exception:
                        pass

            if content_text and len(content_text) > 100:
                success = content_cache.add_content(pblanc_id, title, content_text)
                if success:
                    logger.info(f"Indexed [{i+1}/{total}]: {title[:50]}...")
                else:
                    content_cache._index_status["failed"].append(pblanc_id)
            else:
                content_cache._index_status["failed"].append(pblanc_id)

            # Persist the failure list periodically.
            if (i + 1) % 50 == 0:
                content_cache._save_index_status()

        content_cache._save_index_status()
        logger.info(f"Background indexing complete: {content_cache.get_indexed_count()} indexed")
    except Exception as e:
        logger.error(f"Background indexing error: {e}")
    finally:
        content_cache._index_status["in_progress"] = False
|
|
def start_background_indexing() -> bool:
    """Start the background indexing thread (no-op if one is already running)."""
    global _indexing_thread
    with _indexing_lock:
        if _indexing_thread and _indexing_thread.is_alive():
            logger.info("Background indexing already running")
            return False
        _indexing_thread = threading.Thread(target=background_index_contents, daemon=True)
        _indexing_thread.start()
        logger.info("Background content indexing thread started")
        return True
|
|
def stop_scheduler():
    """Stop the auto-sync scheduler."""
    global _scheduler
    if _scheduler:
        _scheduler.shutdown()
        _scheduler = None
        logger.info("Scheduler stopped")


def start_scheduler():
    """Start the auto-sync scheduler (10:00 and 22:00 KST)."""
    global _scheduler
    if not SCHEDULER_AVAILABLE:
        logger.warning("Scheduler not available")
        return False
    if _scheduler:
        return True
    try:
        _scheduler = BackgroundScheduler(timezone=KST)
        _scheduler.add_job(
            sync_from_api,
            CronTrigger(hour=10, minute=0, timezone=KST),
            id='sync_10am',
            name='Daily sync at 10:00 KST',
            replace_existing=True
        )
        _scheduler.add_job(
            sync_from_api,
            CronTrigger(hour=22, minute=0, timezone=KST),
            id='sync_10pm',
            name='Daily sync at 22:00 KST',
            replace_existing=True
        )
        _scheduler.start()
        logger.info("Scheduler started: sync at 10:00 and 22:00 KST")
        return True
    except Exception as e:
        logger.error(f"Scheduler start error: {e}")
        return False
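
# APScheduler's BackgroundScheduler runs jobs on threads inside this same process, so each
# scheduled sync_from_api call shares the in-process ChromaDB client and singletons above.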
|
|
def manual_sync() -> str:
    """Run a sync immediately and return the status message."""
    added, updated, msg = sync_from_api()
    return msg


def initialize_cache_system():
    """Initialize the cache system (call once at app startup)."""
    logger.info("Initializing cache system...")

    cache = get_cache()
    count = cache.get_count()
    if count == 0:
        logger.info("Cache is empty, performing initial sync...")
        sync_from_api()
    else:
        logger.info(f"Cache loaded with {count} announcements")

    content_cache = get_content_cache()
    content_count = content_cache.get_indexed_count()
    logger.info(f"Content cache: {content_count} indexed")

    profile_cache = get_profile_cache()
    logger.info(f"Profile cache: {profile_cache.get_profile_count()} profiles")

    start_scheduler()
    start_background_indexing()

    return get_sync_status()
|
|
if __name__ == "__main__":
    print("Testing cache system...")
    status = initialize_cache_system()
    print(f"Status: {json.dumps(status, ensure_ascii=False, indent=2)}")