Spaces:
Sleeping
Sleeping
"""
Cache system to avoid repeated reference lookups.

Uses SQLite for local persistence.
"""
import hashlib
import json
import sqlite3
from contextlib import closing
from dataclasses import asdict
from typing import List, Optional, Union

from search_services import SearchResult
# Sentinel returned by ReferenceCache.get() for a cached *negative* result
# ("we searched before and found nothing"), as opposed to None, which means
# the query is simply not in the cache (or its entry has expired).
CACHE_NOT_FOUND = "__NOT_FOUND__"
class ReferenceCache:
    """SQLite-backed cache for reference searches and PDF reference parsing.

    Two tables live in a single database file:

    * ``search_cache`` -- results of reference lookups, keyed by a hash of
      the normalized query. Negative results are cached too, so repeated
      misses do not trigger new external searches.
    * ``pdf_parse_cache`` -- lists of references extracted from PDFs, keyed
      by a hash of the PDF's text content.

    Entries older than ``expiry_days`` are treated as absent by the getters
    and are removed by :meth:`clear_expired`.
    """

    def __init__(self, db_path: str = "reference_cache.db", expiry_days: int = 30):
        """
        Args:
            db_path: Path of the SQLite database file (created on demand).
            expiry_days: Age in days after which cached entries expire.
        """
        self.db_path = db_path
        self.expiry_days = expiry_days
        self._init_db()

    def _init_db(self) -> None:
        """Create the cache tables and indexes if they do not exist yet."""
        # NOTE: sqlite3's connection context manager only scopes a
        # transaction (commit/rollback) -- it does NOT close the connection.
        # Wrapping in closing() releases the handle; the previous code leaked
        # one connection per call.
        with closing(sqlite3.connect(self.db_path)) as conn:
            # Reference-search cache.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS search_cache (
                    query_hash TEXT PRIMARY KEY,
                    query_text TEXT,
                    result_json TEXT,
                    found INTEGER,
                    source TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    hits INTEGER DEFAULT 0
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_created_at
                ON search_cache(created_at)
            """)
            # PDF-parsing cache.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS pdf_parse_cache (
                    content_hash TEXT PRIMARY KEY,
                    pdf_name TEXT,
                    references_json TEXT,
                    ref_count INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    hits INTEGER DEFAULT 0
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_pdf_created_at
                ON pdf_parse_cache(created_at)
            """)
            conn.commit()

    def _hash_query(self, query: str) -> str:
        """Return a stable hash for a search query (case/whitespace-insensitive)."""
        normalized = query.lower().strip()
        return hashlib.sha256(normalized.encode()).hexdigest()[:32]

    def _hash_content(self, content: str) -> str:
        """Return a stable hash for raw text content (e.g. extracted PDF text)."""
        return hashlib.sha256(content.encode()).hexdigest()[:32]

    # ==================== Reference-search cache ====================

    def get(self, query: str) -> Optional[Union["SearchResult", str]]:
        """Look up a cached search result for *query*.

        Returns:
            The cached ``SearchResult`` on a positive hit, the
            ``CACHE_NOT_FOUND`` sentinel string when a *negative* result is
            cached, or ``None`` when the query is absent or its entry
            expired.
        """
        query_hash = self._hash_query(query)
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute(
                """
                SELECT * FROM search_cache
                WHERE query_hash = ?
                AND created_at > datetime('now', ?)
                """,
                (query_hash, f"-{self.expiry_days} days"),
            )
            row = cursor.fetchone()
            if row:
                # Count the hit for statistics.
                conn.execute(
                    "UPDATE search_cache SET hits = hits + 1 WHERE query_hash = ?",
                    (query_hash,),
                )
                conn.commit()
                if row["found"]:
                    data = json.loads(row["result_json"])
                    return SearchResult(**data)
                # Cached negative result (searched before, nothing found).
                return CACHE_NOT_FOUND
        return None

    def set(self, query: str, result: Optional["SearchResult"]) -> None:
        """Store *result* for *query*; ``None`` caches a negative result."""
        query_hash = self._hash_query(query)
        if result is None:
            result_json = "{}"
            found = False
            source = None
        else:
            result_json = json.dumps(asdict(result), ensure_ascii=False)
            found = True
            source = result.source
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO search_cache
                (query_hash, query_text, result_json, found, source, created_at, hits)
                VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP, 0)
                """,
                # The query text is truncated: it is stored for debugging
                # only; the hash is the real key.
                (query_hash, query[:500], result_json, found, source),
            )
            conn.commit()

    def get_stats(self) -> dict:
        """Return statistics about the search cache."""
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.row_factory = sqlite3.Row
            # Total entries.
            total = conn.execute("SELECT COUNT(*) as cnt FROM search_cache").fetchone()[
                "cnt"
            ]
            # Positive vs. negative entries.
            found = conn.execute(
                "SELECT COUNT(*) as cnt FROM search_cache WHERE found = 1"
            ).fetchone()["cnt"]
            # Total hits (SUM is NULL on an empty table, hence the `or 0`).
            hits = (
                conn.execute("SELECT SUM(hits) as total FROM search_cache").fetchone()[
                    "total"
                ]
                or 0
            )
            # Breakdown by source.
            sources = conn.execute(
                """
                SELECT source, COUNT(*) as cnt
                FROM search_cache
                WHERE found = 1
                GROUP BY source
                """
            ).fetchall()
            return {
                "total_entries": total,
                "found": found,
                "not_found": total - found,
                "total_hits": hits,
                "by_source": {row["source"]: row["cnt"] for row in sources},
            }

    def clear_expired(self) -> int:
        """Remove expired entries from both caches. Returns rows removed."""
        cutoff = (f"-{self.expiry_days} days",)
        with closing(sqlite3.connect(self.db_path)) as conn:
            removed = conn.execute(
                """
                DELETE FROM search_cache
                WHERE created_at < datetime('now', ?)
                """,
                cutoff,
            ).rowcount
            # Bug fix: expired pdf_parse_cache rows were never purged even
            # though get_pdf_refs() already treats them as absent.
            removed += conn.execute(
                """
                DELETE FROM pdf_parse_cache
                WHERE created_at < datetime('now', ?)
                """,
                cutoff,
            ).rowcount
            conn.commit()
            return removed

    def clear_all(self) -> int:
        """Clear both caches entirely. Returns total rows removed."""
        with closing(sqlite3.connect(self.db_path)) as conn:
            cursor = conn.execute("DELETE FROM search_cache")
            count1 = cursor.rowcount
            cursor = conn.execute("DELETE FROM pdf_parse_cache")
            count2 = cursor.rowcount
            conn.commit()
            return count1 + count2

    # ==================== PDF-parsing cache ====================

    def get_pdf_refs(
        self, text_content: str, pdf_name: str = ""
    ) -> Optional[List[str]]:
        """Return cached references for a PDF, keyed by its content hash.

        Returns ``None`` when the content is not cached or the entry
        expired. ``pdf_name`` is accepted for interface symmetry with
        :meth:`set_pdf_refs` but does not affect the lookup.
        """
        content_hash = self._hash_content(text_content)
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute(
                """
                SELECT * FROM pdf_parse_cache
                WHERE content_hash = ?
                AND created_at > datetime('now', ?)
                """,
                (content_hash, f"-{self.expiry_days} days"),
            )
            row = cursor.fetchone()
            if row:
                # Count the hit for statistics.
                conn.execute(
                    "UPDATE pdf_parse_cache SET hits = hits + 1 WHERE content_hash = ?",
                    (content_hash,),
                )
                conn.commit()
                return json.loads(row["references_json"])
        return None

    def set_pdf_refs(
        self, text_content: str, references: List[str], pdf_name: str = ""
    ) -> None:
        """Store the references extracted from a PDF, keyed by content hash."""
        content_hash = self._hash_content(text_content)
        refs_json = json.dumps(references, ensure_ascii=False)
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO pdf_parse_cache
                (content_hash, pdf_name, references_json, ref_count, created_at, hits)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, 0)
                """,
                # pdf_name is informational only; truncated to keep rows small.
                (content_hash, pdf_name[:200], refs_json, len(references)),
            )
            conn.commit()

    def get_pdf_cache_stats(self) -> dict:
        """Return statistics about the PDF-parsing cache."""
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.row_factory = sqlite3.Row
            total = conn.execute(
                "SELECT COUNT(*) as cnt FROM pdf_parse_cache"
            ).fetchone()["cnt"]
            hits = (
                conn.execute(
                    "SELECT SUM(hits) as total FROM pdf_parse_cache"
                ).fetchone()["total"]
                or 0
            )
            refs = (
                conn.execute(
                    "SELECT SUM(ref_count) as total FROM pdf_parse_cache"
                ).fetchone()["total"]
                or 0
            )
            return {"pdfs_cached": total, "total_hits": hits, "total_refs_cached": refs}
# Process-wide singleton, created lazily by get_cache().
_cache_instance: Optional[ReferenceCache] = None


def get_cache() -> ReferenceCache:
    """Return the shared ReferenceCache, creating it on first use."""
    global _cache_instance
    if _cache_instance is not None:
        return _cache_instance
    _cache_instance = ReferenceCache()
    return _cache_instance