"""
Database Utilities (Supabase)
=============================
Manages all Supabase operations: deduplication checks and article insertion.

Tables used:
    - `registry` — tracks which article IDs have been processed (deduplication)
    - `articles` — stores final processed articles with summaries and audio URLs

Configuration (required in .env):
    SUPABASE_URL=https://your-project.supabase.co
    SUPABASE_KEY=your_service_role_key

Usage:
    from backend.utils.db_utils import DatabaseManager

    db = DatabaseManager()

    # Check which articles are already processed
    existing = db.check_registry(["id1", "id2", "id3"])

    # Insert a fully processed article
    db.insert_article(article_dict)
"""
import os
from supabase import create_client, Client
from dotenv import load_dotenv
import logging
# Load variables from a local .env file (if present) into the process
# environment so the os.environ lookups for SUPABASE_URL / SUPABASE_KEY
# performed by DatabaseManager can see them.
load_dotenv()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class DatabaseManager:
    """Supabase-backed store for article deduplication and persistence.

    Wraps a Supabase client and exposes two operations: looking up already
    processed article IDs in the `registry` table, and upserting a processed
    article into the `articles` and `registry` tables.

    If SUPABASE_URL or SUPABASE_KEY is missing, or the client cannot be
    created, `self.supabase` is None and every operation degrades gracefully
    (empty set / False) instead of raising.

    NOTE(review): this class adds no locking of its own; whether a single
    instance may be shared between threads depends on the underlying
    Supabase client — confirm before relying on it.
    """

    # URL schemes accepted as-is; anything else gets an "https://" prefix.
    _URL_SCHEMES = ("http://", "https://")

    # Maximum number of IDs sent in one registry lookup. The `in_` filter is
    # encoded into the request URL, so a very long ID list could exceed URL
    # length limits; the value is a conservative guess, not a measured limit.
    _REGISTRY_BATCH_SIZE = 100

    def __init__(self):
        """Create the Supabase client from SUPABASE_URL / SUPABASE_KEY.

        Never raises: on missing configuration or client construction
        failure the problem is logged and `self.supabase` is left as None.
        """
        url = self._normalize_url(os.environ.get("SUPABASE_URL", ""))
        key = os.environ.get("SUPABASE_KEY", "").strip()

        # String annotation: documents the attribute without needing the
        # name to be resolvable at runtime.
        self.supabase: "Client | None" = None

        if not url or not key:
            logger.warning("Supabase URL or Key not found. Database operations will be skipped.")
            return

        try:
            self.supabase = create_client(url, key)
        except Exception as e:
            # Best-effort by design: a broken configuration must not crash
            # the caller, it only disables database operations.
            logger.error("Failed to initialize Supabase client: %s", e)

    @staticmethod
    def _normalize_url(raw_url) -> str:
        """Return a usable base URL, or "" when no host is configured.

        Adds a missing "https://" prefix. The scheme check is done against
        the full "http://" / "https://" prefixes so that a bare host which
        merely starts with "http" (e.g. "httpbin.example.co") is still
        prefixed correctly.
        """
        url = (raw_url or "").strip()
        if not url:
            return ""
        if not url.lower().startswith(DatabaseManager._URL_SCHEMES):
            url = f"https://{url}"
        # A bare scheme carries no host and is treated as "not configured".
        if url.lower() in DatabaseManager._URL_SCHEMES:
            return ""
        return url

    def check_registry(self, article_ids: list) -> set:
        """Check which article IDs are already in the registry table.

        IDs are de-duplicated and looked up in batches of at most
        `_REGISTRY_BATCH_SIZE` so that a large input does not produce a
        single oversized request.

        Args:
            article_ids: Iterable of article ID strings to check.

        Returns:
            Set of IDs that already exist in registry (should be skipped).
            An empty set is returned when Supabase is not configured, when
            no IDs are given, or when any lookup fails (the error is logged).
        """
        if not self.supabase or not article_ids:
            return set()

        found = set()
        try:
            # dict.fromkeys de-duplicates while keeping the input order and
            # also materializes one-shot iterables.
            unique_ids = list(dict.fromkeys(article_ids))
            step = self._REGISTRY_BATCH_SIZE
            for start in range(0, len(unique_ids), step):
                batch = unique_ids[start:start + step]
                response = (
                    self.supabase.table("registry")
                    .select("id")
                    .in_("id", batch)
                    .execute()
                )
                # `data` may be None/empty when nothing matched.
                found.update(row["id"] for row in (response.data or []))
        except Exception as e:
            # All-or-nothing: on failure report "nothing known" so callers
            # reprocess rather than silently skip articles.
            logger.error("Error checking registry: %s", e)
            return set()
        return found

    def insert_article(self, article_data: dict) -> bool:
        """Insert a processed article into both `articles` and `registry` tables.

        Uses upsert to handle re-runs gracefully. The article must have a
        non-empty 'id' key. The `articles` row is written first and the
        `registry` row second, so an article is only marked as completed
        after its content has been stored.

        Args:
            article_data: Dict with the keys id, category, title, author,
                url, content, summary, audio_url, published_date,
                scraped_at, summary_generated_at. Note that the input key
                `published_date` is stored under the `published_at` key of
                the articles record. Missing text fields default to ''.

        Returns:
            True on success, False on failure, invalid input, missing 'id',
            or missing Supabase config.
        """
        if not self.supabase:
            return False

        # Guard against non-mapping input (e.g. None) up front so the error
        # path below never has to call `.get` on something that lacks it.
        read = getattr(article_data, "get", None)
        if not callable(read):
            logger.error(
                "Error inserting article: expected a dict, got %s",
                type(article_data).__name__,
            )
            return False

        article_id = read('id')
        if not article_id:
            logger.warning("Skipping article insert: missing 'id'.")
            return False

        try:
            category = read('category', '')
            title = read('title', '')

            article_record = {
                "id": article_id,
                "category": category,
                "title": title,
                "author": read('author', ''),
                "url": read('url', ''),
                "content": read('content', ''),
                "summary": read('summary', ''),
                "audio_url": read('audio_url', ''),
                "published_at": read('published_date'),
                "scraped_at": read('scraped_at'),
                "summary_generated_at": read('summary_generated_at'),
            }
            registry_record = {
                "id": article_id,
                "category": category,
                "title": title,
                "status": "completed",
            }

            self.supabase.table("articles").upsert(article_record).execute()
            self.supabase.table("registry").upsert(registry_record).execute()
        except Exception as e:
            logger.error("Error inserting article %s: %s", article_id, e)
            return False

        logger.debug("Successfully saved article %s to database.", article_id)
        return True
|