# madriClaro/config/database.py
# Author: Ruben
# Replace DuckDB with Parquet file storage (commit 4f48a7d)
"""
Storage configuration for Parquet files
Persistent storage in /data/ directory on Hugging Face Spaces
"""
import pandas as pd
import json
from pathlib import Path
from typing import Dict, Any
# Storage path (persistent on HF Spaces!)
# Try /data first (HF Spaces persistent storage), fall back to local directory
try:
DATA_DIR = Path('/data')
DATA_DIR.mkdir(exist_ok=True)
# Test write access
test_file = DATA_DIR / '.test'
test_file.touch()
test_file.unlink()
print(f"βœ… Using persistent storage at {DATA_DIR}")
except (PermissionError, OSError) as e:
print(f"⚠️ Cannot use /data directory: {e}")
print("Using local directory for storage")
DATA_DIR = Path(__file__).parent.parent / 'data'
DATA_DIR.mkdir(exist_ok=True)
print(f"βœ… Using local storage at {DATA_DIR}")
# Parquet file paths (all live under DATA_DIR so they persist across restarts)
CONTENT_ITEMS_PATH = DATA_DIR / 'content_items.parquet'        # fetched content records
CLARITY_ANALYSES_PATH = DATA_DIR / 'clarity_analyses.parquet'  # clarity-analysis results per content item
FETCH_LOGS_PATH = DATA_DIR / 'fetch_logs.parquet'              # per-source fetch run logs
SOURCES_CONFIG_PATH = DATA_DIR / 'content_sources.json'        # content source definitions (JSON, not parquet)
def _ensure_parquet(path: Path, columns: list) -> None:
    """Create an empty parquet file with the given column schema if *path* is missing."""
    if not path.exists():
        df = pd.DataFrame(columns=columns)
        df.to_parquet(path, index=False)
        print(f"βœ… Created {path.name}")


def init_storage():
    """Initialize storage with empty parquet files and default config.

    Idempotent: each file is created only if it does not already exist,
    so existing data is never overwritten.
    """
    # Fetched content items.
    _ensure_parquet(CONTENT_ITEMS_PATH, [
        'url', 'content_hash', 'source_id', 'title', 'content_text',
        'content_html', 'published_at', 'fetched_at', 'category',
        'tags', 'is_processed', 'processing_error'
    ])
    # Clarity-analysis results, keyed by content_hash.
    _ensure_parquet(CLARITY_ANALYSES_PATH, [
        'content_hash', 'overall_score', 'readability_score',
        'complexity_score', 'sentence_stats', 'vocabulary_stats',
        'readability_metrics', 'grammar_stats', 'jargon_count',
        'jargon_words', 'long_sentences_count', 'suggestions',
        'analyzer_version', 'analysis_date', 'processing_time_ms'
    ])
    # Per-source fetch run logs.
    _ensure_parquet(FETCH_LOGS_PATH, [
        'source_id', 'fetch_start', 'fetch_end', 'items_fetched',
        'items_new', 'items_updated', 'items_failed', 'status', 'error_message'
    ])
    # Default content sources, written once; edit content_sources.json to change them.
    if not SOURCES_CONFIG_PATH.exists():
        sources = [
            {
                'id': 1,
                'source_type': 'RSS',
                'source_name': 'Madrid City Council News Feed',
                'source_url': 'https://diario.madrid.es/feed',
                'is_active': True,
                'fetch_frequency_hours': 6
            },
            {
                'id': 2,
                'source_type': 'API',
                'source_name': 'Madrid Open Data Portal',
                'source_url': 'https://datos.madrid.es/portal/site/egob',
                'is_active': True,
                'fetch_frequency_hours': 24
            }
        ]
        # ensure_ascii=False keeps any accented Spanish names readable in the file.
        with open(SOURCES_CONFIG_PATH, 'w', encoding='utf-8') as f:
            json.dump(sources, f, indent=2, ensure_ascii=False)
        print("βœ… Created content_sources.json")
    print(f"βœ… Storage initialized at {DATA_DIR}")
def get_sources() -> list:
    """Return the content sources loaded from the JSON config file.

    Creates the default configuration first when the file is missing.
    """
    if not SOURCES_CONFIG_PATH.exists():
        init_storage()
    return json.loads(SOURCES_CONFIG_PATH.read_text(encoding='utf-8'))
# Initialize storage on module import.
# Idempotent: init_storage() only creates files that do not already exist,
# so repeated imports are harmless.
init_storage()