"""
Storage configuration for Parquet files
Persistent storage in /data/ directory on Hugging Face Spaces
"""
import pandas as pd
import json
from pathlib import Path
from typing import Dict, Any
# Choose the storage root. Prefer /data — the persistent volume on
# Hugging Face Spaces — and fall back to a local ./data directory
# beside the package when /data is absent or read-only.
try:
    DATA_DIR = Path('/data')
    DATA_DIR.mkdir(exist_ok=True)
    # Probe for write access before committing to this location.
    _probe = DATA_DIR / '.test'
    _probe.touch()
    _probe.unlink()
    print(f"✅ Using persistent storage at {DATA_DIR}")
except (PermissionError, OSError) as err:
    print(f"⚠️ Cannot use /data directory: {err}")
    print("Using local directory for storage")
    DATA_DIR = Path(__file__).parent.parent / 'data'
    DATA_DIR.mkdir(exist_ok=True)
    print(f"✅ Using local storage at {DATA_DIR}")
# Parquet file paths (all rooted at DATA_DIR, chosen above)
CONTENT_ITEMS_PATH = DATA_DIR / 'content_items.parquet'      # raw fetched content items
CLARITY_ANALYSES_PATH = DATA_DIR / 'clarity_analyses.parquet'  # per-item clarity analysis results
FETCH_LOGS_PATH = DATA_DIR / 'fetch_logs.parquet'            # one row per fetch run
SOURCES_CONFIG_PATH = DATA_DIR / 'content_sources.json'      # JSON list of configured content sources
def _create_empty_parquet(path: Path, columns: list) -> None:
    """Create an empty parquet file with the given columns if *path* is missing."""
    if not path.exists():
        df = pd.DataFrame(columns=columns)
        df.to_parquet(path, index=False)
        print(f"✅ Created {path.name}")


def init_storage() -> None:
    """Initialize storage with empty parquet files and default config.

    Idempotent: each file is created only if it does not already exist,
    so calling this repeatedly (e.g. on every import) is safe.
    """
    # Raw fetched content items, deduplicated by content_hash.
    _create_empty_parquet(CONTENT_ITEMS_PATH, [
        'url', 'content_hash', 'source_id', 'title', 'content_text',
        'content_html', 'published_at', 'fetched_at', 'category',
        'tags', 'is_processed', 'processing_error'
    ])
    # Clarity analysis results, one row per analyzed content item.
    _create_empty_parquet(CLARITY_ANALYSES_PATH, [
        'content_hash', 'overall_score', 'readability_score',
        'complexity_score', 'sentence_stats', 'vocabulary_stats',
        'readability_metrics', 'grammar_stats', 'jargon_count',
        'jargon_words', 'long_sentences_count', 'suggestions',
        'analyzer_version', 'analysis_date', 'processing_time_ms'
    ])
    # Audit log of fetch runs per source.
    _create_empty_parquet(FETCH_LOGS_PATH, [
        'source_id', 'fetch_start', 'fetch_end', 'items_fetched',
        'items_new', 'items_updated', 'items_failed', 'status', 'error_message'
    ])
    # Default content sources config, written once; edited copies are preserved.
    if not SOURCES_CONFIG_PATH.exists():
        sources = [
            {
                'id': 1,
                'source_type': 'RSS',
                'source_name': 'Madrid City Council News Feed',
                'source_url': 'https://diario.madrid.es/feed',
                'is_active': True,
                'fetch_frequency_hours': 6
            },
            {
                'id': 2,
                'source_type': 'API',
                'source_name': 'Madrid Open Data Portal',
                'source_url': 'https://datos.madrid.es/portal/site/egob',
                'is_active': True,
                'fetch_frequency_hours': 24
            }
        ]
        # ensure_ascii=False keeps accented Spanish names readable in the file.
        with open(SOURCES_CONFIG_PATH, 'w', encoding='utf-8') as f:
            json.dump(sources, f, indent=2, ensure_ascii=False)
        print("✅ Created content_sources.json")
    print(f"✅ Storage initialized at {DATA_DIR}")
def get_sources() -> list:
    """Return the content sources from the JSON config.

    Creates the default configuration first if the file is missing.
    """
    if not SOURCES_CONFIG_PATH.exists():
        init_storage()
    raw = SOURCES_CONFIG_PATH.read_text(encoding='utf-8')
    return json.loads(raw)
# Initialize storage on module import.
# NOTE: this runs filesystem I/O as an import side effect — importers get
# the parquet files and default config created automatically.
init_storage()