# NOTE(review): the three lines here ("Spaces: / Sleeping / Sleeping") were a
# Hugging Face Spaces page header captured during export, not source code.
"""
Storage configuration for Parquet files.

Uses persistent storage in the /data/ directory on Hugging Face Spaces,
falling back to a local ./data directory when /data is not writable.
"""
import json
from pathlib import Path
from typing import Dict, Any

import pandas as pd

# Storage path (persistent on HF Spaces!)
# Try /data first (HF Spaces persistent storage), fall back to local directory.
try:
    DATA_DIR = Path('/data')
    DATA_DIR.mkdir(exist_ok=True)
    # Probe write access explicitly: mkdir(exist_ok=True) succeeds even on a
    # pre-existing read-only mount, so touch-and-delete a marker file.
    _probe = DATA_DIR / '.test'
    _probe.touch()
    _probe.unlink()
    print(f"✓ Using persistent storage at {DATA_DIR}")
except (PermissionError, OSError) as e:
    print(f"⚠ Cannot use /data directory: {e}")
    print("Using local directory for storage")
    DATA_DIR = Path(__file__).parent.parent / 'data'
    # parents=True: the fallback directory's parent may not exist yet in a
    # fresh checkout; without it mkdir raises FileNotFoundError.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    print(f"✓ Using local storage at {DATA_DIR}")

# Parquet file paths
CONTENT_ITEMS_PATH = DATA_DIR / 'content_items.parquet'
CLARITY_ANALYSES_PATH = DATA_DIR / 'clarity_analyses.parquet'
FETCH_LOGS_PATH = DATA_DIR / 'fetch_logs.parquet'
SOURCES_CONFIG_PATH = DATA_DIR / 'content_sources.json'
def _ensure_parquet(path: Path, columns: list) -> None:
    """Create an empty parquet file at *path* with *columns* if it is missing.

    No-op when the file already exists, so stored data is never clobbered.
    """
    if not path.exists():
        pd.DataFrame(columns=columns).to_parquet(path, index=False)
        print(f"✓ Created {path.name}")


def init_storage():
    """Initialize storage with empty parquet files and default config.

    Idempotent: each file is created only if absent, so calling this on every
    import/startup is safe and preserves existing data.
    """
    # Raw fetched content (one row per document).
    _ensure_parquet(CONTENT_ITEMS_PATH, [
        'url', 'content_hash', 'source_id', 'title', 'content_text',
        'content_html', 'published_at', 'fetched_at', 'category',
        'tags', 'is_processed', 'processing_error'
    ])
    # Per-document clarity/readability analysis results, keyed by content_hash.
    _ensure_parquet(CLARITY_ANALYSES_PATH, [
        'content_hash', 'overall_score', 'readability_score',
        'complexity_score', 'sentence_stats', 'vocabulary_stats',
        'readability_metrics', 'grammar_stats', 'jargon_count',
        'jargon_words', 'long_sentences_count', 'suggestions',
        'analyzer_version', 'analysis_date', 'processing_time_ms'
    ])
    # One row per fetch run, for monitoring source ingestion.
    _ensure_parquet(FETCH_LOGS_PATH, [
        'source_id', 'fetch_start', 'fetch_end', 'items_fetched',
        'items_new', 'items_updated', 'items_failed', 'status', 'error_message'
    ])
    # Default content sources; created once, user edits survive restarts.
    if not SOURCES_CONFIG_PATH.exists():
        sources = [
            {
                'id': 1,
                'source_type': 'RSS',
                'source_name': 'Madrid City Council News Feed',
                'source_url': 'https://diario.madrid.es/feed',
                'is_active': True,
                'fetch_frequency_hours': 6
            },
            {
                'id': 2,
                'source_type': 'API',
                'source_name': 'Madrid Open Data Portal',
                'source_url': 'https://datos.madrid.es/portal/site/egob',
                'is_active': True,
                'fetch_frequency_hours': 24
            }
        ]
        with open(SOURCES_CONFIG_PATH, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps accented Spanish text human-readable.
            json.dump(sources, f, indent=2, ensure_ascii=False)
        print("✓ Created content_sources.json")
    print(f"✓ Storage initialized at {DATA_DIR}")
def get_sources() -> list:
    """Return the list of content sources from the JSON config file.

    Bootstraps the storage (including a default config) on first use.
    """
    if not SOURCES_CONFIG_PATH.exists():
        init_storage()
    return json.loads(SOURCES_CONFIG_PATH.read_text(encoding='utf-8'))
# Initialize storage on module import, so the parquet files and sources
# config are guaranteed to exist before any importer touches the paths above.
init_storage()