File size: 3,862 Bytes
28aa7d9
4f48a7d
28aa7d9
 
 
4f48a7d
 
28aa7d9
4f48a7d
28aa7d9
4f48a7d
3b56477
 
 
 
4f48a7d
 
 
 
 
3b56477
 
4f48a7d
3b56477
 
4f48a7d
28aa7d9
4f48a7d
 
 
 
 
28aa7d9
 
4f48a7d
 
611be19
4f48a7d
 
 
 
 
 
 
 
 
611be19
4f48a7d
 
 
 
 
 
 
 
 
 
 
611be19
4f48a7d
 
 
 
 
 
 
 
28aa7d9
4f48a7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28aa7d9
4f48a7d
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Storage configuration for Parquet files
Persistent storage in /data/ directory on Hugging Face Spaces
"""

import pandas as pd
import json
from pathlib import Path
from typing import Dict, Any

# Storage path (persistent on HF Spaces!)
# Try /data first (HF Spaces persistent storage), fall back to local directory
# Storage path (persistent on HF Spaces!)
# Try /data first (HF Spaces persistent storage), fall back to local directory
try:
    DATA_DIR = Path('/data')
    DATA_DIR.mkdir(exist_ok=True)
    # Probe writability: /data may exist but be read-only, so touch and
    # remove a throwaway sentinel before committing to it.
    _probe = DATA_DIR / '.test'
    _probe.touch()
    _probe.unlink()
    print(f"✅ Using persistent storage at {DATA_DIR}")
except (PermissionError, OSError) as e:
    # No usable /data mount — fall back to a ./data directory next to the package.
    print(f"⚠️  Cannot use /data directory: {e}")
    print("Using local directory for storage")
    DATA_DIR = Path(__file__).parent.parent / 'data'
    DATA_DIR.mkdir(exist_ok=True)
    print(f"✅ Using local storage at {DATA_DIR}")

# Parquet file paths
# (column schemas for the three parquet tables are defined in init_storage below)
CONTENT_ITEMS_PATH = DATA_DIR / 'content_items.parquet'        # fetched content (url, title, text, ...)
CLARITY_ANALYSES_PATH = DATA_DIR / 'clarity_analyses.parquet'  # analysis results keyed by content_hash
FETCH_LOGS_PATH = DATA_DIR / 'fetch_logs.parquet'              # per-run fetch bookkeeping
SOURCES_CONFIG_PATH = DATA_DIR / 'content_sources.json'        # source definitions (JSON, not parquet)


def _create_parquet_if_missing(path: Path, columns: list) -> None:
    """Create an empty parquet file with the given columns at *path*.

    No-op when the file already exists, so repeated initialization never
    clobbers stored data.
    """
    if not path.exists():
        pd.DataFrame(columns=columns).to_parquet(path, index=False)
        print(f"✅ Created {path.name}")


def init_storage():
    """Initialize storage with empty parquet files and default config.

    Idempotent: each file is created only if it does not already exist,
    so calling this repeatedly (e.g. on every import) is safe.
    """
    # Raw fetched content items
    _create_parquet_if_missing(CONTENT_ITEMS_PATH, [
        'url', 'content_hash', 'source_id', 'title', 'content_text',
        'content_html', 'published_at', 'fetched_at', 'category',
        'tags', 'is_processed', 'processing_error'
    ])

    # Clarity analysis results, keyed by content_hash
    _create_parquet_if_missing(CLARITY_ANALYSES_PATH, [
        'content_hash', 'overall_score', 'readability_score',
        'complexity_score', 'sentence_stats', 'vocabulary_stats',
        'readability_metrics', 'grammar_stats', 'jargon_count',
        'jargon_words', 'long_sentences_count', 'suggestions',
        'analyzer_version', 'analysis_date', 'processing_time_ms'
    ])

    # Per-run fetch bookkeeping
    _create_parquet_if_missing(FETCH_LOGS_PATH, [
        'source_id', 'fetch_start', 'fetch_end', 'items_fetched',
        'items_new', 'items_updated', 'items_failed', 'status', 'error_message'
    ])

    # Default content sources (JSON config), written only once so that any
    # user edits to the file survive later initializations
    if not SOURCES_CONFIG_PATH.exists():
        sources = [
            {
                'id': 1,
                'source_type': 'RSS',
                'source_name': 'Madrid City Council News Feed',
                'source_url': 'https://diario.madrid.es/feed',
                'is_active': True,
                'fetch_frequency_hours': 6
            },
            {
                'id': 2,
                'source_type': 'API',
                'source_name': 'Madrid Open Data Portal',
                'source_url': 'https://datos.madrid.es/portal/site/egob',
                'is_active': True,
                'fetch_frequency_hours': 24
            }
        ]
        with open(SOURCES_CONFIG_PATH, 'w', encoding='utf-8') as f:
            json.dump(sources, f, indent=2, ensure_ascii=False)
        print("✅ Created content_sources.json")

    print(f"✅ Storage initialized at {DATA_DIR}")


def get_sources() -> list:
    """Return the list of content sources from the JSON config file.

    Bootstraps storage first (creating the default config) when the
    config file is missing.
    """
    if not SOURCES_CONFIG_PATH.exists():
        init_storage()
    with SOURCES_CONFIG_PATH.open('r', encoding='utf-8') as fh:
        return json.load(fh)


# Initialize storage on module import
# NOTE(review): import-time side effect — importing this module creates
# files on disk. init_storage() is idempotent, but confirm callers expect
# filesystem writes at import time.
init_storage()