File size: 12,378 Bytes
679efc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c18cec
 
679efc4
 
 
 
 
 
 
 
 
 
6c18cec
679efc4
 
6c18cec
679efc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c18cec
 
679efc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c18cec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
679efc4
 
 
 
 
 
6c18cec
679efc4
 
 
 
 
 
 
 
 
 
 
 
 
6c18cec
679efc4
 
 
 
 
 
 
 
 
 
 
 
6c18cec
679efc4
 
 
 
 
 
 
6c18cec
 
 
 
 
679efc4
6c18cec
 
 
679efc4
 
 
 
 
 
 
 
 
 
6c18cec
679efc4
 
 
6c18cec
679efc4
 
6c18cec
679efc4
 
 
 
 
 
 
 
6c18cec
 
 
 
 
 
 
 
 
679efc4
 
 
 
 
6c18cec
679efc4
 
 
 
6c18cec
679efc4
 
 
6c18cec
679efc4
 
 
 
 
 
 
 
 
6c18cec
679efc4
 
 
 
 
 
6c18cec
679efc4
 
 
 
 
 
 
 
 
 
6c18cec
 
679efc4
 
 
 
 
 
 
 
 
 
 
 
 
 
6c18cec
679efc4
6c18cec
 
679efc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
import gradio as gr
import pandas as pd
import geopandas as gpd
import plotly.express as px
import numpy as np

# 1. Load Data
# ---------------------------------------------------------
# Both files are read in a single try-block, so a failure of either one
# replaces BOTH frames with empty placeholders; the app then still starts
# and shows "not found" figures instead of crashing at import time.
try:
    df = pd.read_parquet('Russia_regions_data.parquet')
    gdf = gpd.read_parquet('Russia_regions_geo_simplified.geoparquet')
except Exception as e:
    print(f"Error loading files: {e}")
    # Fallback dummy data
    df = pd.DataFrame(columns=['section', 'subsection', 'indicator_name', 'comment', 'year', 
                               'indicator_value', 'rel_indicator_value', 'indicator_unit', 'positive',
                               'indicator_code', 'Region', 'object_oktmo', 'ISO'])
    # The placeholder must carry an active geometry column and a CRS:
    # the to_crs() call below cannot reproject geometries that have no CRS.
    gdf = gpd.GeoDataFrame(columns=['ISO', 'geometry'], geometry='geometry', crs='EPSG:4326')

# Ensure geometries are in standard lat/lon
gdf = gdf.to_crs(epsg=4326)

# 2. Prepare the Table Data (The "Menu")
# ---------------------------------------------------------
# One representative row per indicator: the first occurrence of each indicator_code.
df_unique = df.drop_duplicates(subset='indicator_code').copy()

# Column that links a table row back to the full dataset; it is not displayed.
hidden_link_key = 'indicator_code'

# Columns shown in the selection table, in display order.
# 'positive' stays available on df_unique but is left out of the visible table.
display_columns = [
    'section',
    'subsection',
    'indicator_name',
    'comment',
    'year',
    'indicator_value',
    'rel_indicator_value',
    'indicator_unit',
]

# Visible table: the selected columns only, re-indexed from 0.
df_display = df_unique.loc[:, display_columns].reset_index(drop=True)

# Calculate optimal column widths based on content with min/max constraints
def calculate_column_widths(df, columns):
    """Return one pixel width per entry of ``columns``, sized from the text in ``df``.

    For a non-empty frame the character estimate of a column is the midpoint
    between its longest and its mean stringified cell length, capped at 50
    characters, but never less than the header length. For an empty frame the
    header length alone is used. The estimate is converted at 8 px per
    character and clamped to the range 100-400 px.
    """
    def _pixel_width(col):
        chars = len(str(col))
        if len(df) > 0:
            cell_lengths = df[col].astype(str).str.len()
            blended = (cell_lengths.max() + cell_lengths.mean()) / 2
            # Long outlier cells must not blow the column up: cap at 50 chars
            chars = max(chars, min(blended, 50))
        return max(100, min(int(chars * 8), 400))

    return [_pixel_width(col) for col in columns]

# Pixel widths for the visible table columns, computed once at import time
# from the display data prepared above.
column_widths = calculate_column_widths(df_display, display_columns)

# Create styling for smaller font
def create_styling(df):
    """Build a rows x columns grid of CSS strings, one identical entry per cell of ``df``.

    Every cell gets the same small-font, wrapping style. Each row is a
    separate list object.
    """
    cell_css = "font-size: 10px; white-space: normal; word-wrap: break-word;"
    row_count = len(df)
    col_count = len(df.columns)

    grid = []
    for _ in range(row_count):
        grid.append([cell_css] * col_count)
    return grid

# Per-cell CSS grid matching the shape of the visible table.
table_styling = create_styling(df_display)

# 3. Utility Functions
# ---------------------------------------------------------
def remove_outliers(series, *, lower_q=0.01, upper_q=0.99, k=2):
    """Replace extreme values of ``series`` with NaN using a quantile-range fence.

    The fence is ``[q_low - k * spread, q_high + k * spread]`` where
    ``q_low``/``q_high`` are the ``lower_q``/``upper_q`` quantiles and
    ``spread = q_high - q_low``. With the defaults this is the 1st-99th
    percentile range widened by twice its width on each side, which is much
    more permissive than the classic IQR rule. Pass
    ``lower_q=0.25, upper_q=0.75, k=1.5`` for the textbook IQR fence.

    Args:
        series: Numeric pandas Series.
        lower_q: Quantile used as the lower anchor of the fence.
        upper_q: Quantile used as the upper anchor of the fence.
        k: Multiple of the quantile spread added beyond each anchor.

    Returns:
        A Series with the same index in which values outside the fence are
        NaN. Values that were already NaN stay NaN.
    """
    low_anchor = series.quantile(lower_q)
    high_anchor = series.quantile(upper_q)
    spread = high_anchor - low_anchor

    lower_bound = low_anchor - k * spread
    upper_bound = high_anchor + k * spread

    # between() is inclusive on both ends and False for NaN, matching
    # (series >= lower_bound) & (series <= upper_bound).
    return series.where(series.between(lower_bound, upper_bound))

def should_use_log_scale(values):
    """Decide heuristically whether ``values`` should be colored on a log scale.

    NaN entries are ignored. The answer is True only when ALL of the
    following hold for the remaining values:

    1. there is at least one value and every value is strictly positive;
    2. max / min is greater than 100;
    3. the skewness reported by ``Series.skew()`` is greater than 2.

    Args:
        values: Numeric pandas Series (may contain NaN).

    Returns:
        A plain Python ``bool``.
    """
    clean_values = values.dropna()

    if clean_values.empty:
        return False

    # A log transform is undefined for zero/negative values. This check also
    # guarantees the minimum is > 0, so the ratio below cannot divide by zero.
    if (clean_values <= 0).any():
        return False

    ratio = clean_values.max() / clean_values.min()
    skewness = clean_values.skew()

    # A NaN skewness (too few points or no variance) compares False here.
    return bool(ratio > 100 and skewness > 2)

def format_value(value):
    """Format a number for the map tooltip.

    Args:
        value: A Python or numpy number, or a missing value.

    Returns:
        "N/A" for missing values (as judged by ``pd.isna``); the integer
        digits for integers and integer-valued floats; otherwise the value
        rounded to at most 3 decimals with trailing zeros removed. Infinite
        values are rendered as "inf" / "-inf".
    """
    if pd.isna(value):
        return "N/A"
    if isinstance(value, (int, np.integer)):
        return f"{int(value)}"

    number = float(value)
    # is_integer() is False for +/-inf, so int() is never called on an
    # infinite value (which would raise OverflowError).
    if number.is_integer():
        return f"{int(number)}"

    text = f"{number:.3f}".rstrip('0').rstrip('.')
    # Tiny negatives round to "-0.000", which strips down to "-0".
    return "0" if text == "-0" else text

# 4. Define App Logic
# ---------------------------------------------------------
def create_ranking_map():
    """Build the default figure: a choropleth of the 'OVERALL_RANKING' indicator.

    Reads the module-level ``df`` and ``gdf``. Returns an empty, titled
    ``px.choropleth`` figure when the indicator has no rows, or when none of
    its rows match a geometry on 'ISO'.
    """
    is_ranking_row = df['indicator_code'] == 'OVERALL_RANKING'
    ranking = df[is_ranking_row].copy()
    if ranking.empty:
        return px.choropleth(title="Overall Ranking not found")

    # Keep the first row per region and expose the relative value as the color column
    ranking = ranking.drop_duplicates(subset=['ISO'], keep='first')
    ranking['color_value'] = ranking['rel_indicator_value']

    # Attach geometries; regions without a matching ISO on either side are dropped
    regions = gdf.merge(ranking, on='ISO', how='inner')
    if regions.empty:
        return px.choropleth(title="No ranking data available")

    map_options = {
        'geojson': regions.geometry,
        'locations': regions.index,
        'color': 'color_value',
        # Reversed RdYlBu: low values are blue, high values are red
        'color_continuous_scale': "RdYlBu_r",
        'map_style': "satellite-streets",
        'zoom': 2,
        'center': {"lat": 60, "lon": 90},
        'opacity': 0.6,
        'labels': {'color_value': 'Overall Ranking'},
    }
    fig = px.choropleth_map(regions, **map_options)

    # Tooltip text is pre-formatted so integers show without decimals
    regions['formatted_value'] = regions['rel_indicator_value'].apply(format_value)
    hover_text = (
        "<b>Region:</b> %{customdata[0]}<br>"
        "<b>Overall Ranking:</b> %{customdata[1]}"
        "<extra></extra>"
    )
    fig.update_traces(
        customdata=regions[['Region', 'formatted_value']],
        hovertemplate=hover_text,
    )

    fig.update_layout(margin=dict(r=0, t=0, l=0, b=0), height=800)
    return fig

def update_map(select_data: gr.SelectData, current_table_data):
    """Redraw the choropleth for the indicator whose table row was clicked.

    Args:
        select_data: Gradio selection event. ``select_data.index`` is read
            either as ``[row, col]`` or as a bare row index.
        current_table_data: Current value of the table component. Only a
            ``pandas.DataFrame`` is handled.

    Returns:
        A Plotly figure. This is the indicator map on success; the overall
        ranking map when there is no selection, the table value is not a
        DataFrame, or the row index is out of range; and an empty titled
        choropleth when the indicator or its regional data cannot be found.
    """
    if select_data is None:
        return create_ranking_map()

    # Handle index format (it often comes as [row, col] or just row index)
    if isinstance(select_data.index, (list, tuple)):
        row_index = select_data.index[0]
    else:
        row_index = select_data.index

    # Get the row data from the visible table
    if not isinstance(current_table_data, pd.DataFrame):
        return create_ranking_map()
    # A stale selection can point past the end of the current table;
    # fall back to the default map instead of raising IndexError.
    if not 0 <= row_index < len(current_table_data):
        return create_ranking_map()
    selected_row = current_table_data.iloc[row_index]

    # Find the corresponding unique indicator code. The visible table has no
    # indicator_code column, so the row is matched on name + section.
    match = df_unique[
        (df_unique['indicator_name'] == selected_row['indicator_name']) &
        (df_unique['section'] == selected_row['section'])
    ]

    if match.empty:
        return px.choropleth(title="Indicator not found")

    indicator_row = match.iloc[0]
    selected_code = indicator_row[hidden_link_key]
    selected_unit = indicator_row['indicator_unit']
    selected_positive = indicator_row['positive']

    # Filter main data for this indicator
    df_filtered = df[df['indicator_code'] == selected_code].copy()

    # Ensure one row per region (remove duplicates, keep first occurrence)
    df_filtered = df_filtered.drop_duplicates(subset=['ISO'], keep='first')

    # Remove outliers (replace with NaN) - use rel_indicator_value
    df_filtered['value_clean'] = remove_outliers(df_filtered['rel_indicator_value'])

    # Rank only the regions that still have a value after outlier removal
    df_filtered_for_ranking = df_filtered.dropna(subset=['value_clean']).copy()

    # 'P' means higher is better, so the largest value gets rank 1;
    # any other flag means lower is better.
    ascending = (selected_positive != 'P')
    df_filtered_for_ranking = df_filtered_for_ranking.sort_values(
        'value_clean', ascending=ascending
    ).reset_index(drop=True)
    df_filtered_for_ranking['Ranking'] = range(1, len(df_filtered_for_ranking) + 1)

    # Merge rankings back; regions without a rank get NaN
    df_filtered = df_filtered.merge(
        df_filtered_for_ranking[['ISO', 'Ranking']],
        on='ISO',
        how='left'
    )

    # Colorbar title: the indicator's unit, or a neutral word when it is missing
    unit_text = selected_unit if pd.notna(selected_unit) else "Value"

    # Create color scale values (log if needed)
    if should_use_log_scale(df_filtered['value_clean']):
        df_filtered['color_value'] = np.log10(df_filtered['value_clean'])
        color_label = f"{unit_text} (log scale)"
    else:
        df_filtered['color_value'] = df_filtered['value_clean']
        color_label = unit_text

    # Merge with Geometry
    merged = gdf.merge(df_filtered, on='ISO', how='inner')

    if merged.empty:
        return px.choropleth(title="No data for this indicator")

    # Pick the scale so that blue always marks the "good" end:
    #   'P' (higher is better): RdYlBu   -> low = red (bad),   high = blue (good)
    #   otherwise:              RdYlBu_r -> low = blue (good), high = red (bad)
    if selected_positive == 'P':
        color_scale = "RdYlBu"
    else:
        color_scale = "RdYlBu_r"

    # Construct Map with appropriate color scale
    fig = px.choropleth_map(
        merged,
        geojson=merged.geometry,
        locations=merged.index,
        color='color_value',
        color_continuous_scale=color_scale,
        map_style="satellite-streets",
        zoom=2,
        center={"lat": 60, "lon": 90},
        opacity=0.6,
        # Title the colorbar with the indicator's unit, not the ranking label
        labels={'color_value': color_label}
    )

    # Format values for tooltip (the tooltip shows the raw relative value,
    # including values that were masked as outliers for coloring)
    merged['formatted_value'] = merged['rel_indicator_value'].apply(format_value)
    merged['formatted_ranking'] = merged['Ranking'].apply(lambda x: str(int(x)) if pd.notna(x) else "N/A")

    # Tooltip Configuration
    fig.update_traces(
        customdata=merged[['formatted_ranking', 'Region', 'indicator_name', 'formatted_value']],
        hovertemplate=(
            "<b>Rank:</b> %{customdata[0]}<br>"
            "<b>Region:</b> %{customdata[1]}<br>"
            "<b>Indicator Name:</b> %{customdata[2]}<br>"
            "<b>Relative Value:</b> %{customdata[3]}"
            "<extra></extra>"
        )
    )

    fig.update_layout(
        margin={"r":0,"t":0,"l":0,"b":0},
        height=800
    )

    return fig

# 5. Build Gradio UI
# ---------------------------------------------------------
# Page layout, top to bottom: heading, map row, indicator table row, footer.
# Components are created in this order inside the Blocks context.
with gr.Blocks(title="Russian Regions Analytics") as demo:
    gr.Markdown("## Russian Regional Indicators")
    
    with gr.Row():
        # Initialize with ranking map
        map_plot = gr.Plot(label="Regional Distribution", value=create_ranking_map())
    
    with gr.Row():
        # Prepare table value with styling metadata:
        # rows as plain lists, the visible headers, and the per-cell CSS grid
        table_value = {
            "data": df_display.values.tolist(),
            "headers": display_columns,
            "metadata": {
                "styling": table_styling
            }
        }
        
        # One datatype entry per display column, in the same order as display_columns
        table = gr.DataFrame(
            value=table_value, 
            label="Select an Indicator",
            datatype=["str", "str", "str", "str", "number", "number", "number", "str"],
            interactive=True,
            max_height=700,
            column_widths=column_widths
        )

    # Wire the selection event: update_map gets the table's current value as
    # input and the figure it returns is written to map_plot
    table.select(
        fn=update_map,
        inputs=[table],
        outputs=[map_plot]
    )
    
    # Footer with data sources
    gr.Markdown("""
    ---
    ### Data Sources & Attribution
    
    **Statistical Data:** [Tochno.st Regional Datasets](https://tochno.st/datasets/regions_collection)  
    **Administrative Boundaries:** [geoBoundaries](https://www.geoboundaries.org/countryDownloads.html)  
    **Regional Codes:** [Codes of Subjects of the Russian Federation](https://en.everybodywiki.com/Codes_of_subjects_of_the_Russian_Federation)  
    **Translation Model:** [Tencent HY-MT1.5-7B](https://huggingface.co/tencent/HY-MT1.5-7B)
    """)

# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()