Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- .gitattributes +1 -0
- Russia_regions_data.parquet +3 -0
- Russia_regions_geo_simplified.geoparquet +3 -0
- app.py +286 -0
- requirements.txt +8 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Russia_regions_geo_simplified.geoparquet filter=lfs diff=lfs merge=lfs -text
|
Russia_regions_data.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c87635401d364d438db96c1bcf0fbf513ab22a299b7b23c6082e6fac165b69d
|
| 3 |
+
size 60001
|
Russia_regions_geo_simplified.geoparquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5913a0c68df7f5bac24bd6f93adb4531493ba89a0576127a37fbc66a63bc4608
|
| 3 |
+
size 1039864
|
app.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import geopandas as gpd
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
# 1. Load Data
# ---------------------------------------------------------
try:
    df = pd.read_parquet('Russia_regions_data.parquet')
    gdf = gpd.read_parquet('Russia_regions_geo_simplified.geoparquet')
except Exception as e:
    # Top-level boundary: report the problem and fall back to empty frames so
    # the UI can still start instead of dying at import time.
    print(f"Error loading files: {e}")
    # Fallback dummy data
    df = pd.DataFrame(columns=['section', 'subsection', 'indicator_name', 'comment', 'year',
                               'indicator_value', 'indicator_unit', 'indicator_code',
                               'Region', 'object_oktmo', 'ISO'])
    # The empty frame needs an explicit CRS: to_crs() below refuses to
    # reproject geometries that have no CRS, which would crash the fallback.
    gdf = gpd.GeoDataFrame(columns=['ISO', 'geometry'], geometry='geometry', crs='EPSG:4326')

# Ensure geometries are in standard lat/lon
gdf = gdf.to_crs(epsg=4326)
|
| 22 |
+
|
| 23 |
+
# 2. Prepare the Table Data (The "Menu")
# ---------------------------------------------------------
# Column that ties a table row back to its indicator; it is not displayed.
hidden_link_key = 'indicator_code'

# Columns shown in the table (indicator_value is left out of the display).
display_columns = [
    'section',
    'subsection',
    'indicator_name',
    'comment',
    'year',
    'indicator_unit',
]

# Keep the first row seen for every distinct indicator_code.
df_unique = df.drop_duplicates(subset=['indicator_code']).copy()

# The frame rendered in the UI: display columns only, with a fresh 0..n-1 index.
df_display = df_unique.loc[:, display_columns].reset_index(drop=True)
|
| 37 |
+
|
| 38 |
+
# Calculate optimal column widths based on content with min/max constraints
|
| 39 |
+
def calculate_column_widths(df, columns, max_chars=50, px_per_char=8,
                            min_px=100, max_px=400):
    """Estimate a pixel width for each column from its header and content.

    For every column the "typical" content length is taken as the midpoint
    between the longest and the mean stringified cell length, capped at
    ``max_chars``. The header length acts as a lower bound (and is the only
    input when ``df`` has no rows). The resulting character count is scaled
    by ``px_per_char`` and clamped to ``[min_px, max_px]``.

    Args:
        df: DataFrame holding the columns to measure.
        columns: Iterable of column labels, in the order widths are wanted.
        max_chars: Cap applied to the content-derived character count.
        px_per_char: Approximate rendered width of one character, in pixels.
        min_px: Smallest width returned for any column.
        max_px: Largest width returned for any column.

    Returns:
        A list of int pixel widths, one per entry in ``columns``.
    """
    has_rows = len(df) > 0  # loop-invariant, so evaluated once
    widths = []
    for col in columns:
        target_len = len(str(col))
        if has_rows:
            lengths = df[col].astype(str).str.len()
            # Midpoint of max and mean keeps one very long cell from
            # dictating the width of the whole column.
            blended = (lengths.max() + lengths.mean()) / 2
            target_len = max(target_len, min(blended, max_chars))

        widths.append(max(min_px, min(int(target_len * px_per_char), max_px)))

    return widths
|
| 62 |
+
|
| 63 |
+
# Pixel width for each displayed column, in display_columns order.
column_widths = calculate_column_widths(df_display, display_columns)
|
| 64 |
+
|
| 65 |
+
# Create styling for smaller font
|
| 66 |
+
def create_styling(df):
    """Build a per-cell CSS grid that applies the same compact style everywhere.

    Returns a list with one inner list per row of ``df``; each inner list
    holds one CSS string per column.
    """
    cell_css = "font-size: 10px; white-space: normal; word-wrap: break-word;"
    row_count, col_count = df.shape
    # A fresh inner list per row, so rows never alias each other.
    return [[cell_css] * col_count for _ in range(row_count)]
|
| 73 |
+
|
| 74 |
+
# One CSS string per cell of df_display (rows x columns).
table_styling = create_styling(df_display)
|
| 75 |
+
|
| 76 |
+
# 3. Utility Functions
|
| 77 |
+
# ---------------------------------------------------------
|
| 78 |
+
def remove_outliers(series, lower_q=0.01, upper_q=0.99, k=1.5):
    """Replace extreme values with NaN using a quantile-range fence.

    This is an IQR-style rule, but by default the fence is anchored on the
    1st and 99th percentiles rather than the quartiles, so only values far
    outside the bulk of the data are dropped. A value is kept when it lies in
    ``[q_low - k * spread, q_high + k * spread]`` where
    ``spread = q_high - q_low``.

    Args:
        series: Numeric pandas Series to filter.
        lower_q: Quantile used as the lower anchor (0.25 gives classic IQR).
        upper_q: Quantile used as the upper anchor (0.75 gives classic IQR).
        k: Multiplier applied to the quantile spread to build the fence.

    Returns:
        A Series of the same length and index, with out-of-fence values
        replaced by NaN. Existing NaN values stay NaN.
    """
    q_low = series.quantile(lower_q)
    q_high = series.quantile(upper_q)
    spread = q_high - q_low
    lower_bound = q_low - k * spread
    upper_bound = q_high + k * spread
    return series.where((series >= lower_bound) & (series <= upper_bound), np.nan)
|
| 86 |
+
|
| 87 |
+
def should_use_log_scale(values):
    """Decide heuristically whether a logarithmic color scale fits the data.

    NaN values are ignored. Log scale is chosen only when all of these hold:
        1. Every remaining value is strictly positive.
        2. The ratio of max to min is greater than 100.
        3. The distribution is highly skewed (skewness > 2).

    Args:
        values: pandas Series of numeric values (may contain NaN).

    Returns:
        A plain ``bool``. ``False`` for empty or all-NaN input, and also when
        the skewness is undefined (NaN), since ``NaN > 2`` is false.
    """
    clean_values = values.dropna()

    if clean_values.empty:
        return False

    # log10 is undefined for zero and negatives. This also guarantees the
    # minimum is non-zero, so the ratio below cannot divide by zero.
    if (clean_values <= 0).any():
        return False

    ratio = clean_values.max() / clean_values.min()
    skewness = clean_values.skew()

    # bool() so callers get a Python bool rather than numpy.bool_.
    return bool(ratio > 100 and skewness > 2)
|
| 119 |
+
|
| 120 |
+
def format_value(value):
    """Format a value for the map tooltip.

    Missing values become ``"N/A"``. Whole numbers are shown without
    decimals. Other numbers are shown with at most three decimals and
    trailing zeros stripped. Values that cannot be read as a number, and
    infinite values, are returned as their plain string form instead of
    raising.

    Args:
        value: A scalar (int, float, numpy scalar, None/NaN, or other).

    Returns:
        The formatted string.
    """
    if pd.isna(value):
        return "N/A"
    if isinstance(value, (int, np.integer)):
        return f"{int(value)}"

    try:
        number = float(value)
    except (TypeError, ValueError):
        # Not numeric at all: show it as-is rather than crash the tooltip.
        return str(value)

    # int(inf) raises OverflowError, so non-finite values are handled first.
    if not np.isfinite(number):
        return str(number)

    if number == int(number):
        return f"{int(number)}"

    text = f"{number:.3f}".rstrip('0').rstrip('.')
    # Tiny negatives round to "-0.000", which would otherwise strip to "-0".
    return "0" if text == "-0" else text
|
| 128 |
+
|
| 129 |
+
# 4. Define App Logic
|
| 130 |
+
# ---------------------------------------------------------
|
| 131 |
+
def _find_indicator_row(selected_row):
    """Return the df_unique row matching a clicked table row, or None."""
    candidates = df_unique[
        (df_unique['indicator_name'] == selected_row['indicator_name']) &
        (df_unique['section'] == selected_row['section'])
    ]

    # Name + section need not be unique per indicator_code. Narrow further by
    # the other displayed columns, but only when doing so still leaves a
    # match (NaN or type-mismatched cells compare unequal and are skipped).
    for col in ('subsection', 'year', 'indicator_unit'):
        if len(candidates) <= 1:
            break
        if col not in selected_row.index:
            continue
        narrowed = candidates[candidates[col] == selected_row[col]]
        if not narrowed.empty:
            candidates = narrowed

    if candidates.empty:
        return None
    return candidates.iloc[0]


def update_map(select_data: gr.SelectData, current_table_data):
    """Build the choropleth for the indicator whose table row was clicked.

    Triggered when a cell in the table is clicked. ``select_data.index`` is
    a (row, col) pair or a bare row index depending on the Gradio version.

    Steps: resolve the clicked row to an indicator code, keep one value per
    region (first occurrence per ISO), blank out outliers for coloring and
    ranking, optionally switch the color values to log10, join with the
    region geometries and draw the map with a ranking/value tooltip.

    Args:
        select_data: Gradio selection event payload, or None.
        current_table_data: Current content of the table; expected to be a
            pandas DataFrame whose columns include 'indicator_name' and
            'section'.

    Returns:
        A Plotly figure, or None when there is no selection or the table
        data is not a DataFrame. A titled empty figure is returned when the
        indicator cannot be resolved or has no matching geometry.
    """
    if select_data is None:
        return None

    # Handle index format (it often comes as [row, col] or just row index)
    if isinstance(select_data.index, (list, tuple)):
        row_index = select_data.index[0]
    else:
        row_index = select_data.index

    if not isinstance(current_table_data, pd.DataFrame):
        return None

    # A stale event can point past the end of the current table.
    if not 0 <= row_index < len(current_table_data):
        return px.choropleth(title="Indicator not found")
    selected_row = current_table_data.iloc[row_index]

    indicator = _find_indicator_row(selected_row)
    if indicator is None:
        return px.choropleth(title="Indicator not found")

    selected_code = indicator[hidden_link_key]
    selected_unit = indicator['indicator_unit']

    # Filter main data for this indicator
    df_filtered = df[df['indicator_code'] == selected_code].copy()

    # Ensure one row per region (remove duplicates, keep first occurrence)
    df_filtered = df_filtered.drop_duplicates(subset=['ISO'], keep='first')

    # Outliers become NaN: they get no color and no rank, but the tooltip
    # still shows their raw indicator_value.
    df_filtered['indicator_value_clean'] = remove_outliers(df_filtered['indicator_value'])

    # Calculate Rankings (Desc: Higher Value = Rank 1) - excluding NaN values
    ranked = df_filtered.dropna(subset=['indicator_value_clean']).copy()
    ranked = ranked.sort_values('indicator_value_clean', ascending=False).reset_index(drop=True)
    ranked['Ranking'] = range(1, len(ranked) + 1)

    # Merge rankings back
    df_filtered = df_filtered.merge(ranked[['ISO', 'Ranking']], on='ISO', how='left')

    # Decide if we should use log scale
    use_log = should_use_log_scale(df_filtered['indicator_value_clean'])

    # Create color scale values (log if needed)
    if use_log:
        df_filtered['color_value'] = np.log10(df_filtered['indicator_value_clean'])
        color_label = f"{selected_unit} (log scale)"
    else:
        df_filtered['color_value'] = df_filtered['indicator_value_clean']
        color_label = selected_unit

    # Merge with Geometry
    merged = gdf.merge(df_filtered, on='ISO', how='inner')

    if merged.empty:
        return px.choropleth(title="No data for this indicator")

    # Format values for tooltip
    merged['formatted_value'] = merged['indicator_value'].apply(format_value)
    merged['formatted_ranking'] = merged['Ranking'].apply(
        lambda x: str(int(x)) if pd.notna(x) else "N/A"
    )

    # Construct Map with divergent color scale
    fig = px.choropleth_map(
        merged,
        geojson=merged.geometry,
        locations=merged.index,
        color='color_value',
        color_continuous_scale="RdYlBu_r",  # Divergent color scale (red-yellow-blue reversed)
        map_style="satellite-streets",
        zoom=2,
        center={"lat": 60, "lon": 90},
        opacity=0.6,
        labels={'color_value': color_label}
    )

    if use_log:
        # The color axis holds log10 exponents; label the ticks with the
        # real values so the legend matches the unit in its title.
        exponents = merged['color_value'].dropna()
        if not exponents.empty:
            ticks = np.arange(np.floor(exponents.min()), np.ceil(exponents.max()) + 1)
            fig.update_coloraxes(
                colorbar_tickvals=ticks.tolist(),
                colorbar_ticktext=[f"{10.0 ** t:g}" for t in ticks],
            )

    # Tooltip Configuration
    fig.update_traces(
        # Plain 2-D array, one row per region in the same order as `merged`.
        customdata=merged[['formatted_ranking', 'Region', 'indicator_name', 'formatted_value']].to_numpy(),
        hovertemplate=(
            "<b>Rank:</b> %{customdata[0]}<br>"
            "<b>Region:</b> %{customdata[1]}<br>"
            "<b>Indicator Name:</b> %{customdata[2]}<br>"
            "<b>Indicator Value:</b> %{customdata[3]}"
            "<extra></extra>"
        )
    )

    fig.update_layout(
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        height=800  # Increased map height
    )

    return fig
|
| 239 |
+
|
| 240 |
+
# 5. Build Gradio UI
# ---------------------------------------------------------
with gr.Blocks(title="Russian Regions Analytics") as demo:
    gr.Markdown("## Russian Regional Indicators")

    with gr.Row():
        map_plot = gr.Plot(label="Regional Distribution")

    with gr.Row():
        # Prepare table value with styling metadata
        table_value = {
            "data": df_display.values.tolist(),
            "headers": display_columns,
            "metadata": {
                "styling": table_styling
            }
        }

        table = gr.DataFrame(
            value=table_value,
            label="Select an Indicator",
            datatype=["str", "str", "str", "str", "number", "str"],
            # The table is a selection menu, not an editor: editable cells
            # would let users change the names update_map matches on, and
            # custom cell styling is only honored for static tables.
            interactive=False,
            max_height=700,  # Increased table height
            column_widths=column_widths  # Smart column widths based on content
        )

    # Wire the selection event: clicking a row redraws the map.
    table.select(
        fn=update_map,
        inputs=[table],
        outputs=[map_plot]
    )

    # Footer with data sources
    gr.Markdown("""
    ---
    ### Data Sources & Attribution

    **Statistical Data:** [Tochno.st Regional Datasets](https://tochno.st/datasets/regions_collection)
    **Administrative Boundaries:** [geoBoundaries](https://www.geoboundaries.org/countryDownloads.html)
    **Regional Codes:** [Codes of Subjects of the Russian Federation](https://en.everybodywiki.com/Codes_of_subjects_of_the_Russian_Federation)
    **Translation Model:** [Tencent HY-MT1.5-7B](https://huggingface.co/tencent/HY-MT1.5-7B)
    """)

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
pandas
|
| 3 |
+
lxml
|
| 4 |
+
geopandas
|
| 5 |
+
plotly
|
| 6 |
+
pyarrow
|
| 7 |
+
fastparquet
|
| 8 |
+
numpy
|