mirix committed on
Commit
679efc4
·
verified ·
1 Parent(s): 2239bca

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Russia_regions_geo_simplified.geoparquet filter=lfs diff=lfs merge=lfs -text
Russia_regions_data.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c87635401d364d438db96c1bcf0fbf513ab22a299b7b23c6082e6fac165b69d
3
+ size 60001
Russia_regions_geo_simplified.geoparquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5913a0c68df7f5bac24bd6f93adb4531493ba89a0576127a37fbc66a63bc4608
3
+ size 1039864
app.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import geopandas as gpd
4
+ import plotly.express as px
5
+ import numpy as np
6
+
7
# 1. Load Data
# ---------------------------------------------------------
try:
    df = pd.read_parquet('Russia_regions_data.parquet')
    gdf = gpd.read_parquet('Russia_regions_geo_simplified.geoparquet')
except Exception as e:
    # Deliberate best-effort boundary: report the problem and continue with
    # empty frames so the rest of the module can still be built.
    print(f"Error loading files: {e}")
    # Fallback dummy data
    df = pd.DataFrame(columns=['section', 'subsection', 'indicator_name', 'comment', 'year',
                               'indicator_value', 'indicator_unit', 'indicator_code',
                               'Region', 'object_oktmo', 'ISO'])
    gdf = gpd.GeoDataFrame(columns=['ISO', 'geometry'])

# Ensure geometries are in standard lat/lon.
# to_crs() raises ValueError on a frame that has no CRS (as the empty fallback
# above does), so reproject only when a source CRS is actually defined.
if getattr(gdf, 'crs', None) is not None:
    gdf = gdf.to_crs(epsg=4326)
22
+
23
# 2. Prepare the Table Data (The "Menu")
# ---------------------------------------------------------
# Column that links a table row back to the full dataset; it is not displayed
hidden_link_key = 'indicator_code'

# Columns shown in the table (indicator_value is deliberately left out)
display_columns = [
    'section',
    'subsection',
    'indicator_name',
    'comment',
    'year',
    'indicator_unit',
]

# One row per distinct indicator code (the first occurrence is kept)
df_unique = df.drop_duplicates(subset=[hidden_link_key]).copy()

# Table contents: display columns only, renumbered from zero
df_display = df_unique.loc[:, display_columns].reset_index(drop=True)
37
+
38
+ # Calculate optimal column widths based on content with min/max constraints
39
def calculate_column_widths(df, columns, *, char_px=8, min_px=100, max_px=400, max_chars=50):
    """Estimate a pixel width for each column from the length of its content.

    For every column the string lengths of all cells are measured; the
    midpoint between the longest and the mean length is capped at
    ``max_chars`` and then raised to at least the header length. That
    character count is scaled by ``char_px`` and clamped to
    ``[min_px, max_px]``.

    Args:
        df: DataFrame holding the table contents.
        columns: Column names to size, in display order.
        char_px: Approximate pixel width of one character.
        min_px: Smallest width returned for any column.
        max_px: Largest width returned for any column.
        max_chars: Cap applied to the content-derived character count
            (the header length itself is not capped by this value).

    Returns:
        A list of ``int`` pixel widths, one per entry of ``columns``.
    """
    widths = []
    for col in columns:
        header_len = len(str(col))
        if len(df) > 0:
            content_lengths = df[col].astype(str).str.len()
            # Midpoint of max and mean: long outliers widen the column, but
            # less than sizing purely by the longest cell would.
            blended_len = (content_lengths.max() + content_lengths.mean()) / 2
            final_len = max(header_len, min(blended_len, max_chars))
        else:
            # No rows: size by the header alone
            final_len = header_len

        widths.append(max(min_px, min(int(final_len * char_px), max_px)))

    return widths
62
+
63
# Pixel widths for the displayed columns, computed once at import time
column_widths = calculate_column_widths(df_display, display_columns)
64
+
65
+ # Create styling for smaller font
66
def create_styling(df):
    """Return a rows-by-columns grid of CSS strings, one per cell of ``df``.

    Every cell gets the same rule: a 10px font with wrapping enabled.
    """
    cell_css = "font-size: 10px; white-space: normal; word-wrap: break-word;"
    row_count, col_count = df.shape
    # A fresh inner list per row, so rows never alias each other
    return [[cell_css] * col_count for _ in range(row_count)]
73
+
74
# Per-cell CSS grid matching the shape of the displayed table
table_styling = create_styling(df_display)
75
+
76
+ # 3. Utility Functions
77
+ # ---------------------------------------------------------
78
def remove_outliers(series, lower_q=0.01, upper_q=0.99, whisker=1.5):
    """Replace extreme values with NaN using quantile-based fences.

    This is an IQR-style rule, but with the defaults the fences are built
    from the 1st and 99th percentiles rather than the quartiles, so only
    very extreme values are removed. Values outside
    ``[q_low - whisker * spread, q_high + whisker * spread]`` (where
    ``spread = q_high - q_low``) become NaN; everything else, including the
    index, is returned unchanged.

    Args:
        series: Numeric pandas Series.
        lower_q: Quantile used for the lower fence anchor.
        upper_q: Quantile used for the upper fence anchor.
        whisker: Multiple of the inter-quantile spread added beyond each anchor.

    Returns:
        A Series of the same length with outliers replaced by NaN.
    """
    q_low = series.quantile(lower_q)
    q_high = series.quantile(upper_q)
    spread = q_high - q_low
    lower_bound = q_low - whisker * spread
    upper_bound = q_high + whisker * spread
    # Bounds are inclusive; NaN inputs fail the comparison and stay NaN
    return series.where((series >= lower_bound) & (series <= upper_bound), np.nan)
86
+
87
def should_use_log_scale(values):
    """Decide whether a logarithmic colour scale suits ``values``.

    NaN entries are ignored. A log scale is recommended only when ALL of the
    following hold for the remaining values:

    1. every value is strictly positive,
    2. the ratio of the maximum to the minimum exceeds 100,
    3. the distribution is highly skewed (skewness > 2).

    Args:
        values: Numeric pandas Series (may contain NaN).

    Returns:
        ``True`` if a log scale should be used, otherwise ``False``.
    """
    clean_values = values.dropna()

    # Nothing to analyse, or log is undefined for zero/negative values
    if clean_values.empty or (clean_values <= 0).any():
        return False

    # The minimum is strictly positive here, so the division is safe
    ratio = clean_values.max() / clean_values.min()
    # skew() is NaN for fewer than three values; NaN > 2 is False
    skewness = clean_values.skew()

    return bool(ratio > 100 and skewness > 2)
119
+
120
def format_value(value):
    """Format a numeric value for the map tooltip.

    Whole numbers are shown without decimals; other numbers are rounded to at
    most three decimals with trailing zeros removed. Missing values (None/NaN)
    and non-finite values (+/-inf) are shown as "N/A".

    Args:
        value: A Python or NumPy number, or a missing value.

    Returns:
        The formatted string.
    """
    if pd.isna(value):
        return "N/A"
    if isinstance(value, (int, np.integer)):
        return f"{int(value)}"
    # int(inf) raises OverflowError, so infinities must be handled first
    if not np.isfinite(value):
        return "N/A"
    if value == int(value):
        return f"{int(value)}"

    text = f"{value:.3f}".rstrip('0').rstrip('.')
    # Tiny negatives round to "-0.000", which strips down to "-0"
    return "0" if text == "-0" else text
128
+
129
+ # 4. Define App Logic
130
+ # ---------------------------------------------------------
131
def update_map(select_data: gr.SelectData, current_table_data):
    """
    Build the choropleth map for the indicator whose table row was clicked.

    Args:
        select_data: Gradio selection event. ``select_data.index`` is either
            a ``[row, col]`` pair or a bare row index depending on version.
        current_table_data: Current contents of the indicator table. Only a
            pandas DataFrame is handled.

    Returns:
        ``None`` when there is no selection or the table is not a DataFrame;
        an empty, titled choropleth when the indicator cannot be resolved or
        no region joins the geometry; otherwise the map figure coloured by
        the (optionally log-scaled) indicator value.
    """
    if select_data is None:
        return None

    # Handle index format (it often comes as [row, col] or just row index)
    if isinstance(select_data.index, (list, tuple)):
        row_index = select_data.index[0]
    else:
        row_index = select_data.index

    # Get the row data from the visible table
    if not isinstance(current_table_data, pd.DataFrame):
        return None
    selected_row = current_table_data.iloc[row_index]

    # Resolve the clicked row back to its indicator via name + section
    match = df_unique[
        (df_unique['indicator_name'] == selected_row['indicator_name']) &
        (df_unique['section'] == selected_row['section'])
    ]

    if match.empty:
        return px.choropleth(title="Indicator not found")

    selected_code = match.iloc[0][hidden_link_key]
    selected_unit = match.iloc[0]['indicator_unit']

    # Filter main data for this indicator
    df_filtered = df[df['indicator_code'] == selected_code].copy()

    # Ensure one row per region (remove duplicates, keep first occurrence)
    df_filtered = df_filtered.drop_duplicates(subset=['ISO'], keep='first')

    # Remove outliers (replace with NaN)
    df_filtered['indicator_value_clean'] = remove_outliers(df_filtered['indicator_value'])

    # Rank descending (highest value = rank 1). Tied values share the same
    # rank instead of receiving arbitrary consecutive ranks, and regions whose
    # cleaned value is NaN are left unranked (NaN).
    df_filtered['Ranking'] = df_filtered['indicator_value_clean'].rank(
        method='min',
        ascending=False
    )

    # Decide if we should use log scale
    use_log = should_use_log_scale(df_filtered['indicator_value_clean'])

    # Create color scale values (log if needed)
    if use_log:
        df_filtered['color_value'] = np.log10(df_filtered['indicator_value_clean'])
        color_label = f"{selected_unit} (log scale)"
    else:
        df_filtered['color_value'] = df_filtered['indicator_value_clean']
        color_label = selected_unit

    # Merge with Geometry; regions without data are dropped by the inner join
    merged = gdf.merge(df_filtered, on='ISO', how='inner')

    if merged.empty:
        return px.choropleth(title="No data for this indicator")

    # Construct Map with divergent color scale
    fig = px.choropleth_map(
        merged,
        geojson=merged.geometry,
        locations=merged.index,
        color='color_value',
        color_continuous_scale="RdYlBu_r",  # Divergent color scale (red-yellow-blue reversed)
        map_style="satellite-streets",
        zoom=2,
        center={"lat": 60, "lon": 90},
        opacity=0.6,
        labels={'color_value': color_label}
    )

    # Format values for tooltip. The tooltip shows the raw indicator value,
    # not the outlier-cleaned or log-scaled one used for colouring.
    merged['formatted_value'] = merged['indicator_value'].apply(format_value)
    merged['formatted_ranking'] = merged['Ranking'].apply(lambda x: str(int(x)) if pd.notna(x) else "N/A")

    # Tooltip Configuration
    fig.update_traces(
        customdata=merged[['formatted_ranking', 'Region', 'indicator_name', 'formatted_value']],
        hovertemplate=(
            "<b>Rank:</b> %{customdata[0]}<br>"
            "<b>Region:</b> %{customdata[1]}<br>"
            "<b>Indicator Name:</b> %{customdata[2]}<br>"
            "<b>Indicator Value:</b> %{customdata[3]}"
            "<extra></extra>"
        )
    )

    fig.update_layout(
        margin={"r":0,"t":0,"l":0,"b":0},
        height=800  # Increased map height
    )

    return fig
239
+
240
+ # 5. Build Gradio UI
241
+ # ---------------------------------------------------------
242
# Page layout: a title, the map on top, the indicator table below it, and a
# footer with data attribution.
with gr.Blocks(title="Russian Regions Analytics") as demo:
    gr.Markdown("## Russian Regional Indicators")

    with gr.Row():
        # Output target for update_map
        map_plot = gr.Plot(label="Regional Distribution")

    with gr.Row():
        # Prepare table value with styling metadata
        table_value = {
            "data": df_display.values.tolist(),
            "headers": display_columns,
            "metadata": {
                "styling": table_styling
            }
        }

        # One datatype entry per display column; 'year' is the numeric one
        table = gr.DataFrame(
            value=table_value,
            label="Select an Indicator",
            datatype=["str", "str", "str", "str", "number", "str"],
            interactive=True,
            max_height=700,  # Increased table height
            column_widths=column_widths  # Smart column widths based on content
        )

    # Wire the selection event: clicking a cell passes the current table
    # contents to update_map and renders the returned figure in map_plot
    table.select(
        fn=update_map,
        inputs=[table],
        outputs=[map_plot]
    )

    # Footer with data sources
    gr.Markdown("""
    ---
    ### Data Sources & Attribution

    **Statistical Data:** [Tochno.st Regional Datasets](https://tochno.st/datasets/regions_collection)
    **Administrative Boundaries:** [geoBoundaries](https://www.geoboundaries.org/countryDownloads.html)
    **Regional Codes:** [Codes of Subjects of the Russian Federation](https://en.everybodywiki.com/Codes_of_subjects_of_the_Russian_Federation)
    **Translation Model:** [Tencent HY-MT1.5-7B](https://huggingface.co/tencent/HY-MT1.5-7B)
    """)

# Start the app only when this file is run as a script
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ lxml
4
+ geopandas
5
+ plotly
6
+ pyarrow
7
+ fastparquet
8
+ numpy