Spaces:

mirix
/

RussianRegions

Sleeping

App Files Community

mirix committed on Jan 15

Commit

679efc4

verified ·

1 Parent(s): 2239bca

Upload 4 files

Browse files

Files changed (5) hide show

.gitattributes +1 -0
Russia_regions_data.parquet +3 -0
Russia_regions_geo_simplified.geoparquet +3 -0
app.py +286 -0
requirements.txt +8 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Russia_regions_geo_simplified.geoparquet filter=lfs diff=lfs merge=lfs -text

Russia_regions_data.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c87635401d364d438db96c1bcf0fbf513ab22a299b7b23c6082e6fac165b69d
+size 60001

Russia_regions_geo_simplified.geoparquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5913a0c68df7f5bac24bd6f93adb4531493ba89a0576127a37fbc66a63bc4608
+size 1039864

app.py ADDED Viewed

	@@ -0,0 +1,286 @@

+import gradio as gr
+import pandas as pd
+import geopandas as gpd
+import plotly.express as px
+import numpy as np
+# 1. Load Data
+# ---------------------------------------------------------
+try:
+    df = pd.read_parquet('Russia_regions_data.parquet')
+    gdf = gpd.read_parquet('Russia_regions_geo_simplified.geoparquet')
+except Exception as e:
+    print(f"Error loading files: {e}")
+    # Fallback dummy data
+    df = pd.DataFrame(columns=['section', 'subsection', 'indicator_name', 'comment', 'year',
+                               'indicator_value', 'indicator_unit', 'indicator_code',
+                               'Region', 'object_oktmo', 'ISO'])
+    gdf = gpd.GeoDataFrame(columns=['ISO', 'geometry'])
+# Ensure geometries are in standard lat/lon
+gdf = gdf.to_crs(epsg=4326)
+# 2. Prepare the Table Data (The "Menu")
+# ---------------------------------------------------------
+# Filter for unique indicator_code
+df_unique = df.drop_duplicates(subset=['indicator_code']).copy()
+# Define columns (removed indicator_value from display)
+display_columns = [
+    'section', 'subsection', 'indicator_name', 'comment',
+    'year', 'indicator_unit'
+]
+hidden_link_key = 'indicator_code'
+# Create the display dataframe
+df_display = df_unique[display_columns].reset_index(drop=True)
+# Calculate optimal column widths based on content with min/max constraints
+def calculate_column_widths(df, columns):
+    """Calculate column widths based on content length with constraints"""
+    widths = []
+    for col in columns:
+        # Get max of header length and content lengths
+        header_len = len(str(col))
+        if len(df) > 0:
+            content_lengths = df[col].astype(str).str.len()
+            max_content_len = content_lengths.max()
+            avg_content_len = content_lengths.mean()
+            # Use average of max and mean, bounded by min/max
+            optimal_len = (max_content_len + avg_content_len) / 2
+            final_len = max(header_len, min(optimal_len, 50))  # Cap at 50 chars
+        else:
+            final_len = header_len
+        # Scale to pixel width (roughly 8 pixels per character)
+        # Minimum width of 100px, maximum of 400px
+        width = max(100, min(int(final_len * 8), 400))
+        widths.append(width)
+    return widths
+column_widths = calculate_column_widths(df_display, display_columns)
+# Create styling for smaller font
+def create_styling(df):
+    """Create styling array for smaller font"""
+    num_rows = len(df)
+    num_cols = len(df.columns)
+    # Apply small font style to all cells
+    styling = [["font-size: 10px; white-space: normal; word-wrap: break-word;" for _ in range(num_cols)] for _ in range(num_rows)]
+    return styling
+table_styling = create_styling(df_display)
+# 3. Utility Functions
+# ---------------------------------------------------------
+def remove_outliers(series):
+    """Remove outliers using IQR method, replacing them with NaN"""
+    Q1 = series.quantile(0.01)
+    Q3 = series.quantile(0.99)
+    IQR = Q3 - Q1
+    lower_bound = Q1 - 1.5 * IQR
+    upper_bound = Q3 + 1.5 * IQR
+    return series.where((series >= lower_bound) & (series <= upper_bound), np.nan)
+def should_use_log_scale(values):
+    """
+    Heuristic to decide if logarithmic scale should be used.
+    Returns True if:
+    1. All values are positive
+    2. The ratio of max to min is greater than 100
+    3. The distribution is highly skewed (skewness > 2)
+    """
+    # Remove NaN values for analysis
+    clean_values = values.dropna()
+    if len(clean_values) == 0:
+        return False
+    # Check if all values are positive
+    if (clean_values <= 0).any():
+        return False
+    # Check ratio of max to min
+    max_val = clean_values.max()
+    min_val = clean_values.min()
+    if min_val == 0:
+        return False
+    ratio = max_val / min_val
+    # Check skewness
+    skewness = clean_values.skew()
+    # Use log scale if ratio is high and distribution is skewed
+    return ratio > 100 and skewness > 2
+def format_value(value):
+    """Format value for tooltip: integers without decimals, floats with max 3 decimals"""
+    if pd.isna(value):
+        return "N/A"
+    if isinstance(value, (int, np.integer)) or value == int(value):
+        return f"{int(value)}"
+    else:
+        return f"{value:.3f}".rstrip('0').rstrip('.')
+# 4. Define App Logic
+# ---------------------------------------------------------
+def update_map(select_data: gr.SelectData, current_table_data):
+    """
+    Triggered when a cell in the table is clicked.
+    select_data.index is a tuple (row, col) or int depending on version.
+    """
+    if select_data is None:
+        return None
+    # Handle index format (it often comes as [row, col] or just row index)
+    # We safely extract the row index
+    if isinstance(select_data.index, (list, tuple)):
+        row_index = select_data.index[0]
+    else:
+        row_index = select_data.index
+    # Get the row data from the visible table
+    # In newer Gradio, current_table_data is a DataFrame
+    if isinstance(current_table_data, pd.DataFrame):
+        selected_row = current_table_data.iloc[row_index]
+    else:
+        return None
+    # Find the corresponding unique indicator code
+    match = df_unique[
+        (df_unique['indicator_name'] == selected_row['indicator_name']) &
+        (df_unique['section'] == selected_row['section'])
+    ]
+    if match.empty:
+        return px.choropleth(title="Indicator not found")
+    selected_code = match.iloc[0][hidden_link_key]
+    selected_unit = match.iloc[0]['indicator_unit']
+    # Filter main data for this indicator
+    df_filtered = df[df['indicator_code'] == selected_code].copy()
+    # Ensure one row per region (remove duplicates, keep first occurrence)
+    df_filtered = df_filtered.drop_duplicates(subset=['ISO'], keep='first')
+    # Remove outliers (replace with NaN)
+    df_filtered['indicator_value_clean'] = remove_outliers(df_filtered['indicator_value'])
+    # Calculate Rankings (Desc: Higher Value = Rank 1) - excluding NaN values
+    df_filtered_for_ranking = df_filtered.dropna(subset=['indicator_value_clean']).copy()
+    df_filtered_for_ranking = df_filtered_for_ranking.sort_values('indicator_value_clean', ascending=False).reset_index(drop=True)
+    df_filtered_for_ranking['Ranking'] = range(1, len(df_filtered_for_ranking) + 1)
+    # Merge rankings back
+    df_filtered = df_filtered.merge(
+        df_filtered_for_ranking[['ISO', 'Ranking']],
+        on='ISO',
+        how='left'
+    )
+    # Decide if we should use log scale
+    use_log = should_use_log_scale(df_filtered['indicator_value_clean'])
+    # Create color scale values (log if needed)
+    if use_log:
+        df_filtered['color_value'] = np.log10(df_filtered['indicator_value_clean'])
+        color_label = f"{selected_unit} (log scale)"
+    else:
+        df_filtered['color_value'] = df_filtered['indicator_value_clean']
+        color_label = selected_unit
+    # Merge with Geometry
+    merged = gdf.merge(df_filtered, on='ISO', how='inner')
+    if merged.empty:
+        return px.choropleth(title="No data for this indicator")
+    # Construct Map with divergent color scale
+    fig = px.choropleth_map(
+        merged,
+        geojson=merged.geometry,
+        locations=merged.index,
+        color='color_value',
+        color_continuous_scale="RdYlBu_r",  # Divergent color scale (red-yellow-blue reversed)
+        map_style="satellite-streets",
+        zoom=2,
+        center={"lat": 60, "lon": 90},
+        opacity=0.6,
+        labels={'color_value': color_label}
+    )
+    # Format values for tooltip
+    merged['formatted_value'] = merged['indicator_value'].apply(format_value)
+    merged['formatted_ranking'] = merged['Ranking'].apply(lambda x: str(int(x)) if pd.notna(x) else "N/A")
+    # Tooltip Configuration
+    fig.update_traces(
+        customdata=merged[['formatted_ranking', 'Region', 'indicator_name', 'formatted_value']],
+        hovertemplate=(
+            "<b>Rank:</b> %{customdata[0]}<br>"
+            "<b>Region:</b> %{customdata[1]}<br>"
+            "<b>Indicator Name:</b> %{customdata[2]}<br>"
+            "<b>Indicator Value:</b> %{customdata[3]}"
+            "<extra></extra>"
+        )
+    )
+    fig.update_layout(
+        margin={"r":0,"t":0,"l":0,"b":0},
+        height=800  # Increased map height
+    )
+    return fig
+# 5. Build Gradio UI
+# ---------------------------------------------------------
+with gr.Blocks(title="Russian Regions Analytics") as demo:
+    gr.Markdown("## Russian Regional Indicators")
+    with gr.Row():
+        map_plot = gr.Plot(label="Regional Distribution")
+    with gr.Row():
+        # Prepare table value with styling metadata
+        table_value = {
+            "data": df_display.values.tolist(),
+            "headers": display_columns,
+            "metadata": {
+                "styling": table_styling
+            }
+        }
+        table = gr.DataFrame(
+            value=table_value,
+            label="Select an Indicator",
+            datatype=["str", "str", "str", "str", "number", "str"],
+            interactive=True,
+            max_height=700,  # Increased table height
+            column_widths=column_widths  # Smart column widths based on content
+        )
+    # Wire the selection event
+    table.select(
+        fn=update_map,
+        inputs=[table],
+        outputs=[map_plot]
+    )
+    # Footer with data sources
+    gr.Markdown("""
+    ---
+    ### Data Sources & Attribution
+    **Statistical Data:** [Tochno.st Regional Datasets](https://tochno.st/datasets/regions_collection)
+    **Administrative Boundaries:** [geoBoundaries](https://www.geoboundaries.org/countryDownloads.html)
+    **Regional Codes:** [Codes of Subjects of the Russian Federation](https://en.everybodywiki.com/Codes_of_subjects_of_the_Russian_Federation)
+    **Translation Model:** [Tencent HY-MT1.5-7B](https://huggingface.co/tencent/HY-MT1.5-7B)
+    """)
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio
+pandas
+lxml
+geopandas
+plotly
+pyarrow
+fastparquet
+numpy