# CSP-data / app.py
# Aivis's picture
# Update app.py
# 8e254b2 verified
import ast
from typing import Any, Dict, List, Tuple, Union

import gradio as gr  # pip install "gradio[mcp]"
from pycspwrapper import LVStat  # pip install pycspwrapper
import requests
def get_topics(name: str = '') -> dict:
    """Available topics from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

    Args:
        name (str): name of the topic; it is looked up after `str.capitalize()`, so 'vide' finds 'Vide'. If not defined or not found, function will return all available topics.
    Returns:
        dict: The dictionary of topics, where key is topic name and value is topic code.
    Raises:
        requests.exceptions.RequestException: if the request fails, times out or the portal answers with an HTTP error status.
    Examples:
        >>> get_topics('vide')
        {'Vide': 'ENV'}
        >>> print(get_topics())
        {'Iedzīvotāji': 'POP', 'Darbs': 'EMP', 'Sociālā aizsardzība un veselība': 'VES',...
    """
    base_url = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB/'
    # Bounded wait: without a timeout a stalled connection would block the caller forever.
    response = requests.get(base_url, timeout=30)
    # Surface HTTP errors explicitly instead of trying to parse an error page as JSON.
    response.raise_for_status()
    topics = {item['text']: item['id'] for item in response.json()}

    wanted = name.capitalize()
    if wanted in topics:
        return {wanted: topics[wanted]}
    # Unknown or empty name: fall back to the complete topic list.
    return topics
def get_topic_content(topic: str) -> dict:
    """Available contents of the topic from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

    Args:
        topic (str): topic code (upper-cased before the request). Use get_topics to get topic code.
    Returns:
        dict: The dictionary of the contents of the topic, where key is the topic content and value is the topic content code.
    Raises:
        requests.exceptions.RequestException: if the request fails, times out or the portal answers with an HTTP error status.
    Examples:
        >>> # First get topic code
        ... get_topics('vide')
        {'Vide': 'ENV'}
        >>> # Then use this code to get content
        ... print(get_topic_content('ENV'))
        {'Vides konti': 'VI', 'Atkritumu apsaimniekošana': 'AK', 'Agro-vides rādītāji': 'AV',...
        >>> get_topics('Iedzīvotāji')
        {'Iedzīvotāji': 'POP'}
        >>> print(get_topic_content('POP'))
        {'Iedzīvotāju skaits un raksturojošie rādītāji': 'IR', 'Dzimstība': 'ID', 'Mirstība': 'IM', 'Nāves cēloņi': 'NC',...
    """
    base_url = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB/START/'
    # Bounded wait: without a timeout a stalled connection would block the caller forever.
    response = requests.get(base_url + topic.upper(), timeout=30)
    # An unknown topic code should fail as an HTTP error, not as a confusing JSON/KeyError.
    response.raise_for_status()
    return {item['text']: item['id'] for item in response.json()}
def get_titles(topic_content_code: str = '',
               url: str = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB?query=*&filter=*') -> dict:
    """Available data (titles) from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

    Args:
        topic_content_code (str): topic content code. Use get_topic_content to get topic content code. If not defined, function will return all available titles.
        url (str): URL from where to get list of available titles. Default value: 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB?query=*&filter=*'.
    Returns:
        dict: The dictionary of the titles available from Official Statistics Portal, where key is the title name and value is the list of the non-empty path segments followed by the report ID (typically 4 elements: topic code, topic content code, topic sub-content code and report ID).
    Raises:
        requests.exceptions.RequestException: if the request fails, times out or the portal answers with an HTTP error status.
    Examples:
        >>> # First get topic code
        ... get_topics('Iedzīvotāji')
        {'Iedzīvotāji': 'POP'}
        >>> # Then use this code to get content
        ... print(get_topic_content('POP'))
        {'Iedzīvotāju skaits un raksturojošie rādītāji': 'IR', 'Dzimstība': 'ID', 'Mirstība': 'IM', 'Nāves cēloņi': 'NC',...
        >>> # Then use this content code to extract report titles
        ... print(get_titles('ID'))
        {'Dzīvi un nedzīvi dzimušo skaits pēc dzimuma 1920 - 2020': ['POP', 'ID', 'IDS', 'IDS010'],...
    """
    # The listing covers every table, so allow a longer (but still bounded) wait.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    titles = {}
    for entry in response.json():
        segments = entry['path'].split('/')
        # The topic content code is the second-to-last path segment.
        # An empty code disables the filter so that all titles are returned, as documented;
        # paths too short to have that segment simply never match instead of raising IndexError.
        if topic_content_code and (len(segments) < 2 or segments[-2] != topic_content_code):
            continue
        titles[entry['title']] = [segment for segment in segments if segment] + [entry['id']]
    return titles
def get_query_values(topic_params: list[str] = []) -> Dict[str, Dict[str, str]]:
    """Get query code and values for particular report.

    Args:
        topic_params (list[str]): arguments as a list that are needed for data extraction, in the following order: topic code, topic content code, topic sub-content code, report ID. These codes you can get from the function get_titles.
    Returns:
        dict: A dictionary where each key is a query parameter code and the value is another dictionary mapping possible values to their descriptive texts.
    Raises:
        RuntimeError: if the request fails (connection problem, timeout, HTTP error status) or the answer does not contain the expected 'variables' list.
    Examples:
        >>> # First get report topic parameters from get_titles
        ... print(get_titles('ID'))
        {'Dzīvi un nedzīvi dzimušo skaits pēc dzimuma 1920 - 2020': ['POP', 'ID', 'IDS', 'IDS010'],...
        >>> # Then use these values to get possible query values
        ... print(get_query_values(['POP', 'ID', 'IDS', 'IDS010']))
        {'SEX_NEWBORN': {'T': 'Pavisam', 'M': 'Vīrieši', 'F': 'Sievietes'}, 'ContentsCode': {'IDS010': 'Dzīvi dzimuši', 'IDS0101': 'Nedzīvi dzimuši', 'IDS0102': 'Nedzīvi dzimuši uz 1000 dzīvi dzimušiem'},
        'TIME': {'1920': '1920' ... '2024': '2024'}}
    """
    # NOTE: the list default is kept for signature compatibility; it is only read, never mutated.
    base_url = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB/START/'
    url = base_url + '/'.join(topic_params)
    try:
        # The request itself lives inside the try so that connection errors and
        # timeouts are reported as RuntimeError too, not only bad status codes.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raises HTTPError for bad responses (4xx, 5xx)
        data = response.json()
        if 'variables' not in data:
            raise ValueError("Unexpected JSON structure: 'variables' key missing")
        # For every variable pair each value code with its descriptive text.
        return {
            var.get('code', ''): dict(zip(var.get('values', []), var.get('valueTexts', [])))
            for var in data['variables']
        }
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Request failed: {e}") from e
    except ValueError as ve:
        raise RuntimeError(f"Parsing failed: {ve}") from ve
def construct_csp_link(params: list[str]) -> str:
    """Build the URL of a report page on the data.stat.gov.lv PxWeb site.

    Args:
        params (list[str]): report codes as produced by get_titles. The last element is the report ID, the preceding elements are the folder codes leading to it (usually topic code, topic content code and topic sub-content code).
    Returns:
        str: URL of the form 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__<folder>__<folder>.../<report ID>/'.
    Raises:
        ValueError: if fewer than two codes are given, because no report URL can be built from them.
    Examples:
        >>> construct_csp_link(['POP', 'IR', 'IRE', 'IRE010'])
        'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__POP__IR__IRE/IRE010/'
    """
    if len(params) < 2:
        raise ValueError(
            f"Expected at least one folder code followed by a report ID, got {list(params)!r}"
        )
    base_url = 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__'
    # Folder codes are joined with a double underscore; the report ID is its own path segment.
    *folders, report_id = params
    return f"{base_url}{'__'.join(folders)}/{report_id}/"
def get_csp_data(lang: str = 'en', topic_params: list[str] = [], **kwargs) -> List[Dict]:
    """Get statistics from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde). Use 'Source URL' from the Returns to cite the data source.

    Args:
        lang (str): Language. Default value 'en'.
        topic_params (list[str]): arguments as a list that are needed for data extraction, in the following order: topic code, topic content code, topic sub-content code, report ID. These codes you can get from the function get_titles.
        kwargs: Keyword arguments for query configuration. Possible query argument names and their possible values can be obtained using the function get_query_values.
    Returns:
        list: First element is the dictionary {'Source URL': <link to the report page>}. It is followed by the data dictionaries, where key 'key' contains query parameters and key 'values' contains values.
    Examples:
        >>> topics = ['POP', 'IR', 'IRE', 'IRE010']
        >>> query_args = get_query_values(topics)
        >>> print(query_args)
        {'ETHNICITY': {'TOTAL': 'Pavisam', 'E_LAT': 'Latvieši', 'E_ABZ': 'Abāzi', 'E_ABK': 'Abhāzi',...
        'E_SWE': 'Zviedri', 'OTH': 'Cita tautība', 'UNK_NSP': 'Nezināma, neizvēlēta'}, 'ContentsCode': {'IRE010': 'Skaits'},
        'TIME': {'1935': '1935', ... '2025': '2025'}}
        >>> # Then use these codes and values to get data for example Latvians for years 2024 and 2025.
        >>> # Value text 'Latvieši' explains what 'E_LAT' means.
        >>> data = get_csp_data(
        ...     lang='en',
        ...     topic_params=topics,
        ...     ETHNICITY=['E_LAT'],
        ...     TIME=['2024', '2025']
        ... )
        >>> print(data[0])
        {'Source URL': 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__POP__IR__IRE/IRE010/'}
        >>> print(data[1])
        {'key': ['E_LAT', '2024'], 'values': ['1186337']}
    """
    # The report codes become positional arguments of the pycspwrapper client.
    client = LVStat(lang, *topic_params)
    client.set_query(**kwargs)
    # The citation entry is prepared before the data request is made.
    source_entry = {"Source URL": construct_csp_link(topic_params)}
    rows = client.get_data()['data']
    return [source_entry] + rows
with gr.Blocks() as demo:
    gr.Markdown("### Latvian CSP Data Query Interface")
    lang = gr.Dropdown(["en", "lv"], value="en", label="Language")

    # Step 1: Topic Selection
    # The topic list is requested once, while the interface is being built.
    topic_dict = get_topics()
    topic_dropdown = gr.Dropdown(choices=list(topic_dict.keys()), label="Select Topic")

    # Step 2: Topic Content (dynamically populated)
    topic_content_dropdown = gr.Dropdown(label="Select Topic Content", visible=False)

    # Step 3: Report Titles (dynamically populated)
    report_dropdown = gr.Dropdown(label="Select Report", visible=False)

    # Dynamic Link & Topic Params Output
    link_output = gr.Markdown(visible=False)
    topic_params_box = gr.Textbox(label="Topic Params", lines=1, interactive=True)
    kwargs_box = gr.Textbox(
        label="Query Parameters (Python dict, e.g., {'ETHNICITY': ['E_LAT'], 'TIME': ['2024', '2025']})",
        lines=4,
    )
    output = gr.JSON(label="Result")
    run_button = gr.Button("Run Query")

    # NOTE(review): the descriptions of the three dropdown handlers below are kept as
    # comments, as in the original, rather than as docstrings - presumably so that they
    # are not published as endpoint descriptions; confirm before converting them.

    def update_topic_content(topic_name: str) -> Tuple[Any, Dict[str, str], str]:
        # Given a topic name, fill the "topic content" dropdown with the contents
        # available under that topic.
        #
        # Args:
        #     topic_name (str): name of the selected topic, i.e. a key of `topic_dict`
        #         (e.g. 'Darbs', 'Iedzīvotāji', 'Vide').
        # Returns:
        #     tuple:
        #         - gr.update: new dropdown choices, visibility set to True.
        #         - content_dict (dict): content name -> content code for the selected topic.
        #         - topic_code (str): code of the selected topic.
        #     When nothing valid is selected the dropdown is emptied and hidden and an
        #     empty dict / empty string are returned instead of raising KeyError.
        if topic_name not in topic_dict:
            return gr.update(choices=[], visible=False), {}, ''
        topic_code = topic_dict[topic_name]
        content_dict = get_topic_content(topic_code)
        return gr.update(choices=list(content_dict.keys()), visible=True), content_dict, topic_code

    def update_reports(topic_content_name: str, content_dict: dict = None) -> Tuple[Dict[str, List[str]], str, Any]:
        # Fill the "report" dropdown with the report titles of the selected topic content.
        #
        # Args:
        #     topic_content_name (str): name of the selected topic content
        #         (e.g. "Darba samaksa (algas)"), a key of `content_dict`.
        #     content_dict (dict): content name -> content code, the 2nd value returned
        #         by `update_topic_content`.
        # Returns:
        #     tuple:
        #         - titles_dict (dict): report title -> metadata list as returned by
        #           `get_titles(topic_content_code)`, usually
        #           [topic_code, topic_content_code, sub_content_code, report_id].
        #           Its string form is what `run_get_csp_data` expects in `topic_params_str`.
        #         - topic_content_code (str): code of the selected topic content.
        #         - gr.update: new dropdown choices, visibility set to True.
        #     When the selection or the stored dictionary is missing, the dropdown is
        #     emptied and hidden instead of raising KeyError/TypeError.
        if not content_dict or topic_content_name not in content_dict:
            return {}, '', gr.update(choices=[], visible=False)
        topic_content_code = content_dict[topic_content_name]
        titles_dict = get_titles(topic_content_code)
        return titles_dict, topic_content_code, gr.update(choices=list(titles_dict.keys()), visible=True)

    def update_topic_params_and_link(report_title: str, titles_dict: dict) -> Tuple[Any, Any, Any]:
        # Prepare the metadata string, a hyperlink and a query parameter preview for
        # the selected report.
        #
        # Args:
        #     report_title (str): title of the selected report, a key of `titles_dict`.
        #     titles_dict (dict): report title -> metadata list, the 1st value returned
        #         by `update_reports`.
        # Returns:
        #     tuple:
        #         - topic_params_str (str): string form of the metadata list,
        #           e.g. "['POP', 'ID', 'IDS', 'IDS010']"; usable as `topic_params_str`
        #           of `run_get_csp_data`.
        #         - gr.update: Markdown hyperlink to the report page built with
        #           `construct_csp_link`.
        #         - gr.update: sample query, the last 3 values of the `TIME` variable
        #           when the report has one, e.g. "{'TIME': ['2020', '2021', '2022']}".
        #     When no valid report is selected, the two text boxes are left untouched
        #     and the (stale) link is hidden.
        if not titles_dict or report_title not in titles_dict:
            return gr.update(), gr.update(visible=False), gr.update()
        title_value = titles_dict[report_title]
        topic_params_str = str(title_value)
        link = construct_csp_link(title_value)
        query = {}
        try:
            time_values = get_query_values(title_value).get('TIME', {})
            if time_values:
                query = {'TIME': list(time_values)[-3:]}
        except Exception:
            # Deliberate best effort: the preview is optional, so any failure while
            # fetching the query values falls back to an empty query.
            query = {}
        return topic_params_str, gr.update(value=f"[{report_title}]({link})", visible=True), gr.update(value=str(query))

    # Per-session values handed from one handler to the next.
    topic_content_state = gr.State()
    titles_state = gr.State()
    topic_code_state = gr.State()
    topic_content_code_state = gr.State()

    topic_dropdown.change(fn=update_topic_content, inputs=topic_dropdown,
                          outputs=[topic_content_dropdown, topic_content_state, topic_code_state])
    topic_content_dropdown.change(fn=update_reports, inputs=[topic_content_dropdown, topic_content_state],
                                  outputs=[titles_state, topic_content_code_state, report_dropdown])
    report_dropdown.change(fn=update_topic_params_and_link, inputs=[report_dropdown, titles_state],
                           outputs=[topic_params_box, link_output, kwargs_box])

    def run_get_csp_data(lang: str = 'en', topic_params_str: str = '[]', query_kwargs_str: str = '{}') -> Union[List[Dict], Dict[str, str]]:
        """Get statistics from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

        Args:
            lang (str): Language. Default value 'en'.
            topic_params_str (str): string representation of a list that is needed for data extraction. Arguments in the list should be in the following order: topic code, topic content code, topic sub-content code, report ID. These codes you can get from the function get_titles.
            query_kwargs_str (str): string representation of a dictionary - keyword arguments for query configuration. Possible query argument names and their possible values can be obtained using the function get_query_values.
        Returns:
            list: First element is the dictionary {'Source URL': <link to the report page>}, use it to cite the data source. It is followed by the data dictionaries, where key 'key' contains query parameters and key 'values' contains values. If the inputs cannot be parsed or the query fails, a dictionary {'error': <message>} is returned instead.
        Examples:
            >>> # First get topic code
            ... get_topics('Iedzīvotāji')
            {'Iedzīvotāji': 'POP'}
            >>> # Then use this code to get topic contents
            ... print(get_topic_content('POP'))
            {'Iedzīvotāju skaits un raksturojošie rādītāji': 'IR', 'Dzimstība': 'ID', 'Mirstība': 'IM', 'Nāves cēloņi': 'NC'...
            >>> # Then use this content code to extract report titles
            ... print(get_titles('IR'))
            {'<report title>': ['POP', 'IR', 'IRE', 'IRE010'],...
            >>> # Use all these previous codes to select data for corresponding report
            ... report = ['POP', 'IR', 'IRE', 'IRE010']
            >>> # Get 'code' and 'values' for filtering data
            ... query_args = get_query_values(report)
            >>> print(query_args)
            {'ETHNICITY': {'TOTAL': 'Pavisam', 'E_LAT': 'Latvieši',...}, 'ContentsCode': {'IRE010': 'Skaits'}, 'TIME': {'1935': '1935', ..., '2025': '2025'}}
            >>> # Get final result
            ... data = run_get_csp_data(
            ...     lang='en',
            ...     topic_params_str=str(report),
            ...     query_kwargs_str="{'ETHNICITY': ['E_LAT'], 'TIME': ['2024', '2025']}"
            ... )
            >>> print(data[0])
            {'Source URL': 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__POP__IR__IRE/IRE010/'}
            >>> print(data[1])
            {'key': ['E_LAT', '2024'], 'values': ['1186337']}
        """
        try:
            # literal_eval only accepts Python literals, so the text boxes cannot run code.
            topic_params = ast.literal_eval(topic_params_str)
            query_kwargs = ast.literal_eval(query_kwargs_str)
            if not isinstance(topic_params, list) or not isinstance(query_kwargs, dict):
                raise ValueError("Input format error")
            return get_csp_data(lang=lang, topic_params=topic_params, **query_kwargs)
        except Exception as e:
            # Boundary of the UI/API: report every failure as data instead of raising.
            return {"error": str(e)}

    # Register the lookup helpers as API endpoints under explicit names.
    gr.api(get_topics, api_name="get_topic_name_and_id")
    gr.api(get_topic_content, api_name="get_topic_content_name_and_id")
    gr.api(get_titles, api_name="get_report_titles")
    gr.api(get_query_values, api_name="get_query_values")

    run_button.click(fn=run_get_csp_data, inputs=[lang, topic_params_box, kwargs_box], outputs=output)
# Launch the interface only when this file is run as a script (not when imported).
# mcp_server=True is passed to launch(); the gradio import notes that this needs
# the "gradio[mcp]" extra to be installed.
if __name__ == "__main__":
    demo.launch(mcp_server=True)