Spaces:

Aivis
/

CSP-data

Running

File size: 19,642 Bytes

import ast
import gradio as gr # pip install "gradio[mcp]"
from pycspwrapper import LVStat # pip install pycspwrapper
import requests
from typing import Any, Dict, List, Tuple


def get_topics(name:str = '') -> dict:
    """Available topics from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

    Args:
        name (str): name of the topic, matched case-insensitively. If not defined or not found, function will return all available topics.
    Returns:
        dict: The dictionary of topics, where key is topic name and value is topic code.
    Raises:
        requests.exceptions.RequestException: if the portal does not answer in time or answers with an HTTP error status.
    Examples:
        >>> get_topics('vide')
        {'Vide': 'ENV'}
        >>> print(get_topics())
        {'Iedzīvotāji': 'POP', 'Darbs': 'EMP', 'Sociālā aizsardzība un veselība': 'VES',...
    """
    base_url = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB/'
    # Without a timeout a stalled connection would block the caller indefinitely
    response = requests.get(base_url, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError on an error page
    response.raise_for_status()
    topics = {item['text']: item['id'] for item in response.json()}

    # casefold() comparison also matches names with capital letters after the
    # first character, which str.capitalize() would have lower-cased
    wanted = name.casefold()
    if wanted:
        for topic_name, topic_code in topics.items():
            if topic_name.casefold() == wanted:
                return {topic_name: topic_code}
    return topics


def get_topic_content(topic: str) -> dict:
    """Available contents of the topic from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

    Args:
        topic (str): topic code (upper-cased before use). Use get_topics to get topic code.
    Returns:
        dict: The dictionary of the contents of the topic, where key is the topic content and value is the topic content code.
    Raises:
        requests.exceptions.RequestException: if the portal does not answer in time or answers with an HTTP error status (for example for an unknown topic code).
    Examples:
        >>> # First get topic code
        ... get_topics('vide')
        {'Vide': 'ENV'}
        >>> # Then use this code to get content
        ... print(get_topic_content('ENV'))
        {'Vides konti': 'VI', 'Atkritumu apsaimniekošana': 'AK', 'Agro-vides rādītāji': 'AV',...
        >>> get_topics('Iedzīvotāji')
        {'Iedzīvotāji': 'POP'}
        >>> print(get_topic_content('POP'))
        {'Iedzīvotāju skaits un raksturojošie rādītāji': 'IR', 'Dzimstība': 'ID', 'Mirstība': 'IM', 'Nāves cēloņi': 'NC',...
    """
    base_url = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB/START/'
    # Without a timeout a stalled connection would block the caller indefinitely
    response = requests.get(base_url + topic.upper(), timeout=30)
    # Surface HTTP errors directly rather than failing later while reading the body
    response.raise_for_status()
    return {item['text']: item['id'] for item in response.json()}


def get_titles(topic_content_code:str = '',
               url:str = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB?query=*&filter=*') -> dict:
    """Available data (titles) from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

    Args:
        topic_content_code (str): topic content code. Use get_topic_content to get topic content code. If not defined, function will return all available titles.
        url (str): URL from where to get list of available titles. Default value: 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB?query=*&filter=*'.
    Returns:
        dict: The dictionary of the titles available from Official Statistics Portal, where key is the title name and value is the list built from the non-empty path segments followed by the report ID (in the example below: topic code, topic content code, topic sub-content code and report ID).
    Raises:
        requests.exceptions.RequestException: if the portal does not answer in time or answers with an HTTP error status.
    Examples:
        >>> # First get topic code
        ... get_topics('Iedzīvotāji')
        {'Iedzīvotāji': 'POP'}
        >>> # Then use this code to get content
        ... print(get_topic_content('POP'))
        {'Iedzīvotāju skaits un raksturojošie rādītāji': 'IR', 'Dzimstība': 'ID', 'Mirstība': 'IM', 'Nāves cēloņi': 'NC',...
        >>> # Then use this content code to extract report titles
        ... print(get_titles('ID'))
        {'Dzīvi un nedzīvi dzimušo skaits pēc dzimuma 1920 - 2020': ['POP', 'ID', 'IDS', 'IDS010'],...
    """
    # NOTE(review): url is caller-supplied and this function is registered with gr.api,
    # so the server will fetch whatever address it is given - consider restricting the host.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    titles = {}
    for entry in response.json():
        segments = entry['path'].split('/')
        if topic_content_code:
            # The content code is compared with the second-to-last path segment;
            # paths too short to have one simply do not match
            if len(segments) < 2 or segments[-2] != topic_content_code:
                continue
        # An empty topic_content_code means "no filter": every title is kept
        titles[entry['title']] = [segment for segment in segments if segment] + [entry['id']]
    return titles

def get_query_values(topic_params: list[str] = []) -> Dict[str, Dict[str, str]]:
    """Get query code and values for particular report.

    Args:
        topic_params (list[str]): arguments as a list that are needed for data extraction, in the following order - topic code, topic content code, topic sub-content code, report ID. These codes you can get from the function get_titles.
    Returns:
        dict: A dictionary where each key is a query parameter code and the value is another dictionary mapping possible values to their descriptive texts.
    Raises:
        RuntimeError: if the request fails (connection problem, timeout, HTTP error status) or the response does not have the expected JSON structure.
    Examples:
        >>> # First get report topic parameters from get_titles
        ... print(get_titles('ID'))
        {'Dzīvi un nedzīvi dzimušo skaits pēc dzimuma 1920 - 2020': ['POP', 'ID', 'IDS', 'IDS010'],...
        >>> # Then use these values to get possible query values
        ... print(get_query_values(['POP', 'ID', 'IDS', 'IDS010']))
        {'SEX_NEWBORN': {'T': 'Pavisam', 'M': 'Vīrieši', 'F': 'Sievietes'}, 'ContentsCode': {'IDS010': 'Dzīvi dzimuši', 'IDS0101': 'Nedzīvi dzimuši', 'IDS0102': 'Nedzīvi dzimuši uz 1000 dzīvi dzimušiem'},
        'TIME': {'1920': '1920' ... '2024': '2024'}}
    """
    base_url = 'https://data.stat.gov.lv/api/v1/lv/OSP_PUB/START/'
    # The default list is only read, never modified
    url = base_url + '/'.join(topic_params)

    try:
        # The request itself is inside the try so that connection errors and
        # timeouts are reported as RuntimeError, the same way HTTP error statuses are
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raises HTTPError for bad responses (4xx, 5xx)
        data = response.json()
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Request failed: {e}") from e
    except ValueError as ve:
        raise RuntimeError(f"Parsing failed: {ve}") from ve

    # A JSON body that is not an object (e.g. a folder listing) has no variables either
    if not isinstance(data, dict) or 'variables' not in data:
        raise RuntimeError("Parsing failed: Unexpected JSON structure: 'variables' key missing")

    result = {}
    for var in data['variables']:
        # build dict mapping value -> valueText
        result[var.get('code', '')] = dict(zip(var.get('values', []), var.get('valueTexts', [])))
    return result

def construct_csp_link(params: list[str]) -> str:
    """Build the link to a report page on the CSP PxWeb site (Latvian 'lv' pages).

    Args:
        params (list[str]): topic code, topic content code, topic sub-content code and report ID, in that order. Elements after the fourth are ignored.
    Returns:
        str: Link of the form 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__<topic>__<content>__<sub-content>/<report ID>/'.
    Raises:
        IndexError: if params has fewer than four elements.
    """
    folder = '__'.join(params[:3])
    report_id = params[3]
    return 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__{}/{}/'.format(folder, report_id)

def get_csp_data(lang: str = 'en', topic_params: list[str] = [], **kwargs) -> List[Dict]:
    """Get statistics from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde). Use 'Source URL' from the Returns to cite the data source.

    Args:
        lang (str): Language. Default value 'en'.
        topic_params (list[str]): arguments as a list that are needed for data extraction.
            Arguments in the list should be in the following order:
            - topic code,
            - topic content code,
            - topic sub-content code
            - report ID.
            These codes you can get from the function get_titles.
        kwargs: Keyword arguments for query configuration.
            Possible query argument names and their possible values
            can be obtained using the function get_query_values.
    Returns:
        list: First element is the dictionary {'Source URL': <link to the report page>}.
        The remaining elements are the data rows: dictionaries where key 'key' contains
        query parameters and key 'values' contains values.
    Examples:
        >>> topics = ['POP', 'IR', 'IRE', 'IRE010']
        >>> query_args = get_query_values(topics)
        >>> print(query_args)
        {'ETHNICITY': {'TOTAL': 'Pavisam', 'E_LAT': 'Latvieši', 'E_ABZ': 'Abāzi', 'E_ABK': 'Abhāzi',...
        'E_SWE': 'Zviedri', 'OTH': 'Cita tautība', 'UNK_NSP': 'Nezināma, neizvēlēta'}, 'ContentsCode': {'IRE010': 'Skaits'},
        'TIME': {'1935': '1935', ... '2025': '2025'}}
        >>> # Then use these codes and values to get data for example Latvians for years 2024 and 2025.
        >>> # Value text 'Latvieši' explains what 'E_LAT' means.
        >>> data = get_csp_data(
        ...     lang='en',
        ...     topic_params=topics,
        ...     ETHNICITY=['E_LAT'],
        ...     TIME=['2024', '2025']
        ... )
        >>> print(data[0])
        {'Source URL': 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__POP__IR__IRE/IRE010/'}
        >>> print(data[1])
        {'key': ['E_LAT', '2024'], 'values': ['1186337']}
    """
    client = LVStat(lang, *topic_params)
    client.set_query(**kwargs)

    # The link is built before get_data() is called, so a topic_params list
    # shorter than four elements raises here first
    source_entry = {"Source URL": construct_csp_link(topic_params)}

    rows = client.get_data()['data']
    return [source_entry] + rows


# Build the Gradio UI: three cascading dropdowns (topic -> topic content -> report)
# that pre-fill the two text boxes consumed by the "Run Query" button.
with gr.Blocks() as demo:
    gr.Markdown("### Latvian CSP Data Query Interface")
    
    # Language handed to the query as the `lang` argument
    lang = gr.Dropdown(["en", "lv"], value="en", label="Language")

    # Step 1: Topic Selection
    # get_topics() performs an HTTP request, so the topic list is fetched once,
    # while the UI is being built; topic_dict maps topic name -> topic code
    topic_dict = get_topics()
    topic_dropdown = gr.Dropdown(choices=list(topic_dict.keys()), label="Select Topic")

    # Step 2: Topic Content (dynamically populated)
    # Created hidden; its change handler upstream supplies the choices and makes it visible
    topic_content_dropdown = gr.Dropdown(label="Select Topic Content", visible=False)

    # Step 3: Report Titles (dynamically populated)
    report_dropdown = gr.Dropdown(label="Select Report", visible=False)

    # Dynamic Link & Topic Params Output
    link_output = gr.Markdown(visible=False)
    # Editable, so the codes can also be typed by hand instead of picked from the dropdowns
    topic_params_box = gr.Textbox(label="Topic Params", lines=1, interactive=True)

    #topic_params = gr.Textbox(label="Topic Params (Python list, e.g., ['POP', 'IR', 'IRE', 'IRE010'])")
    # Query filters written as a Python dict literal
    kwargs_box = gr.Textbox(
        label="Query Parameters (Python dict, e.g., {'ETHNICITY': ['E_LAT'], 'TIME': ['2024', '2025']})",
        lines=4,
    )
    # Shows the query result (or the error dictionary) as JSON
    output = gr.JSON(label="Result")

    run_button = gr.Button("Run Query")

    def update_topic_content(topic_name: str)-> Tuple[Any, Dict[str, str], str]:
        # Change handler of the topic dropdown: loads the contents of the chosen topic.
        # (Kept as comments rather than a docstring, as in the original.)
        #
        # Args:
        #     topic_name (str): Topic name as shown in the dropdown; must be a key of `topic_dict`
        #         (for example 'Darbs', 'Iedzīvotāji' or 'Vide').
        # Returns:
        #     tuple:
        #         - gr.update: new choices (the content names) for the content dropdown, made visible.
        #         - dict: content name -> content code, as returned by `get_topic_content`.
        #         - str: code of the selected topic.
        code = topic_dict[topic_name]
        contents = get_topic_content(code)
        dropdown_update = gr.update(choices=list(contents), visible=True)
        return dropdown_update, contents, code

    def update_reports(topic_content_name: str, content_dict: dict = None) -> Tuple[Dict[str, str], str, Any]:
        # Change handler of the topic content dropdown: loads the report titles of the chosen content.
        # (Kept as comments rather than a docstring, as in the original.)
        #
        # Args:
        #     topic_content_name (str): Name of the selected topic content (e.g. "Darba samaksa (algas)");
        #         must be a key of `content_dict`.
        #     content_dict (dict): Content name -> content code, the second value returned by
        #         `update_topic_content`.
        # Returns:
        #     tuple:
        #         - dict: report title -> list of codes, as returned by `get_titles`; its string form
        #           is what `run_get_csp_data` takes as `topic_params_str`.
        #         - str: content code that was passed to `get_titles`.
        #         - gr.update: new choices (the report titles) for the report dropdown, made visible.
        content_code = content_dict[topic_content_name]
        titles = get_titles(content_code)
        return titles, content_code, gr.update(choices=list(titles), visible=True)

    def update_topic_params_and_link(report_title: str, titles_dict: dict) -> Tuple[str, Any, Any]:
        # Change handler of the report dropdown: fills the "Topic Params" box, shows a link to
        # the report page and suggests a starting query.
        # (Kept as comments rather than a docstring, as in the original.)
        #
        # Args:
        #     report_title (str): Title of the selected report; must be a key of `titles_dict`.
        #     titles_dict (dict): Report title -> list of codes, as returned by `get_titles`.
        # Returns:
        #     tuple:
        #         - str: string form of the report's code list, e.g. "['POP', 'ID', 'IDS', 'IDS010']".
        #         - gr.update: Markdown link "[<report title>](<url from construct_csp_link>)", made visible.
        #         - gr.update: suggested query text: "{'TIME': [...]}" with the last three TIME values
        #           when the report has a TIME parameter, otherwise "{}".
        report_codes = titles_dict[report_title]
        codes_text = str(report_codes)
        report_url = construct_csp_link(report_codes)

        try:
            time_values = get_query_values(report_codes).get('TIME', '')
            suggestion = {'TIME': list(time_values.keys())[-3:]} if len(time_values) > 0 else {}
            query_text = str(suggestion)
        except Exception:
            # Best effort: if the query values cannot be obtained, suggest an empty query
            query_text = '{}'

        link_update = gr.update(value=f"[{report_title}]({report_url})", visible=True)
        return codes_text, link_update, gr.update(value=query_text)

    # Per-session holders for values passed from one handler to the next
    topic_content_state = gr.State()        # content name -> content code of the selected topic
    titles_state = gr.State()               # report title -> code list of the selected content
    # The two states below are written by the handlers but not used as an input in the wiring here
    topic_code_state = gr.State()
    topic_content_code_state = gr.State()

    # Cascade: topic -> topic content -> report -> pre-filled text boxes.
    # The order of each `outputs` list matches the order of the handler's return values.
    topic_dropdown.change(fn=update_topic_content, inputs=topic_dropdown, outputs=[topic_content_dropdown, topic_content_state, topic_code_state])
    topic_content_dropdown.change(fn=update_reports, inputs=[topic_content_dropdown, topic_content_state], #, topic_code_state],
                                  outputs=[titles_state, topic_content_code_state, report_dropdown])
    report_dropdown.change(fn=update_topic_params_and_link, inputs=[report_dropdown, titles_state], #, topic_code_state, topic_content_code_state],
                           outputs=[topic_params_box, link_output, kwargs_box])
    
    def run_get_csp_data(lang: str = 'en', topic_params_str: str = '[]', query_kwargs_str: str = '{}') -> List[Dict]:
        """Get statistics from Official Statistics Portal of Latvia (CSP or Centrālā statistikas pārvalde).

        Args:
            lang (str): Language. Default value 'en'.
            topic_params_str (str): string representation of a list that is needed for data extraction. Arguments in the list should be in the following order - topic code, topic content code, topic sub-content code, report ID. These codes you can get from the function get_titles.
            query_kwargs_str (str): string representation of a dictionary with keyword arguments for query configuration. Possible query argument names and their possible values can be obtained using the function get_query_values.
        Returns:
            list: First element is the dictionary {'Source URL': <link to the report page>}, the remaining elements are dictionaries where key 'key' contains query parameters and key 'values' contains values. If the inputs cannot be parsed or the query fails, a dictionary {'error': <message>} is returned instead.
        Examples:
            >>> # First get topic code
            ... get_topics('Iedzīvotāji')
            {'Iedzīvotāji': 'POP'}
            >>> # Then use this code to get topic contents
            ... print(get_topic_content('POP'))
            {'Iedzīvotāju skaits un raksturojošie rādītāji': 'IR', 'Dzimstība': 'ID', 'Mirstība': 'IM', 'Nāves cēloņi': 'NC'...
            >>> # Then use this content code to list report titles with get_titles('IR')
            ... # and take the list of codes of the report you need
            ... report = ['POP', 'IR', 'IRE', 'IRE010']
            >>> # Get 'code' and 'values' for filtering data
            ... query_args = get_query_values(report)
            >>> print(query_args)
            {'ETHNICITY': {'TOTAL': 'Pavisam', 'E_LAT': 'Latvieši',...}, 'ContentsCode': {'IRE010': 'Skaits'}, 'TIME': {'1935': '1935', ..., '2025': '2025'}}
            >>> # Get final result
            ... data = run_get_csp_data(
            ...        lang='en',
            ...        topic_params_str = str(report),
            ...        query_kwargs_str = "{'ETHNICITY': ['E_LAT'], 'TIME': ['2024', '2025']}"
            ...        )
            >>> print(data[0])
            {'Source URL': 'https://data.stat.gov.lv/pxweb/lv/OSP_PUB/START__POP__IR__IRE/IRE010/'}
            >>> print(data[1])
            {'key': ['E_LAT', '2024'], 'values': ['1186337']}
        """
        try:
            # literal_eval only accepts Python literals, so the text boxes cannot run code
            parsed_params = ast.literal_eval(topic_params_str)
            parsed_query = ast.literal_eval(query_kwargs_str)
            well_formed = isinstance(parsed_params, list) and isinstance(parsed_query, dict)
            if not well_formed:
                raise ValueError("Input format error")
            return get_csp_data(lang=lang, topic_params=parsed_params, **parsed_query)
        except Exception as exc:
            # Any failure is reported in the result instead of being raised
            return {"error": str(exc)}

    # Register the plain lookup functions as API endpoints under explicit names
    # (presumably these are what MCP clients see as tools once the app is launched
    # with mcp_server=True - confirm against the Gradio documentation)
    gr.api(get_topics, api_name="get_topic_name_and_id")
    gr.api(get_topic_content, api_name="get_topic_content_name_and_id")
    gr.api(get_titles, api_name="get_report_titles")
    gr.api(get_query_values, api_name="get_query_values")    

    # "Run Query": parse the two text boxes and show the result (or the error dictionary) in the JSON panel
    run_button.click(fn=run_get_csp_data, inputs=[lang, topic_params_box, kwargs_box], outputs=output)

# Start the app only when the file is run as a script; mcp_server=True is the reason
# for the "gradio[mcp]" extra mentioned next to the gradio import
if __name__ == "__main__":
    demo.launch(mcp_server=True)