# GovDataLabeler / app.py
# Last commit e4230ac by saurabhharak: "rename the app name"
import base64
import streamlit as st
import pandas as pd
# Data source codes: short code -> display name of the publishing body.
# Insertion order is the order in which the names are listed to the user.
data_source_codes = {
    # Secretariats, departments, commissions and courts
    'CABSEC': 'Cabinet Secretariat',
    'CAG': 'Comptroller & Auditor General',
    'DAE': 'Department of Atomic Energy',
    'DOS': 'Department of Space',
    'ECI': 'Election Commission of India',
    'HCDELHI': 'HIGH COURT OF DELHI',
    # Ministries
    'MOA': 'Ministry of Agriculture',
    'MCF': 'Ministry of Chemicals & Fertilizers',
    'MOCA': 'Ministry of Civil Aviation',
    'MOCOAL': 'Ministry of Coal',
    'MOCI': 'Ministry of Commerce & Industry',
    'MOCIT': 'Ministry of Communications & Information Tech.',
    'MOCF&PD': 'Ministry of Consumer Aff., Food, & Public Dist.',
    'MCA': 'Ministry of Corporate Affairs',
    'MOCULT': 'Ministry of Culture',
    'MOD': 'Ministry of Defence',
    'MDONER': 'Ministry of Development of North Eastern Region',
    'MDWS': 'Ministry of Drinking Water and Sanitation',
    'MOES': 'Ministry of Earth Sciences',
    'MOEF': 'Ministry of Environment & Forests',
    'MEA': 'Ministry of External Affairs',
    'MOF': 'Ministry of Finance',
    'MOFPI': 'Ministry of Food Processing Industries',
    'MOHFW': 'Ministry of Health & Family Welfare',
    'MHI&PE': 'Ministry of Heavy Industry & Public Enterprises',
    'MHA': 'Ministry of Home Affairs',
    'MOHUA': 'Ministry of Housing & Urban Poverty Alleviation',
    'MHRD': 'Ministry of Human Resource Development',
    'MOI&B': 'Ministry of Information & Broadcasting',
    'MOL&E': 'Ministry of Labour & Employment',
    'MOLJ': 'Ministry of Law & Justice',
    'MSME': 'Ministry of Micro, Small and Medium Enterprises',
    'MOM': 'Ministry of Mines',
    'MMA': 'Ministry of Minority Affairs',
    'MNRE': 'Ministry of New & Renewable Energy',
    'MPR': 'Ministry of Panchayati Raj',
    'MPA': 'Ministry of Parliamentary Affairs',
    'MPPP': 'Ministry of Personnel, Public Grievances & Pensions',
    'MPNG': 'Ministry of Petroleum & Natural Gas',
    'MP': 'Ministry of Power',
    'MR': 'Ministry of Railways',
    'MORTH': 'Ministry of Road Transport & Highways',
    'MRD': 'Ministry of Rural Development',
    'MST': 'Ministry of Science & Technology',
    'MS': 'Ministry of Shipping',
    'MSJE': 'Ministry of Social Justice & Empowerment',
    'MOSPI': 'Ministry of Statistics & Programme Implementation',
    'MSTL': 'Ministry of Steel',
    'MT': 'Ministry of Textiles',
    'MOT': 'Ministry of Tourism',
    'MTA': 'Ministry of Tribal Affairs',
    'MOUD': 'Ministry of Urban Development',
    'MWR': 'Ministry of Water Resources',
    'MWCD': 'Ministry of Women & Child Development',
    'MYAS': 'Ministry of Youth Affairs & Sports',
    # Other offices
    'PC': 'Planning Commission',
    'PRES': 'President',
    'PMO': "Prime Minister's Office",
    'VP': 'Vice-President',
}
# Sector codes: short code -> display name of the subject area.
# Insertion order is the order in which the names are listed to the user.
sector_codes = {
    'AGRI': 'Agriculture',
    'ANML': 'Animal Husbandry and Fisheries',
    'BNK': 'Banking',
    'CENS': 'Census',
    'CLMT': 'Climate & Weather',
    'CMDB': 'Commodity Boards',
    'COMR': 'Commerce',
    'CAFF': 'Consumer Affairs',
    'COVID': 'Covid',
    'CRIME': 'Crime',
    'CULT': 'Culture and Tourism',
    'DEMO': 'Demographics',
    'DIGINF': 'Digital Infrastructure',
    'ECON': 'Economy',
    'ELECT': 'Elections',
    'ENRG': 'Energy',
    'EXTAFF': 'External Affairs',
    'FINCL': 'Financial Inclusion',
    'FAGRI': 'Food and Agriculture',
    'FORWLD': 'Forestry and Wildlife',
    'GEN': 'General',
    'GOVSCM': 'Government Schemes',
    'HLTH': 'Health',
    'HSNG': 'Housing',
    'IND': 'Industries',
    'JUST': 'Justice',
    'NSS': 'National Sample Survey',
    'NATDIS': 'Natural Disasters',
    'OTHER': 'Other',
    'PETGAS': 'Petroleum and Gas',
    'RURALDEV': 'Rural Development',
    'SATIMG': 'Satellite Imagery Data',
    'SCI': 'Science',
    'SOCIOECO': 'Socio Economic',
    'TRANS': 'Transportation',
    'BUDGET': 'Union Budget',
    'WTR': 'Water',
}
# Granularity levels: display name -> short code used inside dataset names.
# The keys double as the list of granularity options offered to the user.
granularity_short_codes = {
    'District': 'DIS',
    'State': 'STA',
    'Tehsil': 'TEH',
    'Other Level': 'OTH',
    'India': 'IND',
    'Assembly Constituency': 'AC',
    'Point Level': 'PL',
    'Gram Panchayat': 'GP',
    'Block': 'BL',
    'Sub-District': 'SD',
    'Village': 'VIL',
    'Country': 'CTRY',
}
# Update frequencies: display name -> one-letter code used inside dataset names.
# The keys double as the list of frequency options offered to the user.
frequency_short_codes = {
    'Yearly': 'Y',
    'Weekly': 'W',
    'Quinquennial': 'Q',
    'Daily': 'D',
    'Fortnightly': 'F',
    'Monthly': 'M',
    'Seasonally': 'S',
    'Other / One Time': 'O',
}
# Read counter from file
def read_counter():
    """Return the dataset counter persisted in ``counter.txt``.

    The file is looked up relative to the current working directory.

    Returns:
        The integer stored in the file, or ``1`` (the starting value) when
        the file is missing or does not hold a valid integer, e.g. because
        it is empty.
    """
    try:
        with open('counter.txt', 'r') as f:
            return int(f.read())
    except (FileNotFoundError, ValueError):
        # A missing or unparsable counter restarts numbering at 1 instead of
        # raising on every run of the app.
        return 1
# Persist the counter so the next run continues from it
def update_counter(counter):
    """Overwrite ``counter.txt`` with the text form of *counter*.

    The file is created in the current working directory if it does not
    exist; any previous content is replaced.
    """
    with open('counter.txt', mode='w') as counter_file:
        counter_file.write(str(counter))
# Build the dataset ID for a counter value
def generate_dataset_id(counter):
    """Return ``'DID'`` followed by *counter* zero-padded to at least 3 digits.

    Values wider than three digits are not truncated (``1234`` gives
    ``'DID1234'``).
    """
    padded = format(counter, '03')
    return 'DID' + padded
# Assemble the standardised dataset name
def generate_dataset_name(data_source_code, sector_code, start_year, end_year, dataset_id, granularity, frequency):
    """Return the hyphen-separated dataset name.

    The result has the shape
    ``<data_source_code>-<sector_code>-<granularity code>-<frequency code>-<dataset_id>``.
    *granularity* and *frequency* are translated through
    ``granularity_short_codes`` and ``frequency_short_codes``; a value missing
    from its table is rendered as ``'UNK'``.

    ``start_year`` and ``end_year`` are accepted but are not part of the
    generated name.
    """
    gran_code = granularity_short_codes.get(granularity, 'UNK')
    freq_code = frequency_short_codes.get(frequency, 'UNK')
    return '{}-{}-{}-{}-{}'.format(
        data_source_code, sector_code, gran_code, freq_code, dataset_id
    )
# Names that have already been handed out; consulted by the uniqueness check
existing_dataset_names = []


# Report whether a candidate name is still unused
def check_dataset_name_uniqueness(dataset_name):
    """Return ``True`` when *dataset_name* is not in ``existing_dataset_names``."""
    if dataset_name in existing_dataset_names:
        return False
    return True
def generate_download_link(mapped_dataset):
    """Render a download link for *mapped_dataset* on the Streamlit page.

    The frame is serialised with ``to_csv`` (without the index column),
    base64-encoded and embedded in a ``data:`` URI inside an ``<a>`` tag, so
    nothing is written to disk.

    Args:
        mapped_dataset: Object exposing ``to_csv(index=False)`` that returns
            the CSV text, such as a pandas ``DataFrame``.
    """
    csv_text = mapped_dataset.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode('utf-8')).decode('ascii')
    # The payload is CSV text, so it is labelled and named as CSV. Offering
    # it as "mapped_dataset.xlsx" would produce a file that is not a valid
    # Excel workbook.
    href = (
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        'download="mapped_dataset.csv">Download</a>'
    )
    st.success('Download Mapped Dataset')
    # unsafe_allow_html is needed for the raw <a> tag; the only interpolated
    # value is base64 text.
    st.markdown(href, unsafe_allow_html=True)
# Streamlit App
def main():
    """Render the dataset-naming form and handle the save action.

    The page collects a data source, sector, year range, granularity,
    frequency and the original dataset name, shows the generated dataset
    name, and, when 'Save to Excel' is pressed and the name is unique,
    advances the persisted counter and offers the record as a download.
    """
    st.title('Dataset Naming App')

    # The persisted counter supplies the numeric part of the dataset ID.
    counter = read_counter()

    # The select boxes list display names; map the choice back to its code.
    data_source = st.selectbox('Select Data Source', list(data_source_codes.values()))
    data_source_code = next(code for code, name in data_source_codes.items() if name == data_source)

    sector = st.selectbox('Select Sector', list(sector_codes.values()))
    sector_code = next(code for code, name in sector_codes.items() if name == sector)

    start_year = st.number_input('Enter Start Year', min_value=2000, max_value=2100, value=2022)
    # The end year may not precede the start year. The default is clamped to
    # the start year because a default below min_value is rejected by
    # st.number_input once the start year is moved past 2022.
    end_year = st.number_input(
        'Enter End Year',
        min_value=start_year,
        max_value=2100,
        value=max(start_year, 2022),
    )

    granularity = st.selectbox('Select Granularity', list(granularity_short_codes.keys()))
    frequency = st.selectbox('Select Frequency', list(frequency_short_codes.keys()))

    dataset_id = generate_dataset_id(counter)
    dataset_name = generate_dataset_name(
        data_source_code, sector_code, start_year, end_year, dataset_id, granularity, frequency
    )

    original_dataset_name = st.text_input('Enter Original Dataset Name')

    is_unique = check_dataset_name_uniqueness(dataset_name)

    # Show the generated name
    st.write('Gov Data Labeler')
    st.write(dataset_name)

    if not is_unique:
        st.warning('Dataset name is not unique. Please generate a new name.')

    # The button is always rendered; saving only happens for a unique name.
    if st.button('Save to Excel') and is_unique:
        data = {
            'Dataset Name': [dataset_name],
            'Data Source': [data_source],
            'Sector': [sector],
            'Start Year': [start_year],
            'End Year': [end_year],
            'Granularity': [granularity],
            'Frequency': [frequency],
            'Original Dataset Name': [original_dataset_name],
        }
        df = pd.DataFrame(data)

        # Advance the counter so the next dataset gets a fresh ID.
        counter += 1
        update_counter(counter)

        generate_download_link(df)
        st.success('Dataset information saved to Excel.')


if __name__ == '__main__':
    main()