Spaces:
Sleeping
Sleeping
Commit ·
3a27418
1
Parent(s): 8da8dfe
Upload 2 files
Browse files- dataset_naming_app.py +265 -0
- requirements.txt +5 -0
dataset_naming_app.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
# Define Data Source Codes and Sector Codes
|
| 6 |
+
# Lookup table: short data-source code -> display name shown in the UI.
# Insertion order is the order the options appear in the select box.
data_source_codes = {
    "CABSEC": "Cabinet Secretariat",
    "CAG": "Comptroller & Auditor General",
    "DAE": "Department of Atomic Energy",
    "DOS": "Department of Space",
    "ECI": "Election Commission of India",
    "HCDELHI": "HIGH COURT OF DELHI",
    "MOA": "Ministry of Agriculture",
    "MCF": "Ministry of Chemicals & Fertilizers",
    "MOCA": "Ministry of Civil Aviation",
    "MOCOAL": "Ministry of Coal",
    "MOCI": "Ministry of Commerce & Industry",
    "MOCIT": "Ministry of Communications & Information Tech.",
    "MOCF&PD": "Ministry of Consumer Aff., Food, & Public Dist.",
    "MCA": "Ministry of Corporate Affairs",
    "MOCULT": "Ministry of Culture",
    "MOD": "Ministry of Defence",
    "MDONER": "Ministry of Development of North Eastern Region",
    "MDWS": "Ministry of Drinking Water and Sanitation",
    "MOES": "Ministry of Earth Sciences",
    "MOEF": "Ministry of Environment & Forests",
    "MEA": "Ministry of External Affairs",
    "MOF": "Ministry of Finance",
    "MOFPI": "Ministry of Food Processing Industries",
    "MOHFW": "Ministry of Health & Family Welfare",
    "MHI&PE": "Ministry of Heavy Industry & Public Enterprises",
    "MHA": "Ministry of Home Affairs",
    "MOHUA": "Ministry of Housing & Urban Poverty Alleviation",
    "MHRD": "Ministry of Human Resource Development",
    "MOI&B": "Ministry of Information & Broadcasting",
    "MOL&E": "Ministry of Labour & Employment",
    "MOLJ": "Ministry of Law & Justice",
    "MSME": "Ministry of Micro, Small and Medium Enterprises",
    "MOM": "Ministry of Mines",
    "MMA": "Ministry of Minority Affairs",
    "MNRE": "Ministry of New & Renewable Energy",
    "MPR": "Ministry of Panchayati Raj",
    "MPA": "Ministry of Parliamentary Affairs",
    "MPPP": "Ministry of Personnel, Public Grievances & Pensions",
    "MPNG": "Ministry of Petroleum & Natural Gas",
    "MP": "Ministry of Power",
    "MR": "Ministry of Railways",
    "MORTH": "Ministry of Road Transport & Highways",
    "MRD": "Ministry of Rural Development",
    "MST": "Ministry of Science & Technology",
    "MS": "Ministry of Shipping",
    "MSJE": "Ministry of Social Justice & Empowerment",
    "MOSPI": "Ministry of Statistics & Programme Implementation",
    "MSTL": "Ministry of Steel",
    "MT": "Ministry of Textiles",
    "MOT": "Ministry of Tourism",
    "MTA": "Ministry of Tribal Affairs",
    "MOUD": "Ministry of Urban Development",
    "MWR": "Ministry of Water Resources",
    "MWCD": "Ministry of Women & Child Development",
    "MYAS": "Ministry of Youth Affairs & Sports",
    "PC": "Planning Commission",
    "PRES": "President",
    "PMO": "Prime Minister's Office",
    "VP": "Vice-President",
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Lookup table: short sector code -> display name shown in the UI.
# Insertion order is the order the options appear in the select box.
sector_codes = {
    "AGRI": "Agriculture",
    "ANML": "Animal Husbandry and Fisheries",
    "BNK": "Banking",
    "CENS": "Census",
    "CLMT": "Climate & Weather",
    "CMDB": "Commodity Boards",
    "COMR": "Commerce",
    "CAFF": "Consumer Affairs",
    "COVID": "Covid",
    "CRIME": "Crime",
    "CULT": "Culture and Tourism",
    "DEMO": "Demographics",
    "DIGINF": "Digital Infrastructure",
    "ECON": "Economy",
    "ELECT": "Elections",
    "ENRG": "Energy",
    "EXTAFF": "External Affairs",
    "FINCL": "Financial Inclusion",
    "FAGRI": "Food and Agriculture",
    "FORWLD": "Forestry and Wildlife",
    "GEN": "General",
    "GOVSCM": "Government Schemes",
    "HLTH": "Health",
    "HSNG": "Housing",
    "IND": "Industries",
    "JUST": "Justice",
    "NSS": "National Sample Survey",
    "NATDIS": "Natural Disasters",
    "OTHER": "Other",
    "PETGAS": "Petroleum and Gas",
    "RURALDEV": "Rural Development",
    "SATIMG": "Satellite Imagery Data",
    "SCI": "Science",
    "SOCIOECO": "Socio Economic",
    "TRANS": "Transportation",
    "BUDGET": "Union Budget",
    "WTR": "Water",
}
|
| 109 |
+
|
| 110 |
+
#Granularity_values = ["District","State","Tehsil","Other Level", "India","Assembly Constituency","Point Level","Gram Panchayat","Block","Sub-District","Village","Country"]
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Short namings for Granularity_values
|
| 116 |
+
# Granularity label (as shown in the UI) -> short code embedded in the
# generated dataset name.
granularity_short_codes = {
    "District": "DIS",
    "State": "STA",
    "Tehsil": "TEH",
    "Other Level": "OTH",
    "India": "IND",
    "Assembly Constituency": "AC",
    "Point Level": "PL",
    "Gram Panchayat": "GP",
    "Block": "BL",
    "Sub-District": "SD",
    "Village": "VIL",
    "Country": "CTRY",
}
|
| 130 |
+
# frequency_values = ['Yearly', 'Weekly', 'Quinquennial', 'Daily', 'Fortnightly', 'Monthly', 'Seasonally', 'Other / One Time']
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# Short namings for frequency_values
|
| 134 |
+
# Frequency label (as shown in the UI) -> one-letter code embedded in the
# generated dataset name.
frequency_short_codes = {
    "Yearly": "Y",
    "Weekly": "W",
    "Quinquennial": "Q",
    "Daily": "D",
    "Fortnightly": "F",
    "Monthly": "M",
    "Seasonally": "S",
    "Other / One Time": "O",
}
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# Read counter from file
|
| 147 |
+
def read_counter():
    """Return the dataset counter persisted in ``counter.txt``.

    The file is looked up relative to the current working directory.

    Returns:
        int: The integer stored in the file, or ``1`` (the starting counter
        value) when the file does not exist or its contents cannot be parsed
        as an integer.
    """
    try:
        with open('counter.txt', 'r') as f:
            return int(f.read())
    except (FileNotFoundError, ValueError):
        # ValueError covers an empty or corrupted counter file; without it a
        # single bad write would make every later run of the app crash.
        return 1  # Starting counter value
|
| 154 |
+
|
| 155 |
+
# Update and save counter to file
|
| 156 |
+
def update_counter(counter):
    """Write *counter* to ``counter.txt``, replacing the file's contents.

    The value is stored as its ``str()`` form; the file is opened relative to
    the current working directory.
    """
    with open('counter.txt', 'w') as counter_file:
        counter_file.write(str(counter))
|
| 159 |
+
|
| 160 |
+
# Generate unique dataset IDs
|
| 161 |
+
def generate_dataset_id(counter, prefix='DID', width=3):
    """Build a dataset ID from a running counter.

    Args:
        counter: Number to embed in the ID.
        prefix: Text placed before the number. Defaults to ``'DID'``.
        width: Minimum number of digits; shorter numbers are zero-padded,
            longer ones are kept in full. Defaults to ``3``.

    Returns:
        str: The ID, e.g. ``'DID007'`` for ``counter=7`` with the defaults.
    """
    return f'{prefix}{counter:0{width}}'
|
| 163 |
+
|
| 164 |
+
# Generate dataset names
|
| 165 |
+
def generate_dataset_name(data_source_code, sector_code, start_year, end_year, dataset_id, granularity, frequency):
    """Assemble the hyphen-separated dataset name.

    The result has the form
    ``<data source code>-<sector code>-<granularity code>-<frequency code>-<dataset id>``.

    ``granularity`` and ``frequency`` are looked up in
    ``granularity_short_codes`` and ``frequency_short_codes``; a label missing
    from its table is rendered as ``'UNK'``. ``start_year`` and ``end_year``
    are accepted but do not appear in the generated name.
    """
    name_parts = (
        data_source_code,
        sector_code,
        granularity_short_codes.get(granularity, 'UNK'),
        frequency_short_codes.get(frequency, 'UNK'),
        dataset_id,
    )
    return '-'.join(f'{part}' for part in name_parts)
|
| 169 |
+
|
| 170 |
+
# List to store existing dataset names
|
| 171 |
+
# Dataset names that are already taken; read by
# check_dataset_name_uniqueness() below.
existing_dataset_names = list()


# Check if dataset name is unique
def check_dataset_name_uniqueness(dataset_name):
    """Return ``True`` when *dataset_name* is not in ``existing_dataset_names``.

    NOTE(review): nothing in the visible code appends to
    ``existing_dataset_names``, so as written this always returns ``True``.
    """
    already_taken = dataset_name in existing_dataset_names
    return not already_taken
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def generate_download_link(mapped_dataset):
    """Render a download link for *mapped_dataset* in the Streamlit page.

    The frame is serialised with ``to_csv(index=False)``, base64-encoded and
    embedded in a ``data:`` URL inside an ``<a>`` tag, which is written to the
    page after a success banner.

    Args:
        mapped_dataset: Object exposing ``to_csv(index=False)`` that returns
            the CSV text (a pandas ``DataFrame`` at the call site in this
            file).
    """
    csv_text = mapped_dataset.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode()
    # The payload is CSV text, so advertise it as CSV. Labelling it as an
    # .xlsx workbook produces a file that spreadsheet programs reject as
    # corrupt, because the bytes are not a real Excel archive.
    href = f'<a href="data:text/csv;base64,{b64}" download="mapped_dataset.csv">Download</a>'
    st.success('Download Mapped Dataset')
    # unsafe_allow_html is required for the raw <a> tag to be rendered.
    st.markdown(href, unsafe_allow_html=True)
|
| 185 |
+
|
| 186 |
+
# Streamlit App
|
| 187 |
+
def main():
    """Render the Dataset Naming App page.

    Collects the data source, sector, year range, granularity, frequency and
    original dataset name from the user, shows the generated dataset name,
    and, when the 'Save to Excel' button is pressed and the name is unique,
    bumps the persisted counter and offers the record as a download.
    """
    st.title('Dataset Naming App')

    # Read counter from file
    counter = read_counter()

    # User input: Data Source (the select box shows display names; map the
    # chosen name back to its short code)
    data_source = st.selectbox('Select Data Source', list(data_source_codes.values()))
    data_source_code = next(code for code, name in data_source_codes.items() if name == data_source)

    # User input: Sector
    sector = st.selectbox('Select Sector', list(sector_codes.values()))
    sector_code = next(code for code, name in sector_codes.items() if name == sector)

    # User input: Start Year
    start_year = st.number_input('Enter Start Year', min_value=2000, max_value=2100, value=2022)

    # User input: End Year. The default must never be below min_value:
    # Streamlit raises when value < min_value, which previously happened for
    # any start year after 2022.
    end_year = st.number_input(
        'Enter End Year',
        min_value=start_year,
        max_value=2100,
        value=max(start_year, 2022),
    )

    # User input: Granularity
    granularity = st.selectbox('Select Granularity', list(granularity_short_codes.keys()))

    # User input: Frequency
    frequency = st.selectbox('Select Frequency', list(frequency_short_codes.keys()))

    # Generate dataset name
    dataset_id = generate_dataset_id(counter)
    dataset_name = generate_dataset_name(
        data_source_code, sector_code, start_year, end_year, dataset_id, granularity, frequency
    )

    # User input: Original Dataset Name
    original_dataset_name = st.text_input('Enter Original Dataset Name')

    # Check if the dataset name is unique
    is_unique = check_dataset_name_uniqueness(dataset_name)

    # Display generated dataset name
    st.write('Generated Dataset Name:')
    st.write(dataset_name)

    # Display warning if dataset name is not unique
    if not is_unique:
        st.warning('Dataset name is not unique. Please generate a new name.')

    # Build the record and offer it for download
    if st.button('Save to Excel') and is_unique:
        data = {
            'Dataset Name': [dataset_name],
            'Data Source': [data_source],
            'Sector': [sector],
            'Start Year': [start_year],
            'End Year': [end_year],
            'Granularity': [granularity],
            'Frequency': [frequency],
            'Original Dataset Name': [original_dataset_name],
        }
        df = pd.DataFrame(data)

        # Update and save counter to file so the next name gets a fresh ID
        counter += 1
        update_counter(counter)
        generate_download_link(df)
        st.success('Dataset information saved to Excel.')
|
| 262 |
+
|
| 263 |
+
# Script entry point: build the page only when this file is run directly.
if __name__ == "__main__":
    main()
|
| 265 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
langchain
|
| 4 |
+
openai
|
| 5 |
+
tabulate
|