saurabhharak committed on
Commit
3a27418
·
1 Parent(s): 8da8dfe

Upload 2 files

Browse files
Files changed (2) hide show
  1. dataset_naming_app.py +265 -0
  2. requirements.txt +5 -0
dataset_naming_app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import streamlit as st
3
+ import pandas as pd
4
+
5
+ # Define Data Source Codes and Sector Codes
6
# Lookup table: short data-source code -> full name of the issuing body.
# Insertion order is kept exactly as authored.
data_source_codes = {
    "CABSEC": "Cabinet Secretariat",
    "CAG": "Comptroller & Auditor General",
    "DAE": "Department of Atomic Energy",
    "DOS": "Department of Space",
    "ECI": "Election Commission of India",
    "HCDELHI": "HIGH COURT OF DELHI",
    "MOA": "Ministry of Agriculture",
    "MCF": "Ministry of Chemicals & Fertilizers",
    "MOCA": "Ministry of Civil Aviation",
    "MOCOAL": "Ministry of Coal",
    "MOCI": "Ministry of Commerce & Industry",
    "MOCIT": "Ministry of Communications & Information Tech.",
    "MOCF&PD": "Ministry of Consumer Aff., Food, & Public Dist.",
    "MCA": "Ministry of Corporate Affairs",
    "MOCULT": "Ministry of Culture",
    "MOD": "Ministry of Defence",
    "MDONER": "Ministry of Development of North Eastern Region",
    "MDWS": "Ministry of Drinking Water and Sanitation",
    "MOES": "Ministry of Earth Sciences",
    "MOEF": "Ministry of Environment & Forests",
    "MEA": "Ministry of External Affairs",
    "MOF": "Ministry of Finance",
    "MOFPI": "Ministry of Food Processing Industries",
    "MOHFW": "Ministry of Health & Family Welfare",
    "MHI&PE": "Ministry of Heavy Industry & Public Enterprises",
    "MHA": "Ministry of Home Affairs",
    "MOHUA": "Ministry of Housing & Urban Poverty Alleviation",
    "MHRD": "Ministry of Human Resource Development",
    "MOI&B": "Ministry of Information & Broadcasting",
    "MOL&E": "Ministry of Labour & Employment",
    "MOLJ": "Ministry of Law & Justice",
    "MSME": "Ministry of Micro, Small and Medium Enterprises",
    "MOM": "Ministry of Mines",
    "MMA": "Ministry of Minority Affairs",
    "MNRE": "Ministry of New & Renewable Energy",
    "MPR": "Ministry of Panchayati Raj",
    "MPA": "Ministry of Parliamentary Affairs",
    "MPPP": "Ministry of Personnel, Public Grievances & Pensions",
    "MPNG": "Ministry of Petroleum & Natural Gas",
    "MP": "Ministry of Power",
    "MR": "Ministry of Railways",
    "MORTH": "Ministry of Road Transport & Highways",
    "MRD": "Ministry of Rural Development",
    "MST": "Ministry of Science & Technology",
    "MS": "Ministry of Shipping",
    "MSJE": "Ministry of Social Justice & Empowerment",
    "MOSPI": "Ministry of Statistics & Programme Implementation",
    "MSTL": "Ministry of Steel",
    "MT": "Ministry of Textiles",
    "MOT": "Ministry of Tourism",
    "MTA": "Ministry of Tribal Affairs",
    "MOUD": "Ministry of Urban Development",
    "MWR": "Ministry of Water Resources",
    "MWCD": "Ministry of Women & Child Development",
    "MYAS": "Ministry of Youth Affairs & Sports",
    "PC": "Planning Commission",
    "PRES": "President",
    "PMO": "Prime Minister's Office",
    "VP": "Vice-President",
}
67
+
68
+
69
+
70
# Lookup table: short sector code -> human-readable sector name.
# Insertion order is kept exactly as authored.
sector_codes = {
    "AGRI": "Agriculture",
    "ANML": "Animal Husbandry and Fisheries",
    "BNK": "Banking",
    "CENS": "Census",
    "CLMT": "Climate & Weather",
    "CMDB": "Commodity Boards",
    "COMR": "Commerce",
    "CAFF": "Consumer Affairs",
    "COVID": "Covid",
    "CRIME": "Crime",
    "CULT": "Culture and Tourism",
    "DEMO": "Demographics",
    "DIGINF": "Digital Infrastructure",
    "ECON": "Economy",
    "ELECT": "Elections",
    "ENRG": "Energy",
    "EXTAFF": "External Affairs",
    "FINCL": "Financial Inclusion",
    "FAGRI": "Food and Agriculture",
    "FORWLD": "Forestry and Wildlife",
    "GEN": "General",
    "GOVSCM": "Government Schemes",
    "HLTH": "Health",
    "HSNG": "Housing",
    "IND": "Industries",
    "JUST": "Justice",
    "NSS": "National Sample Survey",
    "NATDIS": "Natural Disasters",
    "OTHER": "Other",
    "PETGAS": "Petroleum and Gas",
    "RURALDEV": "Rural Development",
    "SATIMG": "Satellite Imagery Data",
    "SCI": "Science",
    "SOCIOECO": "Socio Economic",
    "TRANS": "Transportation",
    "BUDGET": "Union Budget",
    "WTR": "Water",
}
109
+
110
+ #Granularity_values = ["District","State","Tehsil","Other Level", "India","Assembly Constituency","Point Level","Gram Panchayat","Block","Sub-District","Village","Country"]
111
+
112
+
113
+
114
+
115
+ # Short namings for Granularity_values
116
# Lookup table: granularity label -> abbreviation embedded in dataset names.
granularity_short_codes = {
    "District": "DIS",
    "State": "STA",
    "Tehsil": "TEH",
    "Other Level": "OTH",
    "India": "IND",
    "Assembly Constituency": "AC",
    "Point Level": "PL",
    "Gram Panchayat": "GP",
    "Block": "BL",
    "Sub-District": "SD",
    "Village": "VIL",
    "Country": "CTRY",
}
130
+ # frequency_values = ['Yearly', 'Weekly', 'Quinquennial', 'Daily', 'Fortnightly', 'Monthly', 'Seasonally', 'Other / One Time']
131
+
132
+
133
+ # Short namings for frequency_values
134
# Lookup table: update-frequency label -> one-letter code embedded in dataset names.
frequency_short_codes = {
    "Yearly": "Y",
    "Weekly": "W",
    "Quinquennial": "Q",
    "Daily": "D",
    "Fortnightly": "F",
    "Monthly": "M",
    "Seasonally": "S",
    "Other / One Time": "O",
}
144
+
145
+
146
+ # Read counter from file
147
def read_counter():
    """Return the dataset counter stored in ``counter.txt``.

    The file is opened relative to the current working directory and is
    expected to contain a single integer.

    Returns:
        int: The stored counter, or ``1`` when the file does not exist or
        its contents cannot be parsed as an integer (for example an empty
        file).
    """
    try:
        with open('counter.txt', 'r') as f:
            return int(f.read())
    except (FileNotFoundError, ValueError):
        # A missing or unparsable counter restarts numbering at 1 instead of
        # raising and taking the whole page down.
        return 1
154
+
155
+ # Update and save counter to file
156
def update_counter(counter):
    """Overwrite ``counter.txt`` with the string form of *counter*.

    The file is created in the current working directory if it does not
    exist; any previous contents are replaced.
    """
    with open('counter.txt', 'w') as handle:
        handle.write(str(counter))
159
+
160
+ # Generate unique dataset IDs
161
def generate_dataset_id(counter):
    """Build a dataset ID: ``DID`` followed by *counter* zero-padded to a minimum width of three."""
    padded = format(counter, '03')
    return 'DID' + padded
163
+
164
+ # Generate dataset names
165
def generate_dataset_name(data_source_code, sector_code, start_year, end_year, dataset_id, granularity, frequency):
    """Assemble the hyphen-separated dataset name.

    The result has the form
    ``<data_source_code>-<sector_code>-<granularity>-<frequency>-<dataset_id>``.
    ``granularity`` and ``frequency`` are translated through
    ``granularity_short_codes`` and ``frequency_short_codes``; a label missing
    from its table is rendered as ``UNK``. ``start_year`` and ``end_year`` are
    accepted but do not appear in the generated name.
    """
    segments = (
        data_source_code,
        sector_code,
        granularity_short_codes.get(granularity, 'UNK'),
        frequency_short_codes.get(frequency, 'UNK'),
        dataset_id,
    )
    # format() renders each segment the same way an f-string placeholder would.
    return '-'.join(format(segment) for segment in segments)
169
+
170
+ # List to store existing dataset names
171
# Registry of dataset names consulted by the uniqueness check.
existing_dataset_names = []


def check_dataset_name_uniqueness(dataset_name):
    """Return ``True`` when *dataset_name* is absent from ``existing_dataset_names``."""
    already_registered = dataset_name in existing_dataset_names
    return not already_registered
176
+
177
+
178
def generate_download_link(mapped_dataset):
    """Render a download link for *mapped_dataset* on the Streamlit page.

    The data is serialised with ``to_csv(index=False)``, base64-encoded and
    embedded in an HTML anchor as a ``data:`` URI.

    Args:
        mapped_dataset: Object exposing ``to_csv`` (the caller in this file
            passes a pandas ``DataFrame``).

    Returns:
        None. A success banner and the link are written to the page.
    """
    csv_text = mapped_dataset.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode()
    # The payload is CSV text, so the MIME type and the suggested filename
    # must say so; labelling it as .xlsx yields a file that is not a workbook.
    href = (
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        'download="mapped_dataset.csv">Download</a>'
    )
    st.success('Download Mapped Dataset')
    # unsafe_allow_html is required for the raw anchor tag to be rendered.
    st.markdown(href, unsafe_allow_html=True)
185
+
186
+ # Streamlit App
187
def main():
    """Run the Streamlit page that builds and exports a dataset name.

    The page collects a data source, sector, start/end year, granularity,
    frequency and the original dataset name, shows the generated name, and,
    when 'Save to Excel' is pressed and the name is unique, advances the
    persisted counter and offers the record as a download.

    Returns:
        None.
    """
    st.title('Dataset Naming App')

    # The persisted counter supplies the numeric part of the dataset ID.
    counter = read_counter()

    # The select boxes show full names; map the chosen name back to its code.
    data_source = st.selectbox('Select Data Source', list(data_source_codes.values()))
    data_source_code = next(code for code, name in data_source_codes.items() if name == data_source)

    sector = st.selectbox('Select Sector', list(sector_codes.values()))
    sector_code = next(code for code, name in sector_codes.items() if name == sector)

    start_year = st.number_input('Enter Start Year', min_value=2000, max_value=2100, value=2022)

    # The end year may not precede the start year. The default has to follow
    # the lower bound, otherwise a start year above 2022 would put the
    # default below min_value.
    end_year = st.number_input(
        'Enter End Year',
        min_value=start_year,
        max_value=2100,
        value=max(start_year, 2022),
    )

    granularity = st.selectbox('Select Granularity', list(granularity_short_codes.keys()))
    frequency = st.selectbox('Select Frequency', list(frequency_short_codes.keys()))

    dataset_id = generate_dataset_id(counter)
    dataset_name = generate_dataset_name(
        data_source_code, sector_code, start_year, end_year, dataset_id, granularity, frequency
    )

    original_dataset_name = st.text_input('Enter Original Dataset Name')

    is_unique = check_dataset_name_uniqueness(dataset_name)

    st.write('Generated Dataset Name:')
    st.write(dataset_name)

    if not is_unique:
        st.warning('Dataset name is not unique. Please generate a new name.')

    # The button is evaluated first so it is always rendered, even when the
    # name is not unique.
    if st.button('Save to Excel') and is_unique:
        df = pd.DataFrame({
            'Dataset Name': [dataset_name],
            'Data Source': [data_source],
            'Sector': [sector],
            'Start Year': [start_year],
            'End Year': [end_year],
            'Granularity': [granularity],
            'Frequency': [frequency],
            'Original Dataset Name': [original_dataset_name],
        })

        # Record the name so the uniqueness check has something to compare
        # against.
        # NOTE(review): the list is a module-level variable; confirm it
        # survives between page reruns before relying on this check.
        existing_dataset_names.append(dataset_name)

        # Advance the persisted counter so the next name gets a fresh ID.
        update_counter(counter + 1)
        generate_download_link(df)
        st.success('Dataset information saved to Excel.')
262
+
263
+ if __name__ == '__main__':
264
+ main()
265
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ langchain
4
+ openai
5
+ tabulate