ARBAJSSHAIKH committed on
Commit
97c0c62
·
verified ·
1 Parent(s): b5cfc6b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -0
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
+
5
+
6
def _fetch_response(url, timeout=30):
    """Fetch *url* and return the ``requests`` response, or ``None`` on failure.

    A failure is either a transport-level error raised by ``requests`` or any
    status code other than 200. Both are reported with ``print`` so that one
    bad page does not abort a long scraping run.
    """
    import requests  # function-scope import, as in the original code

    try:
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"An error occurred: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch HTML. Status code: {response.status_code}")
        return None
    return response


def _first_column_names(html, wanted_headers, first_match_only=False):
    """Return the first-cell text of each data row in the matching tables.

    A table matches when at least one of its ``<th>`` texts is in
    *wanted_headers*. The first ``<tr>`` of a matching table is skipped (it is
    treated as the header row), and rows without any ``<td>`` or with an empty
    first cell are ignored. With *first_match_only* the scan stops after the
    first matching table.
    """
    from bs4 import BeautifulSoup

    names = []
    soup = BeautifulSoup(html, 'html.parser')
    for table in soup.find_all('table'):
        headers = {th.text.strip() for th in table.find_all('th')}
        if not headers & wanted_headers:
            continue
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if not cells:
                # Rows made only of <th> cells carry no name to collect.
                continue
            name = cells[0].text.strip()
            if name:
                names.append(name)
        if first_match_only:
            break
    return names


def _taluka_rows(url, div_class='mt20'):
    """Return the data rows of every village/town table on a taluka page.

    Only tables inside the first ``<div>`` with class *div_class* are
    considered, and only those whose headers contain 'Population', 'Literacy',
    'Sex-ratio' and either 'Village' or 'Town'. Each row is a list of stripped
    ``<td>`` texts; rows without ``<td>`` cells are dropped. An empty list is
    returned when the page cannot be fetched or the div is missing.
    """
    from bs4 import BeautifulSoup

    response = _fetch_response(url)
    if response is None:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', class_=div_class)
    if div is None:
        print("Div with class '{}' not found on the page.".format(div_class))
        return []

    rows = []
    for table in div.find_all('table'):
        headers = [th.text.strip() for th in table.find_all('th')]
        has_common = all(col in headers for col in ('Population', 'Literacy', 'Sex-ratio'))
        if not has_common or not ('Village' in headers or 'Town' in headers):
            continue
        for tr in table.find_all('tr'):
            cells = [td.text.strip() for td in tr.find_all('td')]
            if cells:
                rows.append(cells)
    return rows


def _safe_sheet_title(name):
    """Return *name* made acceptable as a worksheet title.

    Characters that openpyxl rejects in titles are replaced with '-', and the
    result is cut to 31 characters, the longest title Excel accepts.
    """
    import re

    return re.sub(r'[\\/*?:\[\]]', '-', name)[:31]


def god(STATE, output_root="D:/"):
    """Scrape censusindia2011.com for one state and save one workbook per district.

    The state page is read for district names, each district page for its
    taluka names (header 'Taluka', 'Taluk', 'Mandal' or 'Tehsil'), and each
    taluka page for its village/town rows. For every district a file
    ``<district>.xlsx`` is written to ``<output_root>/<STATE>``, holding one
    sheet per taluka. Sheets contain the data rows only; no header row is
    written.

    Args:
        STATE: State name as it appears in the site's URLs (spaces already
            replaced by hyphens, e.g. 'Uttar-Pradesh').
        output_root: Directory under which the state folder is created.
            Defaults to 'D:/', the location the function always used before.

    Raises:
        RuntimeError: If the state page itself cannot be fetched, because
            nothing can be scraped without the district list.
    """
    import os

    import openpyxl

    base_url = "https://www.censusindia2011.com"
    taluka_headers = {'Taluka', 'Taluk', 'Mandal', 'Tehsil'}  # Add more variations if needed

    output_directory = os.path.join(output_root, STATE)
    os.makedirs(output_directory, exist_ok=True)

    state_response = _fetch_response(f"{base_url}/{STATE}-population.html")
    if state_response is None:
        raise RuntimeError(f"Could not fetch the population page for state '{STATE}'.")
    districts = _first_column_names(state_response.text, {'District'})

    for district in districts:
        formatted_district_name = district.replace(" ", "-")
        district_response = _fetch_response(
            f"{base_url}/{STATE}/{formatted_district_name}-population.html"
        )
        if district_response is None:
            # One unreachable district must not abort the remaining ones.
            print(f"Skipping district '{district}': page could not be fetched.")
            continue

        # Taluka names are used in URLs, so spaces become hyphens.
        taluka_names = [
            name.replace(" ", "-")
            for name in _first_column_names(
                district_response.text, taluka_headers, first_match_only=True
            )
        ]

        wb = openpyxl.Workbook()
        default_sheet = wb.active

        for taluka_name in taluka_names:
            url = f"{base_url}/{STATE}/{formatted_district_name}/{taluka_name}-population.html"
            combined_sheet = wb.create_sheet(title=_safe_sheet_title(taluka_name))
            # append() always writes below the last used row, so a town table
            # no longer overwrites the village table written before it.
            for row in _taluka_rows(url):
                combined_sheet.append(row)

        # Drop the default sheet only when a taluka sheet exists; a workbook
        # with no sheets cannot be saved.
        if taluka_names:
            wb.remove(default_sheet)

        wb.save(os.path.join(output_directory, f'{district}.xlsx'))
159
+
160
+
161
+
162
+
163
+
164
# Streamlit page: pick a state, run the scraper, then describe the output.
st.title("GENERICART")

# State names are listed already hyphenated because they are placed directly
# into the site's URLs by god().
state_names = [
    'Uttar-Pradesh', 'Maharashtra', 'Bihar', 'West-Bengal', 'Andhra-Pradesh',
    'Madhya-Pradesh', 'Tamil-Nadu', 'Rajasthan', 'Karnataka', 'Gujarat',
    'Odisha', 'Kerala', 'Jharkhand', 'Assam', 'Punjab',
    'Haryana', 'NCT-Of-Delhi', 'Jammu-&-Kashmir', 'Uttarakhand',
    'Himachal-Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland',
    'Goa', 'Arunachal-Pradesh', 'Puducherry', 'Mizoram', 'Chandigarh',
    'Sikkim', 'Andaman-&-Nicobar-Islands', 'Dadra-&-Nagar-Haveli',
    'Daman-&-Diu', 'Lakshadweep',
]

state = st.selectbox("Select a state:", state_names)

# Button to trigger the processing function
if st.button("Run Processing"):
    try:
        # Show a spinner while the (slow, network-bound) scrape is running
        with st.spinner("Processing..."):
            god(state)
    except Exception as exc:
        # Top-level UI boundary: report the failure on the page instead of
        # claiming success or showing a raw traceback.
        st.error(f"Processing failed: {exc}")
    else:
        st.success("Processing complete!")

        st.subheader("Data Organization in Folder")

        # Describe the output layout: one workbook per district, one sheet
        # per taluka.
        st.write(
            "In the D drive, a folder named after the selected state has been created. "
            "The folder contains one Excel workbook for each district within the state. "
            "The data in every workbook follows a structured format, where the sheet names correspond to the respective talukas."
        )

        # Display information about the columns in each sheet
        st.write("Each sheet contains essential demographic information, meticulously arranged in columns for ease of analysis. The columns include:")
        st.write("- **VILLAGE:** This field provides a comprehensive list of villages within the taluka, offering a granular view of the geographic distribution.")
        st.write("- **POPULATION:** The population column quantifies the total number of residents in each village, facilitating an understanding of settlement sizes.")
        st.write("- **LITERACY:** Literacy rates are documented, reflecting the proportion of individuals who possess basic reading and writing skills in the specified villages.")
        st.write("- **SEX RATIO:** The sex ratio column provides insights into the gender distribution within the villages, aiding in the evaluation of gender demographics.")