File size: 8,667 Bytes
97c0c62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import streamlit as st




def god(STATE, output_root="D:/", timeout=30):
    """Scrape censusindia2011.com for one state and save a workbook per district.

    The state page is read for its district table, every district page for
    its taluka table, and every taluka page for its village/town tables.
    Each district is saved as ``<output_root>/<STATE>/<district>.xlsx`` with
    one sheet per taluka containing the table rows (no header row is written).

    Fetching is best-effort: a page that cannot be downloaded is reported
    with ``print`` and skipped instead of aborting the whole run.

    Args:
        STATE: State name as it appears in the site's URLs (multi-word names
            joined with hyphens, e.g. ``"Uttar-Pradesh"``).
        output_root: Directory under which the state folder is created.
            Defaults to ``"D:/"``.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        List with the path of every workbook that was saved; empty when the
        state page could not be fetched or listed no districts.
    """
    import os

    import openpyxl
    import requests
    from bs4 import BeautifulSoup

    base_url = "https://www.censusindia2011.com"
    taluka_headers = {'Taluka', 'Taluk', 'Mandal', 'Tehsil'}  # Add more variations if needed
    village_columns = ('Village', 'Population', 'Literacy', 'Sex-ratio')
    town_columns = ('Town', 'Population', 'Literacy', 'Sex-ratio')
    # openpyxl raises ValueError for these characters in a sheet title.
    title_cleanup = str.maketrans({char: "-" for char in '\\/*?:[]'})

    output_directory = os.path.join(output_root, STATE)
    os.makedirs(output_directory, exist_ok=True)

    def fetch(url):
        """Return the HTTP 200 response for *url*, or None after reporting why not."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f"An error occurred: {str(e)}")
            return None
        if response.status_code != 200:
            print(f"Failed to fetch HTML. Status code: {response.status_code}")
            return None
        return response

    def first_column_names(html, wanted_headers, first_table_only):
        """Collect first-column cell texts from tables having any wanted header."""
        names = []
        soup = BeautifulSoup(html, 'html.parser')
        for table in soup.find_all('table'):
            headers = {header.text.strip() for header in table.find_all('th')}
            if not wanted_headers & headers:
                continue
            # The first <tr> is treated as the header row and skipped.
            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                # Rows without <td> cells carry no name; indexing them would fail.
                if not cells:
                    continue
                name = cells[0].text.strip()
                if name:
                    names.append(name)
            if first_table_only:
                break
        return names

    def village_rows(container):
        """Return the <td> rows of every village or town table inside *container*."""
        rows = []
        for table in container.find_all('table'):
            headers = [header.text.strip() for header in table.find_all('th')]
            is_village_table = all(col in headers for col in village_columns)
            is_town_table = all(col in headers for col in town_columns)
            if not (is_village_table or is_town_table):
                continue
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                if cells:
                    rows.append(cells)
        return rows

    state_page = fetch(f"{base_url}/{STATE}-population.html")
    if state_page is None:
        return []
    # dict.fromkeys drops repeated names while keeping page order, so a
    # district listed in two tables is not scraped twice.
    districts = list(dict.fromkeys(
        first_column_names(state_page.text, {'District'}, False)
    ))

    saved_paths = []
    for district in districts:
        formatted_district_name = district.replace(" ", "-")
        district_page = fetch(
            f"{base_url}/{STATE}/{formatted_district_name}-population.html"
        )
        if district_page is None:
            continue

        # Only the first matching table is used; spaces become hyphens
        # because the names are placed in the taluka URLs.
        taluka_names = [
            name.replace(" ", "-")
            for name in first_column_names(district_page.text, taluka_headers, True)
        ]

        wb = openpyxl.Workbook()
        for taluka_name in taluka_names:
            # Excel caps sheet titles at 31 characters.
            sheet = wb.create_sheet(title=taluka_name.translate(title_cleanup)[:31])

            taluka_page = fetch(
                f"{base_url}/{STATE}/{formatted_district_name}/{taluka_name}-population.html"
            )
            if taluka_page is None:
                continue  # the sheet stays in the workbook, just empty
            container = BeautifulSoup(taluka_page.content, 'html.parser').find(
                'div', class_='mt20'
            )
            if container is None:
                continue

            # Appending keeps village and town rows one after another instead
            # of the second table overwriting the first from row 1.
            for row in village_rows(container):
                sheet.append(row)

        # Remove the default sheet created by openpyxl (Sheet), but only when
        # another sheet exists: a workbook needs at least one sheet to save.
        if len(wb.sheetnames) > 1 and 'Sheet' in wb.sheetnames:
            wb.remove(wb['Sheet'])

        workbook_path = os.path.join(output_directory, f'{district}.xlsx')
        wb.save(workbook_path)
        saved_paths.append(workbook_path)

    return saved_paths





st.title("GENERICART")
# State names are spelled the way god() places them in the site's URLs:
# multi-word names are joined with hyphens (e.g. Uttar-Pradesh).
state_names = [
    'Uttar-Pradesh', 'Maharashtra', 'Bihar', 'West-Bengal', 'Andhra-Pradesh',
    'Madhya-Pradesh', 'Tamil-Nadu', 'Rajasthan', 'Karnataka', 'Gujarat',
    'Odisha', 'Kerala', 'Jharkhand', 'Assam', 'Punjab',
    'Haryana', 'NCT-Of-Delhi', 'Jammu-&-Kashmir', 'Uttarakhand',
    'Himachal-Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland',
    'Goa', 'Arunachal-Pradesh', 'Puducherry', 'Mizoram', 'Chandigarh',
    'Sikkim', 'Andaman-&-Nicobar-Islands', 'Dadra-&-Nagar-Haveli',
    'Daman-&-Diu', 'Lakshadweep',
]

state = st.selectbox("Select a state:", state_names)

# Button to trigger the processing function
if st.button("Run Processing"):
    # god() fetches one page per district and per taluka, so keep a spinner
    # on screen while it runs.
    with st.spinner("Processing..."):
        god(state)

    # Reached only after god() has returned; the spinner is gone by then.
    st.success("Processing complete!")

st.subheader("Data Organization in Folder")

# Describe where the output lands: one folder per state, one workbook per
# district, one sheet per taluka.
st.write(
    "In the D drive, a folder named after the selected state is created. "
    "It contains one Excel workbook per district of that state. "
    "Each workbook holds one sheet per taluka, and every sheet is named after its taluka."
)

# Display information about the columns in each sheet
st.write("Each sheet contains essential demographic information, meticulously arranged in columns for ease of analysis. The columns include:")
column_notes = (
    "- **VILLAGE:** This field provides a comprehensive list of villages within the taluka, offering a granular view of the geographic distribution.",
    "- **POPULATION:** The population column quantifies the total number of residents in each village, facilitating an understanding of settlement sizes.",
    "- **LITERACY:** Literacy rates are documented, reflecting the proportion of individuals who possess basic reading and writing skills in the specified villages.",
    "- **SEX RATIO:** The sex ratio column provides insights into the gender distribution within the villages, aiding in the evaluation of gender demographics.",
)
for note in column_notes:
    st.write(note)