Spaces:

ARBAJSSHAIKH
/

WEB-SCRAPING

Build error

File size: 8,667 Bytes

97c0c62

import streamlit as st




def god(STATE, output_root="D:/"):
    """Scrape census tables for one state into per-district Excel workbooks.

    The state page on censusindia2011.com is read for its district names,
    each district page is read for its taluka names, and each taluka page is
    read for its Village/Town tables.  One workbook per district is saved as
    ``<output_root>/<STATE>/<district>.xlsx`` with one sheet per taluka.
    Sheets hold the data rows only (no header row); when a taluka page has
    both a Village and a Town table, their rows are written one after the
    other instead of on top of each other.

    Args:
        STATE: State name exactly as it appears in the site's URLs
            (words joined with hyphens, e.g. ``"Uttar-Pradesh"``).
        output_root: Directory under which the ``STATE`` folder is created.
            Defaults to ``"D:/"``, the location the original code used.

    Raises:
        RuntimeError: If the state page cannot be fetched, since nothing
            can be scraped without the district list.
    """
    # Imports stay local so the Streamlit page can load before these
    # packages are needed.
    import os

    import openpyxl
    import requests
    from bs4 import BeautifulSoup

    base_url = "https://www.censusindia2011.com"
    request_timeout = 30  # seconds; keeps a stalled server from hanging the app
    taluka_headers = {'Taluka', 'Taluk', 'Mandal', 'Tehsil'}  # Add more variations if needed
    common_columns = {'Population', 'Literacy', 'Sex-ratio'}
    # openpyxl raises ValueError for these characters in a sheet title.
    invalid_title_chars = str.maketrans({ch: "-" for ch in "\\*?:/[]"})

    def fetch_page(url):
        """Return the HTTP response for *url*, or None if the fetch failed."""
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as e:
            print(f"An error occurred: {str(e)}")
            return None
        if response.status_code != 200:
            print(f"Failed to fetch HTML. Status code: {response.status_code}")
            return None
        return response

    def first_column_values(table):
        """Return the stripped first-cell text of each data row of *table*."""
        values = []
        # The first row is skipped as the header row, as before.
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if cells:  # rows without <td> cells carry no name
                values.append(cells[0].text.strip())
        return values

    def header_texts(table):
        """Return the stripped text of every <th> cell in *table*."""
        return [header.text.strip() for header in table.find_all('th')]

    def extract_district_names(html):
        """Collect names from every table that has a 'District' header."""
        soup = BeautifulSoup(html, 'html.parser')
        district_names = []
        for table in soup.find_all('table'):
            if 'District' in header_texts(table):
                district_names.extend(first_column_values(table))
        return district_names

    def extract_taluka_names(html):
        """Return URL-ready taluka names from the first matching table."""
        soup = BeautifulSoup(html, 'html.parser')
        for table in soup.find_all('table'):
            if taluka_headers & set(header_texts(table)):
                # Spaces become hyphens because the names are used in URLs.
                return [name.replace(" ", "-") for name in first_column_values(table)]
        return []

    def extract_data_rows(html, div_class='mt20'):
        """Return the data rows of all Village/Town tables inside the div."""
        soup = BeautifulSoup(html, 'html.parser')
        container = soup.find('div', class_=div_class)
        if container is None:
            print("Div with class '{}' not found on the page.".format(div_class))
            return []
        data_rows = []
        for table in container.find_all('table'):
            headers = set(header_texts(table))
            if not common_columns <= headers:
                continue
            if 'Village' not in headers and 'Town' not in headers:
                continue
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                if cells:
                    data_rows.append(cells)
        return data_rows

    output_directory = os.path.join(output_root, STATE)
    os.makedirs(output_directory, exist_ok=True)

    state_page = fetch_page(f"{base_url}/{STATE}-population.html")
    if state_page is None:
        raise RuntimeError(f"Could not fetch the district list for state '{STATE}'.")
    districts = extract_district_names(state_page.text)

    for district in districts:
        formatted_district_name = district.replace(" ", "-")
        district_page = fetch_page(
            f"{base_url}/{STATE}/{formatted_district_name}-population.html"
        )
        if district_page is None:
            # One unreachable district should not abort the whole state.
            print(f"Skipping district '{district}': page could not be fetched.")
            continue

        wb = openpyxl.Workbook()

        for taluka_name in extract_taluka_names(district_page.text):
            sheet = wb.create_sheet(title=taluka_name.translate(invalid_title_chars))
            taluka_page = fetch_page(
                f"{base_url}/{STATE}/{formatted_district_name}/{taluka_name}-population.html"
            )
            if taluka_page is None:
                continue  # the sheet stays empty, as it did for a failed page before
            # append() writes below the last used row, so rows from a second
            # table no longer overwrite those of the first.
            for row in extract_data_rows(taluka_page.content):
                sheet.append(row)

        # Remove the default sheet created by openpyxl (Sheet), but only when
        # at least one taluka sheet exists: a workbook needs one sheet to save.
        if len(wb.sheetnames) > 1 and 'Sheet' in wb.sheetnames:
            wb.remove(wb['Sheet'])

        wb.save(os.path.join(output_directory, f'{district}.xlsx'))





# ---------------------------------------------------------------------------
# Streamlit page: pick a state, run the scraper, then describe the output.
# ---------------------------------------------------------------------------
st.title("GENERICART")

# State names are offered with words already joined by "-" so the user
# does not have to type them that way by hand.
state_names = [
    'Uttar-Pradesh', 'Maharashtra', 'Bihar', 'West-Bengal', 'Andhra-Pradesh',
    'Madhya-Pradesh', 'Tamil-Nadu', 'Rajasthan', 'Karnataka', 'Gujarat',
    'Odisha', 'Kerala', 'Jharkhand', 'Assam', 'Punjab',
    'Haryana', 'NCT-Of-Delhi', 'Jammu-&-Kashmir', 'Uttarakhand',
    'Himachal-Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland',
    'Goa', 'Arunachal-Pradesh', 'Puducherry', 'Mizoram', 'Chandigarh',
    'Sikkim', 'Andaman-&-Nicobar-Islands', 'Dadra-&-Nagar-Haveli',
    'Daman-&-Diu', 'Lakshadweep',
]

state = st.selectbox("Select a state:", state_names)

# The scrape is slow, so it only starts on an explicit button press and a
# spinner is shown for as long as it runs.
if st.button("Run Processing"):
    with st.spinner("Processing..."):
        god(state)

    # Reached only after the spinner block has finished.
    st.success("Processing complete!")

st.subheader("Data Organization in Folder")

# Explanation of where the output lands and how it is laid out.
st.write(
    "In the D drive, folder has been identified, named after the specified state. "
    "This file encompasses multiple sheets, each dedicated to individual districts within the state. "
    "The organization of data within these sheets adheres to a structured format, where the sheet names correspond to the respective talukas."
)

# Explanation of the columns, one bullet per column.
st.write("Each sheet contains essential demographic information, meticulously arranged in columns for ease of analysis. The columns include:")

_column_notes = (
    "- **VILLAGE:** This field provides a comprehensive list of villages within the taluka, offering a granular view of the geographic distribution.",
    "- **POPULATION:** The population column quantifies the total number of residents in each village, facilitating an understanding of settlement sizes.",
    "- **LITERACY:** Literacy rates are documented, reflecting the proportion of individuals who possess basic reading and writing skills in the specified villages.",
    "- **SEX RATIO:** The sex ratio column provides insights into the gender distribution within the villages, aiding in the evaluation of gender demographics.",
)
for _note in _column_notes:
    st.write(_note)