import streamlit as st def god(STATE): import requests from bs4 import BeautifulSoup import pandas as pd import openpyxl import os output_directory = f"D:/{STATE}" if not os.path.exists(output_directory): os.makedirs(output_directory) def arbaj(STATE): STATE_URL = f"https://www.censusindia2011.com/{STATE}-population.html" def fetch_html(url): try: response = requests.get(url) if response.status_code == 200: return response.text else: print(f"Failed to fetch HTML. Status code: {response.status_code}") return None except Exception as e: print(f"An error occurred: {str(e)}") return None url = STATE_URL html = fetch_html(url) def extract_district_names(html): district_names = [] soup = BeautifulSoup(html, 'html.parser') tables = soup.find_all('table') for table in tables: headers = [header.text.strip() for header in table.find_all('th')] if 'District' in headers: rows = table.find_all('tr') for row in rows[1:]: cells = row.find_all('td') district_name = cells[0].text.strip() district_names.append(district_name) return district_names district_names = extract_district_names(html) return district_names districts = arbaj(STATE) def format_district_name(district_name): if ' ' in district_name: return district_name.replace(" ", "-") else: return district_name for district in districts: formatted_district_name = format_district_name(district) DISTRICT_URL = f"https://www.censusindia2011.com/{STATE}/{formatted_district_name}-population.html" def fetch_html(url): try: response = requests.get(url) if response.status_code == 200: return response.text else: print(f"Failed to fetch HTML. Status code: {response.status_code}") return None except Exception as e: print(f"An error occurred: {str(e)}") return None html = fetch_html(DISTRICT_URL) wb = openpyxl.Workbook() def extract_taluka_names(html): taluka_names = [] soup = BeautifulSoup(html, 'html.parser') tables = soup.find_all('table') for table in tables: headers = [header.text.strip() for header in table.find_all('th')] taluka_headers = ['Taluka', 'Taluk', 'Mandal', 'Tehsil'] # Add more variations if needed matching_headers = set(taluka_headers) & set(headers) if matching_headers: rows = table.find_all('tr') for row in rows[1:]: cells = row.find_all('td') #taluka_name = cells[0].text.strip() taluka_name = cells[0].text.strip().replace(" ", "-") # Replace spaces with hyphens taluka_names.append(taluka_name) break # Break the loop if taluka names are found return taluka_names taluka_names = extract_taluka_names(html) for taluka_name in taluka_names: url = f"https://www.censusindia2011.com/{STATE}/{formatted_district_name}/{taluka_name}-population.html" def get_html_inside_div(url, div_class): # Fetch the webpage content response = requests.get(url) # Check if the request was successful if response.status_code == 200: # Parse the HTML content soup = BeautifulSoup(response.content, 'html.parser') # Find the div with the specified class div = soup.find('div', class_=div_class) # Check if the div is found if div: # Return the HTML content inside the div return str(div) else: return "Div with class '{}' not found on the page.".format(div_class) else: return "Failed to retrieve webpage. Status code: {}".format(response.status_code) html_inside_div = get_html_inside_div(url, 'mt20') html = str(html_inside_div) def html_to_table(html): soup = BeautifulSoup(html, 'html.parser') all_tables = soup.find_all('table') result = [] for i, table in enumerate(all_tables): headers = [header.text.strip() for header in table.find_all('th')] if all(col_name in headers for col_name in ['Village', 'Population', 'Literacy', 'Sex-ratio']) or all(col_name in headers for col_name in ['Town', 'Population', 'Literacy', 'Sex-ratio']): rows = [] for row in table.find_all('tr'): cells = [cell.text.strip() for cell in row.find_all('td')] if cells: rows.append(cells) result.append((headers, rows)) return result tables = html_to_table(html) combined_sheet = wb.create_sheet(title=f'{taluka_name}') for s, (headers, rows) in enumerate(tables): df = pd.DataFrame(rows, columns=headers) # Append the data to the combined sheet for r_idx, row in enumerate(df.values, 1): for c_idx, value in enumerate(row, 1): combined_sheet.cell(row=r_idx, column=c_idx, value=value) # Remove the default sheet created by openpyxl (Sheet) if 'Sheet' in wb.sheetnames: wb.remove(wb['Sheet']) # Save the workbook wb.save(os.path.join(output_directory, f'{district}.xlsx')) st.title("GENERICART") #st.write("If name of STATE contains more than 1 word then join with - ,like uttar-pradesh") state_names = [ 'Uttar-Pradesh', 'Maharashtra', 'Bihar', 'West-Bengal', 'Andhra-Pradesh', 'Madhya-Pradesh', 'Tamil-Nadu', 'Rajasthan', 'Karnataka', 'Gujarat', 'Odisha', 'Kerala', 'Jharkhand', 'Assam', 'Punjab', 'Haryana', 'NCT-Of-Delhi', 'Jammu-&-Kashmir', 'Uttarakhand', 'Himachal-Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland', 'Goa', 'Arunachal-Pradesh', 'Puducherry', 'Mizoram', 'Chandigarh', 'Sikkim', 'Andaman-&-Nicobar-Islands', 'Dadra-&-Nagar-Haveli', 'Daman-&-Diu', 'Lakshadweep'] state=st.selectbox("Select a state:", state_names) # Button to trigger the processing function if st.button("Run Processing"): # Show a spinner while the processing is ongoing with st.spinner("Processing..."): # Call your time-consuming function here god(state) # Once the processing is done, remove the spinner st.success("Processing complete!") st.subheader(f"Data Organization in Folder") # Write information about the Excel file and data organization st.write( f"In the D drive, folder has been identified, named after the specified state. " f"This file encompasses multiple sheets, each dedicated to individual districts within the state. " f"The organization of data within these sheets adheres to a structured format, where the sheet names correspond to the respective talukas." ) # Display information about the columns in the DataFrame st.write("Each sheet contains essential demographic information, meticulously arranged in columns for ease of analysis. The columns include:") st.write("- **VILLAGE:** This field provides a comprehensive list of villages within the taluka, offering a granular view of the geographic distribution.") st.write("- **POPULATION:** The population column quantifies the total number of residents in each village, facilitating an understanding of settlement sizes.") st.write("- **LITERACY:** Literacy rates are documented, reflecting the proportion of individuals who possess basic reading and writing skills in the specified villages.") st.write("- **SEX RATIO:** The sex ratio column provides insights into the gender distribution within the villages, aiding in the evaluation of gender demographics.")