# WEB-SCRAPING / app.py
# ARBAJSSHAIKH's picture
# Create app.py
# 97c0c62 verified
import streamlit as st
def god(STATE):
    """Scrape censusindia2011.com for every district of *STATE* and save
    one Excel workbook per district under ``D:/<STATE>``.

    Each workbook holds one sheet per taluka; a sheet contains the
    village/town population tables scraped from that taluka's page.

    Parameters
    ----------
    STATE : str
        State name as used in censusindia2011.com URLs, with spaces
        replaced by hyphens (e.g. ``"Uttar-Pradesh"``).

    Side effects: network requests, directory creation and .xlsx files
    written under D:/<STATE>; returns None.
    """
    # Imports stay local (as in the original) so importing the app module
    # does not require these packages up front.
    import os

    import openpyxl
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    output_directory = f"D:/{STATE}"
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(output_directory, exist_ok=True)

    def fetch_html(url):
        """Return the page body for *url*, or None on any failure."""
        try:
            response = requests.get(url)
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            return None
        if response.status_code == 200:
            return response.text
        print(f"Failed to fetch HTML. Status code: {response.status_code}")
        return None

    def extract_district_names(html):
        """Collect district names from every table that has a 'District'
        header column.  Returns [] when *html* is None (failed fetch)."""
        district_names = []
        if html is None:
            return district_names
        soup = BeautifulSoup(html, 'html.parser')
        for table in soup.find_all('table'):
            headers = [header.text.strip() for header in table.find_all('th')]
            if 'District' in headers:
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    if cells:  # skip header/separator rows with no <td>
                        district_names.append(cells[0].text.strip())
        return district_names

    def extract_taluka_names(html):
        """Collect taluka names (hyphenated, URL-ready) from the first
        table whose headers include a Taluka/Taluk/Mandal/Tehsil column."""
        taluka_names = []
        if html is None:
            return taluka_names
        soup = BeautifulSoup(html, 'html.parser')
        wanted = {'Taluka', 'Taluk', 'Mandal', 'Tehsil'}  # add more variations if needed
        for table in soup.find_all('table'):
            headers = [header.text.strip() for header in table.find_all('th')]
            if wanted & set(headers):
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    if cells:
                        # Hyphenate so the name slots straight into a URL.
                        taluka_names.append(cells[0].text.strip().replace(" ", "-"))
                break  # first matching table is enough
        return taluka_names

    def get_html_inside_div(url, div_class):
        """Return the HTML of the first <div> with *div_class* at *url*,
        or a human-readable error string (kept for parity with the
        original; downstream parsing of an error string simply finds no
        tables)."""
        response = requests.get(url)
        if response.status_code != 200:
            return "Failed to retrieve webpage. Status code: {}".format(response.status_code)
        soup = BeautifulSoup(response.content, 'html.parser')
        div = soup.find('div', class_=div_class)
        if div:
            return str(div)
        return "Div with class '{}' not found on the page.".format(div_class)

    def html_to_table(html):
        """Extract (headers, rows) for every Village/Town population table."""
        soup = BeautifulSoup(html, 'html.parser')
        result = []
        common = ['Population', 'Literacy', 'Sex-ratio']
        for table in soup.find_all('table'):
            headers = [header.text.strip() for header in table.find_all('th')]
            if (('Village' in headers or 'Town' in headers)
                    and all(col in headers for col in common)):
                rows = []
                for row in table.find_all('tr'):
                    cells = [cell.text.strip() for cell in row.find_all('td')]
                    if cells:
                        rows.append(cells)
                result.append((headers, rows))
        return result

    state_html = fetch_html(f"https://www.censusindia2011.com/{STATE}-population.html")
    districts = extract_district_names(state_html)

    for district in districts:
        formatted_district_name = district.replace(" ", "-")
        district_html = fetch_html(
            f"https://www.censusindia2011.com/{STATE}/{formatted_district_name}-population.html")
        wb = openpyxl.Workbook()
        for taluka_name in extract_taluka_names(district_html):
            url = (f"https://www.censusindia2011.com/{STATE}/"
                   f"{formatted_district_name}/{taluka_name}-population.html")
            html = str(get_html_inside_div(url, 'mt20'))
            tables = html_to_table(html)
            combined_sheet = wb.create_sheet(title=f'{taluka_name}')
            next_row = 1
            for headers, rows in tables:
                df = pd.DataFrame(rows, columns=headers)
                # Append each table below the previous one; the original
                # restarted r_idx at 1 per table and overwrote earlier
                # tables on the same sheet.
                for row_values in df.values:
                    for c_idx, value in enumerate(row_values, 1):
                        combined_sheet.cell(row=next_row, column=c_idx, value=value)
                    next_row += 1
        # Remove openpyxl's default sheet, but only when at least one real
        # sheet exists: a workbook with zero visible sheets cannot be saved.
        if 'Sheet' in wb.sheetnames and len(wb.sheetnames) > 1:
            wb.remove(wb['Sheet'])
        wb.save(os.path.join(output_directory, f'{district}.xlsx'))
# --- Streamlit UI: state picker, run button, and output description. ---
st.title("GENERICART")
#st.write("If name of STATE contains more than 1 word then join with - ,like uttar-pradesh")

# Census-2011 state/UT names, hyphenated to match censusindia2011.com URLs.
state_names = [
    'Uttar-Pradesh', 'Maharashtra', 'Bihar', 'West-Bengal', 'Andhra-Pradesh',
    'Madhya-Pradesh', 'Tamil-Nadu', 'Rajasthan', 'Karnataka', 'Gujarat',
    'Odisha', 'Kerala', 'Jharkhand', 'Assam', 'Punjab',
    'Haryana', 'NCT-Of-Delhi', 'Jammu-&-Kashmir', 'Uttarakhand',
    'Himachal-Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland',
    'Goa', 'Arunachal-Pradesh', 'Puducherry', 'Mizoram', 'Chandigarh',
    'Sikkim', 'Andaman-&-Nicobar-Islands', 'Dadra-&-Nagar-Haveli',
    'Daman-&-Diu', 'Lakshadweep']

state = st.selectbox("Select a state:", state_names)

# Button to trigger the (long-running) scrape.
if st.button("Run Processing"):
    # Show a spinner while the scrape is ongoing; it disappears when the
    # `with` block exits.
    with st.spinner("Processing..."):
        god(state)
    st.success("Processing complete!")
    # Explain where the output landed and how it is organised.
    st.subheader("Data Organization in Folder")
    st.write(
        f"In the D drive, folder has been identified, named after the specified state. "
        f"This file encompasses multiple sheets, each dedicated to individual districts within the state. "
        f"The organization of data within these sheets adheres to a structured format, where the sheet names correspond to the respective talukas."
    )
    st.write("Each sheet contains essential demographic information, meticulously arranged in columns for ease of analysis. The columns include:")
    st.write("- **VILLAGE:** This field provides a comprehensive list of villages within the taluka, offering a granular view of the geographic distribution.")
    st.write("- **POPULATION:** The population column quantifies the total number of residents in each village, facilitating an understanding of settlement sizes.")
    st.write("- **LITERACY:** Literacy rates are documented, reflecting the proportion of individuals who possess basic reading and writing skills in the specified villages.")
    st.write("- **SEX RATIO:** The sex ratio column provides insights into the gender distribution within the villages, aiding in the evaluation of gender demographics.")