# WEB-SCRAPING / app.py
# ARBAJSSHAIKH's picture
# Create app.py
# 97c0c62 verified
import streamlit as st
def god(STATE):
    """Scrape censusindia2011.com for every district of *STATE* and save
    one Excel workbook per district under ``D:/<STATE>``.

    Each workbook holds one sheet per taluka; a sheet contains the
    village/town population tables scraped from that taluka's page.

    Parameters
    ----------
    STATE : str
        State name as used in censusindia2011.com URLs, with spaces
        replaced by hyphens (e.g. ``"Uttar-Pradesh"``).

    Side effects: network requests, directory creation and .xlsx files
    written under D:/<STATE>; returns None.
    """
    # Imports stay local (as in the original) so importing the app module
    # does not require these packages up front.
    import os

    import openpyxl
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    output_directory = f"D:/{STATE}"
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(output_directory, exist_ok=True)

    def fetch_html(url):
        """Return the page body for *url*, or None on any failure."""
        try:
            response = requests.get(url)
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            return None
        if response.status_code == 200:
            return response.text
        print(f"Failed to fetch HTML. Status code: {response.status_code}")
        return None

    def extract_district_names(html):
        """Collect district names from every table that has a 'District'
        header column.  Returns [] when *html* is None (failed fetch)."""
        district_names = []
        if html is None:
            return district_names
        soup = BeautifulSoup(html, 'html.parser')
        for table in soup.find_all('table'):
            headers = [header.text.strip() for header in table.find_all('th')]
            if 'District' in headers:
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    if cells:  # skip header/separator rows with no <td>
                        district_names.append(cells[0].text.strip())
        return district_names

    def extract_taluka_names(html):
        """Collect taluka names (hyphenated, URL-ready) from the first
        table whose headers include a Taluka/Taluk/Mandal/Tehsil column."""
        taluka_names = []
        if html is None:
            return taluka_names
        soup = BeautifulSoup(html, 'html.parser')
        wanted = {'Taluka', 'Taluk', 'Mandal', 'Tehsil'}  # add more variations if needed
        for table in soup.find_all('table'):
            headers = [header.text.strip() for header in table.find_all('th')]
            if wanted & set(headers):
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    if cells:
                        # Hyphenate so the name slots straight into a URL.
                        taluka_names.append(cells[0].text.strip().replace(" ", "-"))
                break  # first matching table is enough
        return taluka_names

    def get_html_inside_div(url, div_class):
        """Return the HTML of the first <div> with *div_class* at *url*,
        or a human-readable error string (kept for parity with the
        original; downstream parsing of an error string simply finds no
        tables)."""
        response = requests.get(url)
        if response.status_code != 200:
            return "Failed to retrieve webpage. Status code: {}".format(response.status_code)
        soup = BeautifulSoup(response.content, 'html.parser')
        div = soup.find('div', class_=div_class)
        if div:
            return str(div)
        return "Div with class '{}' not found on the page.".format(div_class)

    def html_to_table(html):
        """Extract (headers, rows) for every Village/Town population table."""
        soup = BeautifulSoup(html, 'html.parser')
        result = []
        common = ['Population', 'Literacy', 'Sex-ratio']
        for table in soup.find_all('table'):
            headers = [header.text.strip() for header in table.find_all('th')]
            if (('Village' in headers or 'Town' in headers)
                    and all(col in headers for col in common)):
                rows = []
                for row in table.find_all('tr'):
                    cells = [cell.text.strip() for cell in row.find_all('td')]
                    if cells:
                        rows.append(cells)
                result.append((headers, rows))
        return result

    state_html = fetch_html(f"https://www.censusindia2011.com/{STATE}-population.html")
    districts = extract_district_names(state_html)

    for district in districts:
        formatted_district_name = district.replace(" ", "-")
        district_html = fetch_html(
            f"https://www.censusindia2011.com/{STATE}/{formatted_district_name}-population.html")
        wb = openpyxl.Workbook()
        for taluka_name in extract_taluka_names(district_html):
            url = (f"https://www.censusindia2011.com/{STATE}/"
                   f"{formatted_district_name}/{taluka_name}-population.html")
            html = str(get_html_inside_div(url, 'mt20'))
            tables = html_to_table(html)
            combined_sheet = wb.create_sheet(title=f'{taluka_name}')
            next_row = 1
            for headers, rows in tables:
                df = pd.DataFrame(rows, columns=headers)
                # Append each table below the previous one; the original
                # restarted r_idx at 1 per table and overwrote earlier
                # tables on the same sheet.
                for row_values in df.values:
                    for c_idx, value in enumerate(row_values, 1):
                        combined_sheet.cell(row=next_row, column=c_idx, value=value)
                    next_row += 1
        # Remove openpyxl's default sheet, but only when at least one real
        # sheet exists: a workbook with zero visible sheets cannot be saved.
        if 'Sheet' in wb.sheetnames and len(wb.sheetnames) > 1:
            wb.remove(wb['Sheet'])
        wb.save(os.path.join(output_directory, f'{district}.xlsx'))
# --- Streamlit UI: state picker, run button, and output description. ---
st.title("GENERICART")
#st.write("If name of STATE contains more than 1 word then join with - ,like uttar-pradesh")

# Census-2011 state/UT names, hyphenated to match censusindia2011.com URLs.
state_names = [
    'Uttar-Pradesh', 'Maharashtra', 'Bihar', 'West-Bengal', 'Andhra-Pradesh',
    'Madhya-Pradesh', 'Tamil-Nadu', 'Rajasthan', 'Karnataka', 'Gujarat',
    'Odisha', 'Kerala', 'Jharkhand', 'Assam', 'Punjab',
    'Haryana', 'NCT-Of-Delhi', 'Jammu-&-Kashmir', 'Uttarakhand',
    'Himachal-Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland',
    'Goa', 'Arunachal-Pradesh', 'Puducherry', 'Mizoram', 'Chandigarh',
    'Sikkim', 'Andaman-&-Nicobar-Islands', 'Dadra-&-Nagar-Haveli',
    'Daman-&-Diu', 'Lakshadweep']

state = st.selectbox("Select a state:", state_names)

# Button to trigger the (long-running) scrape.
if st.button("Run Processing"):
    # Show a spinner while the scrape is ongoing; it disappears when the
    # `with` block exits.
    with st.spinner("Processing..."):
        god(state)
    st.success("Processing complete!")
    # Explain where the output landed and how it is organised.
    st.subheader("Data Organization in Folder")
    st.write(
        f"In the D drive, folder has been identified, named after the specified state. "
        f"This file encompasses multiple sheets, each dedicated to individual districts within the state. "
        f"The organization of data within these sheets adheres to a structured format, where the sheet names correspond to the respective talukas."
    )
    st.write("Each sheet contains essential demographic information, meticulously arranged in columns for ease of analysis. The columns include:")
    st.write("- **VILLAGE:** This field provides a comprehensive list of villages within the taluka, offering a granular view of the geographic distribution.")
    st.write("- **POPULATION:** The population column quantifies the total number of residents in each village, facilitating an understanding of settlement sizes.")
    st.write("- **LITERACY:** Literacy rates are documented, reflecting the proportion of individuals who possess basic reading and writing skills in the specified villages.")
    st.write("- **SEX RATIO:** The sex ratio column provides insights into the gender distribution within the villages, aiding in the evaluation of gender demographics.")