|
|
from bs4 import BeautifulSoup |
|
|
import re |
|
|
from models import FinancialResult,FinancialEntry,Financials,RatioEntry,CompanyData, CircuitBreakerRow |
|
|
from typing import List, Optional, Dict, Any |
|
|
|
|
|
class PsxScraper(object):
    """Scrapes company data (announcements, financials, ratios, circuit
    breakers) out of PSX (Pakistan Stock Exchange) HTML pages.

    The HTML is parsed once in __init__; every extract_* method then
    queries the shared BeautifulSoup tree.
    """

    def __init__(self, html_content: str):
        # Parse up front so all extractors share a single DOM tree.
        self.soup = BeautifulSoup(html_content, 'html.parser')

    def _clean_number(self, text: str) -> float:
        """Strip display formatting (commas, spaces, 'Rs.') and return the
        first numeric token in *text* as a float, or 0.0 when none exists."""
        if not text:
            return 0.0

        text = str(text).replace(',', '').replace(' ', '').replace('Rs.', '')

        match = re.search(r'[-+]?\d*\.?\d+', text)
        return float(match.group()) if match else 0.0

    def _extract_range(self, range_text: str) -> Dict[str, float]:
        """Split an em-dash-separated 'low — high' string into a dict with
        'low', 'high' and 'current' keys ('current' is always 0.0 here).

        Returns an all-zero dict when the text is not a two-part range.
        """
        parts = range_text.split('—')
        if len(parts) == 2:
            return {
                'low': self._clean_number(parts[0]),
                'high': self._clean_number(parts[1]),
                'current': 0.0,
            }
        return {'low': 0.0, 'high': 0.0, 'current': 0.0}

    def _parse_table(self, table):
        """Return (headers, rows) for a <thead>/<tbody> table.

        ``headers`` is the list of <th> texts; ``rows`` is a list of dicts
        keyed by header text. Rows whose cell count differs from the header
        count (spacer/colspan rows) are skipped.
        """
        header_row = table.find('thead').find('tr')
        headers = [th.text.strip() for th in header_row.find_all('th')]

        rows = []
        for tr in table.find('tbody').find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) == len(headers):
                rows.append({headers[i]: cell.text.strip()
                             for i, cell in enumerate(cells)})
        return headers, rows

    def _entries_from_financials_tab(self, tab) -> list:
        """Build FinancialEntry objects from one financials tab panel.

        The table is metric-per-row, period-per-column: column 0 names the
        metric and every remaining column header is a reporting period.
        Returns [] when the tab, its table, or its data is missing.
        """
        if not tab:
            return []
        table = tab.find('table')
        if not table:
            return []

        headers, rows = self._parse_table(table)
        if not headers or not rows:
            return []

        entries = []
        for period in headers[1:]:
            entry = FinancialEntry(period=period)
            for row in rows:
                metric = row[headers[0]]
                value = row[period]

                if 'Sales' in metric:
                    entry.sales = self._clean_number(value)
                elif 'Profit after Taxation' in metric:
                    entry.profit_after_tax = self._clean_number(value)
                elif 'EPS' in metric:
                    entry.eps = self._clean_number(value)

            entries.append(entry)
        return entries

    def extract_announcements(self) -> "List[FinancialResult]":
        """Extract financial-results announcements from the
        'Financial Results' tab panel; [] when the panel or table is absent."""
        announcements = []

        financial_results_tab = self.soup.find(
            'div', class_='tabs__panel', attrs={'data-name': 'Financial Results'})
        if not financial_results_tab:
            return announcements

        table = financial_results_tab.find('table')
        if not table:
            return announcements

        # First <tr> is the header; data rows follow.
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) < 3:
                continue

            date = cols[0].text.strip()
            title = cols[1].text.strip()

            # Column 3 may carry a viewer link (javascript:) and/or a raw PDF.
            document_link = None
            pdf_link = None
            for link in cols[2].find_all('a'):
                href = link.get('href', '')
                if 'javascript:' in href:
                    document_link = href
                elif '.pdf' in href:
                    pdf_link = href

            announcements.append(FinancialResult(
                date=date,
                title=title,
                documentLink=document_link,
                pdfLink=pdf_link,
            ))

        return announcements

    def extract_financials(self) -> "Financials":
        """Extract annual and quarterly financial data from the #financials
        section; returns an empty Financials when the section is absent."""
        financials_section = self.soup.find('div', id='financials')
        if not financials_section:
            return Financials(annual=[], quarterly=[])

        annual_tab = financials_section.find(
            'div', class_='tabs__panel', attrs={'data-name': 'Annual'})
        quarterly_tab = financials_section.find(
            'div', class_='tabs__panel', attrs={'data-name': 'Quarterly'})

        return Financials(
            annual=self._entries_from_financials_tab(annual_tab),
            quarterly=self._entries_from_financials_tab(quarterly_tab),
        )

    def extract_ratios(self) -> "List[RatioEntry]":
        """Extract financial ratios (one RatioEntry per period column) from
        the #ratios section; [] when the section or table is absent."""
        ratios = []

        ratios_section = self.soup.find('div', id='ratios')
        if not ratios_section:
            return ratios

        table = ratios_section.find('table')
        if not table:
            return ratios

        headers, rows = self._parse_table(table)
        if not headers or not rows:
            return ratios

        for period in headers[1:]:
            entry = RatioEntry(period=period)
            for row in rows:
                ratio_name = row[headers[0]]
                value = row[period]

                # NOTE(review): parentheses are stripped outright, so
                # accounting-style negatives like "(5.2)" parse as positive
                # 5.2 — confirm this is the intended behavior.
                clean_value = value.replace('(', '').replace(')', '')

                if 'Gross Profit Margin' in ratio_name:
                    entry.gross_profit_margin = self._clean_number(clean_value)
                elif 'Net Profit Margin' in ratio_name:
                    entry.net_profit_margin = self._clean_number(clean_value)
                elif 'EPS Growth' in ratio_name:
                    entry.eps_growth = self._clean_number(clean_value)
                elif 'PEG' in ratio_name:
                    entry.peg = self._clean_number(clean_value)

            ratios.append(entry)

        return ratios

    def scrape_all_data(self) -> "CompanyData":
        """Scrape announcements, financials and ratios into one CompanyData."""
        return CompanyData(
            announcements=self.extract_announcements(),
            financials=self.extract_financials(),
            ratios=self.extract_ratios(),
        )

    def fetch_circuit_breaker_table(self, table_id: str) -> "list[CircuitBreakerRow]":
        """Parse the circuit-breaker table with id *table_id* into
        CircuitBreakerRow records; [] when the table or its tbody is missing.

        Rows with fewer than the 9 expected cells are skipped instead of
        raising IndexError (fixes a crash on spacer/malformed rows).
        """
        table = self.soup.find("table", id=table_id)
        if not table or not table.tbody:
            return []

        records = []
        for row in table.tbody.find_all("tr"):
            cols = [td.get_text(strip=True) for td in row.find_all("td")]

            # Defensive: skip rows that do not carry all nine columns.
            if len(cols) < 9:
                continue

            records.append(
                CircuitBreakerRow(
                    symbol=cols[0],
                    ldcp=cols[1],
                    open=cols[2],
                    high=cols[3],
                    low=cols[4],
                    current=cols[5],
                    change=cols[6],
                    change_percent=cols[7],
                    volume=cols[8],
                )
            )

        return records
|
|
|
|
|
|