# pix/psx_scraper.py
# sarim — "add circuit breaker ticker" (commit f641e6a)
from bs4 import BeautifulSoup
import re
from models import FinancialResult,FinancialEntry,Financials,RatioEntry,CompanyData, CircuitBreakerRow
from typing import List, Optional, Dict, Any
class PsxScraper(object):
    """Scrape company data from a PSX (Pakistan Stock Exchange) company page.

    The page HTML is parsed once in ``__init__``; each ``extract_*`` /
    ``fetch_*`` method reads from the resulting soup and returns typed
    model objects (``FinancialResult``, ``Financials``, ``RatioEntry``,
    ``CompanyData``, ``CircuitBreakerRow``).
    """

    def __init__(self, html_content: str):
        # Parse once; all extraction methods share this soup.
        self.soup = BeautifulSoup(html_content, 'html.parser')

    def _clean_number(self, text: str) -> float:
        """Clean and convert a number string to float.

        Strips thousands-separator commas, spaces, and a 'Rs.' currency
        prefix, then extracts the first (optionally signed) decimal number.
        Returns 0.0 for empty input or when no number is present.
        """
        if not text:
            return 0.0
        # Remove commas, spaces, and the currency prefix before matching.
        text = str(text).replace(',', '').replace(' ', '').replace('Rs.', '')
        # First signed number with an optional decimal point.
        match = re.search(r'[-+]?\d*\.?\d+', text)
        return float(match.group()) if match else 0.0

    def _extract_range(self, range_text: str) -> Dict[str, float]:
        """Extract low/high values from a range string.

        Example input: ``"296.08 — 361.88"`` (em-dash separated).
        ``'current'`` is always 0.0 here — callers fill it in from data
        attributes. Unparseable input yields all zeros.
        """
        parts = range_text.split('—')
        if len(parts) == 2:
            return {
                'low': self._clean_number(parts[0]),
                'high': self._clean_number(parts[1]),
                'current': 0.0,  # Will be set from data attributes
            }
        return {'low': 0.0, 'high': 0.0, 'current': 0.0}

    def _table_to_rows(self, table):
        """Return ``(headers, rows_data)`` for a table with thead/tbody.

        ``headers`` is the list of stripped ``<th>`` texts; ``rows_data``
        is a list of dicts keyed by header. Rows whose cell count does not
        match the header count are skipped. Returns ``([], [])`` when the
        table lacks a thead or tbody (the original code dereferenced both
        unguarded and raised AttributeError on malformed markup).
        """
        thead = table.find('thead')
        tbody = table.find('tbody')
        if not thead or not tbody:
            return [], []
        header_row = thead.find('tr')
        if not header_row:
            return [], []
        headers = [th.text.strip() for th in header_row.find_all('th')]
        rows_data = []
        for row in tbody.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == len(headers):
                rows_data.append(
                    {headers[i]: cell.text.strip() for i, cell in enumerate(cells)}
                )
        return headers, rows_data

    def _parse_financial_entries(self, headers, rows_data):
        """Build one ``FinancialEntry`` per period column (``headers[1:]``).

        ``headers[0]`` is the metric-name column; Sales / Profit after
        Taxation / EPS rows are mapped onto the entry's fields.
        """
        entries = []
        if not headers or not rows_data:
            return entries
        metric_col = headers[0]
        for period in headers[1:]:
            entry = FinancialEntry(period=period)
            for row in rows_data:
                metric = row[metric_col]
                value = row[period]
                if 'Sales' in metric:
                    entry.sales = self._clean_number(value)
                elif 'Profit after Taxation' in metric:
                    entry.profit_after_tax = self._clean_number(value)
                elif 'EPS' in metric:
                    entry.eps = self._clean_number(value)
            entries.append(entry)
        return entries

    def extract_announcements(self) -> List[FinancialResult]:
        """Extract financial results announcements.

        Reads the 'Financial Results' tab panel; each row yields a date,
        title, and (when present) a javascript document link and/or a
        direct PDF link. Returns [] when the tab or its table is missing.
        """
        announcements = []
        financial_results_tab = self.soup.find(
            'div', class_='tabs__panel', attrs={'data-name': 'Financial Results'}
        )
        if not financial_results_tab:
            return announcements
        table = financial_results_tab.find('table')
        if not table:
            return announcements
        rows = table.find_all('tr')[1:]  # Skip header row
        for row in rows:
            cols = row.find_all('td')
            if len(cols) < 3:
                continue
            date = cols[0].text.strip()
            title = cols[1].text.strip()
            # The third column holds the links: 'javascript:' hrefs open
            # the document viewer, '.pdf' hrefs point at the file itself.
            document_link = None
            pdf_link = None
            for link in cols[2].find_all('a'):
                href = link.get('href', '')
                if 'javascript:' in href:
                    document_link = href
                elif '.pdf' in href:
                    pdf_link = href
            announcements.append(FinancialResult(
                date=date,
                title=title,
                documentLink=document_link,
                pdfLink=pdf_link,
            ))
        return announcements

    def extract_financials(self) -> Financials:
        """Extract financial data (annual and quarterly).

        Both tabs share the same table layout, so each is parsed through
        ``_table_to_rows`` + ``_parse_financial_entries`` (the original
        duplicated ~35 lines of parsing code per tab). Missing sections,
        tabs, or tables simply yield empty lists.
        """
        financials_section = self.soup.find('div', id='financials')
        if not financials_section:
            return Financials(annual=[], quarterly=[])
        parsed = {'Annual': [], 'Quarterly': []}
        for tab_name in parsed:
            tab = financials_section.find(
                'div', class_='tabs__panel', attrs={'data-name': tab_name}
            )
            if not tab:
                continue
            table = tab.find('table')
            if not table:
                continue
            headers, rows_data = self._table_to_rows(table)
            parsed[tab_name] = self._parse_financial_entries(headers, rows_data)
        return Financials(annual=parsed['Annual'], quarterly=parsed['Quarterly'])

    def extract_ratios(self) -> List[RatioEntry]:
        """Extract financial ratios, one ``RatioEntry`` per period column.

        Accounting-style negatives are written in parentheses, e.g.
        ``(5.2)`` means -5.2; the sign is applied when converting (the
        original stripped the parentheses but lost the minus sign).
        """
        ratios = []
        ratios_section = self.soup.find('div', id='ratios')
        if not ratios_section:
            return ratios
        table = ratios_section.find('table')
        if not table:
            return ratios
        headers, rows_data = self._table_to_rows(table)
        if not headers or not rows_data:
            return ratios
        name_col = headers[0]
        for period in headers[1:]:
            entry = RatioEntry(period=period)
            for row in rows_data:
                ratio_name = row[name_col]
                raw = row[period]
                # Parenthesised values are accounting negatives.
                sign = -1.0 if raw.strip().startswith('(') else 1.0
                clean_value = raw.replace('(', '').replace(')', '')
                if 'Gross Profit Margin' in ratio_name:
                    entry.gross_profit_margin = sign * self._clean_number(clean_value)
                elif 'Net Profit Margin' in ratio_name:
                    entry.net_profit_margin = sign * self._clean_number(clean_value)
                elif 'EPS Growth' in ratio_name:
                    entry.eps_growth = sign * self._clean_number(clean_value)
                elif 'PEG' in ratio_name:
                    entry.peg = sign * self._clean_number(clean_value)
            ratios.append(entry)
        return ratios

    def scrape_all_data(self) -> CompanyData:
        """Scrape all sections and return them as one CompanyData object."""
        return CompanyData(
            announcements=self.extract_announcements(),
            financials=self.extract_financials(),
            ratios=self.extract_ratios(),
        )

    def fetch_circuit_breaker_table(self, table_id: str) -> list[CircuitBreakerRow]:
        """Parse the circuit-breaker ticker table identified by *table_id*.

        Expects 9 columns per body row: symbol, LDCP, open, high, low,
        current, change, change %, volume. Rows with fewer columns (e.g.
        a 'no data' placeholder row) are skipped — the original indexed
        cols[0..8] unconditionally and raised IndexError on such rows.
        Returns [] when the table or its tbody is missing.
        """
        table = self.soup.find("table", id=table_id)
        if not table or not table.tbody:
            return []
        records = []
        for row in table.tbody.find_all("tr"):
            cols = [td.get_text(strip=True) for td in row.find_all("td")]
            if len(cols) < 9:
                continue
            records.append(
                CircuitBreakerRow(
                    symbol=cols[0],
                    ldcp=cols[1],
                    open=cols[2],
                    high=cols[3],
                    low=cols[4],
                    current=cols[5],
                    change=cols[6],
                    change_percent=cols[7],
                    volume=cols[8],
                )
            )
        return records