Spaces:

Roberta2024
/

0825ESGscraper

Sleeping

App Files Files Community

0825ESGscraper / app.py

Roberta2024

Create app.py

495475c verified 6 months ago

raw

history blame contribute delete

12.2 kB

	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	from typing import List, Tuple
	import time
	from pandas.io.formats.style import Styler
	import streamlit as st
	import os
	from datetime import datetime
	import io

	BASE_URL = "https://cgc.twse.com.tw/front/chPage"

	def fetch_page(offset: int, max_per: int = 30, fmt: str = "") -> str:
	params = {"offset": offset, "max": max_per, "format": fmt}
	resp = requests.get(BASE_URL, params=params, timeout=10)
	resp.raise_for_status()
	return resp.text

	def parse_companies(html: str) -> List[Tuple[str, str, str]]:
	soup = BeautifulSoup(html, "html.parser")
	results = []
	for tr in soup.select("table tr"):
	tds = tr.find_all("td")
	if len(tds) >= 3:
	code = tds[1].get_text(strip=True)
	name = tds[2].get_text(strip=True)
	link_tag = tds[2].find("a")
	url = link_tag["href"].strip() if link_tag and "href" in link_tag.attrs else ""
	if code.isdigit():
	results.append((code, name, url))
	return results

	def collect_all(start_offset: int = 0, max_per: int = 30, max_pages: int = 100, progress_bar=None, status_text=None) -> pd.DataFrame:
	all_rows = []
	offset = start_offset

	for i in range(max_pages):
	try:
	# 更新進度條和狀態
	if progress_bar:
	progress_bar.progress((i + 1) / max_pages)
	if status_text:
	status_text.text(f"正在爬取第 {i + 1} 頁，偏移量: {offset}")

	html = fetch_page(offset, max_per)
	rows = parse_companies(html)
	if not rows:
	if status_text:
	status_text.text(f"已完成爬取，共處理 {i + 1} 頁")
	break
	all_rows.extend(rows)
	offset += max_per
	time.sleep(0.5)
	except Exception as e:
	if status_text:
	status_text.text(f"錯誤發生於偏移量 {offset}: {e}")
	break

	# 加入編號欄位
	df = pd.DataFrame(all_rows, columns=["公司代碼", "公司名稱", "公司網址"])
	df.insert(0, "編號", range(1, len(df) + 1))
	return df

	def style_dataframe(df: pd.DataFrame) -> Styler:
	"""
	設定DataFrame的樣式：
	- 編號、公司代碼、公司名稱欄位標題為藍色背景
	- 每個欄位的值交替黃色背景
	"""
	def header_style(s):
	"""設定標題樣式"""
	styles = []
	for col in s.index:
	if col in ["編號", "公司代碼", "公司名稱"]:
	styles.append('background-color: #4472C4; color: white; font-weight: bold')
	else:
	styles.append('background-color: #D9D9D9; color: black; font-weight: bold')
	return styles

	def alternating_rows(s):
	"""設定交替行顏色"""
	styles = []
	for i, col in enumerate(s.index):
	if col in ["編號", "公司代碼", "公司名稱"]:
	if s.name % 2 == 0: # 偶數行
	styles.append('background-color: #FFF2CC') # 淺黃色
	else: # 奇數行
	styles.append('background-color: #FFFFFF') # 白色
	else:
	styles.append('background-color: #F8F8F8') # 淺灰色
	return styles

	# 應用樣式
	styled = df.style.apply(alternating_rows, axis=1).apply(header_style, axis=0)

	# 設定表格整體樣式
	styled = styled.set_table_styles([
	{'selector': 'th', 'props': [('text-align', 'center'), ('padding', '8px')]},
	{'selector': 'td', 'props': [('text-align', 'center'), ('padding', '6px')]},
	{'selector': 'table', 'props': [('border-collapse', 'collapse'), ('margin', 'auto')]},
	{'selector': 'th, td', 'props': [('border', '1px solid #CCCCCC')]}
	])

	return styled

	def save_to_excel(df: pd.DataFrame) -> bytes:
	"""儲存為Excel格式並應用樣式，返回bytes"""
	output = io.BytesIO()

	# 建立樣式化的DataFrame
	with pd.ExcelWriter(output, engine='openpyxl') as writer:
	# 先寫入基本資料
	df.to_excel(writer, sheet_name='公司資料', index=False)

	# 取得工作表以進行格式設定
	worksheet = writer.sheets['公司資料']

	# 設定欄寬
	worksheet.column_dimensions['A'].width = 8 # 編號
	worksheet.column_dimensions['B'].width = 12 # 公司代碼
	worksheet.column_dimensions['C'].width = 25 # 公司名稱
	worksheet.column_dimensions['D'].width = 40 # 公司網址

	# 使用openpyxl進行進階格式設定
	from openpyxl.styles import PatternFill, Font, Alignment, Border, Side

	# 定義顏色
	blue_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
	yellow_fill = PatternFill(start_color="FFF2CC", end_color="FFF2CC", fill_type="solid")
	white_fill = PatternFill(start_color="FFFFFF", end_color="FFFFFF", fill_type="solid")
	gray_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")

	# 定義字體
	header_font = Font(bold=True, color="FFFFFF")
	normal_font = Font(color="000000")

	# 定義對齊
	center_alignment = Alignment(horizontal="center", vertical="center")

	# 定義邊框
	thin_border = Border(
	left=Side(style='thin'),
	right=Side(style='thin'),
	top=Side(style='thin'),
	bottom=Side(style='thin')
	)

	# 設定標題行格式
	for col_num, col_name in enumerate(['編號', '公司代碼', '公司名稱', '公司網址'], 1):
	cell = worksheet.cell(row=1, column=col_num)
	cell.font = header_font
	cell.alignment = center_alignment
	cell.border = thin_border

	if col_name in ['編號', '公司代碼', '公司名稱']:
	cell.fill = blue_fill
	else:
	cell.fill = gray_fill

	# 設定資料行格式
	for row_num in range(2, len(df) + 2):
	for col_num in range(1, 5):
	cell = worksheet.cell(row=row_num, column=col_num)
	cell.font = normal_font
	cell.alignment = center_alignment
	cell.border = thin_border

	# 針對編號、公司代碼、公司名稱欄位設定交替顏色
	if col_num <= 3: # 編號、公司代碼、公司名稱
	if (row_num - 2) % 2 == 0: # 偶數行
	cell.fill = yellow_fill
	else: # 奇數行
	cell.fill = white_fill

	output.seek(0)
	return output.getvalue()

	def save_to_csv(df: pd.DataFrame) -> str:
	"""儲存為CSV格式，返回CSV字串"""
	return df.to_csv(index=False, encoding="utf-8-sig")

	def main():
	st.set_page_config(
	page_title="台灣證交所公司資料爬取工具",
	page_icon="🏢",
	layout="wide",
	initial_sidebar_state="expanded"
	)

	st.title("🏢 台灣證交所公司資料爬取工具")
	st.markdown("這個工具可以幫您從台灣證交所網站爬取上市公司資料，並提供CSV或Excel格式下載。")

	# 側邊欄參數設定
	with st.sidebar:
	st.header("⚙️ 參數設定")

	start_offset = st.number_input(
	"起始偏移量",
	min_value=0,
	value=0,
	step=1,
	help="從第幾筆資料開始爬取"
	)

	max_per = st.slider(
	"每頁筆數",
	min_value=1,
	max_value=100,
	value=30,
	step=1,
	help="每次請求爬取的資料筆數"
	)

	max_pages = st.slider(
	"最大頁數",
	min_value=1,
	max_value=100,
	value=50,
	step=1,
	help="最多爬取幾頁資料"
	)

	output_format = st.radio(
	"輸出格式",
	options=["CSV", "Excel", "兩者都要"],
	index=1,
	help="選擇要下載的檔案格式"
	)

	st.markdown("---")

	# 使用說明
	with st.expander("📖 使用說明"):
	st.markdown("""
	### 參數說明：
	- 起始偏移量：從第幾筆資料開始爬取，通常設為0
	- 每頁筆數：每次API請求的資料筆數，建議30-50
	- 最大頁數：最多爬取幾頁，避免設定太大導致執行時間過長
	- 輸出格式：
	- CSV：純文字格式，適合後續程式處理
	- Excel：包含樣式格式的Excel檔案
	- 兩者都要：同時產生CSV和Excel檔案

	### 注意事項：
	- 爬取過程中請勿關閉瀏覽器
	- 建議先用較小的參數測試
	- 檔案會自動加上時間戳記避免重複
	""")

	# 主要內容區域
	col1, col2 = st.columns([2, 1])

	with col2:
	start_scraping = st.button("🚀 開始爬取", type="primary", use_container_width=True)

	# 執行爬取
	if start_scraping:
	# 驗證輸入參數
	if start_offset < 0:
	st.error("起始偏移量不能小於0")
	return
	if max_per <= 0 or max_per > 100:
	st.error("每頁筆數必須在1-100之間")
	return
	if max_pages <= 0 or max_pages > 1000:
	st.error("最大頁數必須在1-1000之間")
	return

	try:
	# 建立進度條和狀態顯示
	progress_bar = st.progress(0)
	status_text = st.empty()

	# 開始爬取資料
	status_text.text("開始爬取資料...")
	df = collect_all(start_offset, max_per, max_pages, progress_bar, status_text)

	if df.empty:
	st.warning("未爬取到任何資料")
	return

	# 產生時間戳記
	timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

	# 完成狀態
	progress_bar.progress(1.0)
	status_text.text(f"✅ 成功爬取 {len(df)} 筆公司資料！")

	# 顯示資料預覽
	st.subheader("📊 資料預覽（前10筆）")
	st.dataframe(df.head(10), use_container_width=True)

	# 檔案下載區域
	st.subheader("📁 檔案下載")

	download_col1, download_col2 = st.columns(2)

	if output_format in ["CSV", "兩者都要"]:
	csv_data = save_to_csv(df)
	with download_col1:
	st.download_button(
	label="⬇️ 下載 CSV 檔案",
	data=csv_data,
	file_name=f"companies_{timestamp}.csv",
	mime="text/csv",
	use_container_width=True
	)

	if output_format in ["Excel", "兩者都要"]:
	excel_data = save_to_excel(df)
	with download_col2:
	st.download_button(
	label="⬇️ 下載 Excel 檔案",
	data=excel_data,
	file_name=f"companies_styled_{timestamp}.xlsx",
	mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	use_container_width=True
	)

	# 顯示統計資訊
	st.subheader("📈 統計資訊")
	stat_col1, stat_col2, stat_col3 = st.columns(3)

	with stat_col1:
	st.metric("總公司數量", len(df))

	with stat_col2:
	st.metric("有網址的公司", len(df[df['公司網址'] != '']))

	with stat_col3:
	st.metric("執行頁數", min(max_pages, (len(df) // max_per) + 1))

	except Exception as e:
	st.error(f"❌ 爬取過程中發生錯誤：{str(e)}")

	if __name__ == "__main__":
	main()