Spaces:

PuppetLover
/

StockInsights

Sleeping

App Files Files Community

StockInsights / src /report_generator.py

PuppetLover

Update src/report_generator.py

ee1c5d3 verified 5 months ago

raw

history blame contribute delete

11.9 kB

	import sqlite3
	import pandas as pd
	import os
	from datetime import datetime
	import streamlit as st

	# def get_db_path():
	# db_path = "../database/stock_insights.db"
	# if not os.path.exists(db_path) and os.path.exists("/tmp/stock_insights.db"):
	# db_path = "/tmp/stock_insights.db"
	# return db_path

	# download from Hugging Face dataset
	def ensure_db():
	    """Locate the SQLite database file, downloading it when no local copy exists.

	    Lookup order:
	      1. ``<cwd>/database/stock_insights.db``
	      2. ``/app/database/stock_insights.db``
	      3. ``/tmp/database/stock_insights.db``
	      4. ``/tmp/stock_insights.db``
	      5. Download ``stock_insights.db`` from the Hugging Face dataset repo
	         ``PuppetLover/stock_insights`` into ``/tmp/database``.

	    Returns:
	        str: Path of the database file that was found or downloaded.

	    Raises:
	        RuntimeError: If the download step fails (including the case where
	            ``huggingface_hub`` cannot be imported) and no local copy exists.
	            The original failure is attached as ``__cause__``.
	    """
	    db_name = "stock_insights.db"
	    search_paths = (
	        os.path.join(os.getcwd(), "database", db_name),
	        os.path.join("/app", "database", db_name),
	        os.path.join("/tmp", "database", db_name),
	        os.path.join("/tmp", db_name),
	    )
	    for path in search_paths:
	        if os.path.exists(path):
	            return path

	    try:
	        # Imported lazily: the package is only needed when a download is required.
	        from huggingface_hub import hf_hub_download

	        download_dir = os.path.join("/tmp", "database")
	        os.makedirs(download_dir, exist_ok=True)
	        return hf_hub_download(
	            repo_id="PuppetLover/stock_insights",
	            filename=db_name,
	            repo_type="dataset",
	            local_dir=download_dir,
	            local_dir_use_symlinks=False,
	        )
	    except Exception as exc:
	        # Best-effort last check of the path relative to the working directory
	        # before giving up.
	        fallback = os.path.join("database", db_name)
	        if os.path.exists(fallback):
	            return fallback
	        # Chain the original error so the traceback shows why the download failed.
	        raise RuntimeError(f"Cannot access or download database file: {exc}") from exc

	# Call ensure_db() once at import time and keep the result as the DB_PATH constant used in this module.
	DB_PATH = ensure_db()

	def generate_stock_report(stock_code, time_period, db_path=None):
	    """Build a sentiment and entity report for one stock over a date range.

	    Articles are considered relevant when they have a high-confidence
	    ``STOCK``/``COMPANY`` entity whose text equals ``stock_code`` and their
	    ``publish_date`` lies between the two bounds (SQL ``BETWEEN``, inclusive).

	    Args:
	        stock_code: Entity text to look up in the ``entities`` table.
	        time_period: ``(start_date, end_date)`` pair bounding ``publish_date``.
	        db_path: Optional path of the SQLite database. Defaults to the
	            module-level ``DB_PATH`` when omitted.

	    Returns:
	        dict: Report with the keys ``stock_code``, ``report_period``,
	        ``overall_sentiment`` (score, trend and per-label mention counts),
	        ``key_events``, ``key_price_actions``, ``key_risks_mentioned``
	        (each a list of up to five records), ``top_related_entities`` and
	        ``source_articles`` (up to five records).
	    """
	    # Local import: sqlite3's own context manager only commits/rolls back,
	    # so closing() is used to make sure the connection is really closed.
	    from contextlib import closing

	    start_date, end_date = time_period
	    today = datetime.now().date()
	    if db_path is None:
	        db_path = DB_PATH

	    report = {
	        "stock_code": stock_code,
	        "report_period": f"{start_date} to {end_date}",
	    }

	    def score_to_label(score):
	        # pandas delivers a missing AVG() as NaN (or None), so test both.
	        if pd.isna(score):
	            return "N/A"
	        if score > 0.1:
	            return "Tích cực"
	        if score < -0.1:
	            return "Tiêu cực"
	        return "Trung tính"

	    with closing(sqlite3.connect(db_path)) as conn:
	        # Temporary table holding the ids of the relevant articles. The drop is
	        # scoped to the temp schema so a persistent table of the same name can
	        # never be removed by accident.
	        conn.execute("DROP TABLE IF EXISTS temp.relevant_articles;")
	        conn.execute(
	            """
	            CREATE TEMP TABLE relevant_articles AS
	            SELECT DISTINCT article_id FROM entities
	            WHERE entity_text = ?
	              AND entity_type IN ('STOCK', 'COMPANY')
	              AND confidence = 'high'
	              AND article_id IN (
	                  SELECT article_id FROM articles WHERE publish_date BETWEEN ? AND ?
	              );
	            """,
	            (stock_code, start_date, end_date),
	        )

	        # 1. OVERALL SENTIMENT
	        # Every scored sentence of a relevant article. (The filter compares
	        # article ids with article ids; it previously matched them against
	        # sentence ids.)
	        q_sentences = """
	            SELECT s.sentiment_score, s.sentiment_label, a.publish_date
	            FROM sentences s
	            JOIN articles a ON s.article_id = a.article_id
	            WHERE s.article_id IN (SELECT article_id FROM relevant_articles)
	              AND s.sentiment_score IS NOT NULL;
	        """
	        df_sent = pd.read_sql_query(q_sentences, conn)

	        if not df_sent.empty:
	            df_sent['publish_date'] = pd.to_datetime(df_sent['publish_date']).dt.date
	            # Age in days; clamped at 0 so a future-dated article cannot produce
	            # a negative weight or a division by zero.
	            df_sent['days_ago'] = (
	                df_sent['publish_date'].apply(lambda d: (today - d).days).clip(lower=0)
	            )
	            # Recency weighting: newer sentences count more.
	            df_sent['weight'] = 1 / (df_sent['days_ago'] + 1)
	            weighted_score = float(
	                (df_sent['sentiment_score'] * df_sent['weight']).sum()
	                / df_sent['weight'].sum()
	            )
	            # Normalise sentiment labels to lower case before counting.
	            df_sent['sentiment_label'] = df_sent['sentiment_label'].str.lower()
	            sentiment_counts = df_sent['sentiment_label'].value_counts().to_dict()
	            trend = score_to_label(weighted_score)
	        else:
	            weighted_score, sentiment_counts, trend = 0.0, {}, "Không có dữ liệu"

	        report["overall_sentiment"] = {
	            "score": weighted_score,
	            "trend": trend,
	            "positive_mentions": sentiment_counts.get("positive", 0),
	            "negative_mentions": sentiment_counts.get("negative", 0),
	            "neutral_mentions": sentiment_counts.get("neutral", 0),
	        }

	        # 2. KEY EVENTS, RISKS, PRICE ACTIONS
	        def get_key_entities(entity_type):
	            # Five most frequent entities of the given type in the relevant
	            # articles, with the mean score of the sentences mentioning them.
	            query = """
	                SELECT
	                    e.entity_text,
	                    COUNT(e.entity_id) AS count,
	                    AVG(s.sentiment_score) AS avg_sentiment
	                FROM entities e
	                JOIN sentences s ON e.sentence_id = s.sentence_id
	                WHERE e.article_id IN (SELECT article_id FROM relevant_articles)
	                  AND e.entity_type = ?
	                GROUP BY e.entity_text
	                ORDER BY count DESC
	                LIMIT 5;
	            """
	            df = pd.read_sql_query(query, conn, params=(entity_type,))
	            df['sentiment'] = df['avg_sentiment'].apply(score_to_label)
	            return df.to_dict('records')

	        report["key_events"] = get_key_entities('EVENT')
	        report["key_price_actions"] = get_key_entities('PRICE_ACTION')
	        report["key_risks_mentioned"] = get_key_entities('RISK')

	        # 3. TOP RELATED ENTITIES
	        q_related = """
	            SELECT e.entity_type, e.entity_text
	            FROM entities e
	            WHERE e.article_id IN (SELECT article_id FROM relevant_articles)
	              AND e.entity_text != ?
	              AND e.entity_type IN ('STOCK', 'COMPANY', 'PERSON');
	        """
	        df_related = pd.read_sql_query(q_related, conn, params=(stock_code,))
	        top_related = {}
	        if not df_related.empty:
	            for etype in ('STOCK', 'COMPANY', 'PERSON'):
	                of_type = df_related.loc[df_related['entity_type'] == etype, 'entity_text']
	                # Result keys are the lower-cased type plus "s" ("stocks", "companys", "persons").
	                top_related[etype.lower() + 's'] = of_type.value_counts().head(3).index.tolist()
	        report["top_related_entities"] = top_related

	        # 4. SOURCE ARTICLES
	        # NOTE(review): sentiment_label is a bare column in a GROUP BY query, so
	        # SQLite takes it from an arbitrary sentence of each article — confirm
	        # this is the intended per-article label.
	        q_articles = """
	            SELECT a.title, a.source_url, s.sentiment_label
	            FROM articles a
	            JOIN sentences s ON a.article_id = s.article_id
	            WHERE a.article_id IN (SELECT article_id FROM relevant_articles)
	            GROUP BY a.article_id
	            ORDER BY a.publish_date DESC
	            LIMIT 5;
	        """
	        df_articles = pd.read_sql_query(q_articles, conn)
	        report["source_articles"] = df_articles.to_dict('records')

	    return report

	# --- HIỂN THỊ BÁO CÁO ---
	def _delta_color_for_trend(trend):
	    """Map a trend label to the ``delta_color`` value passed to ``st.metric``."""
	    if trend == "Tích cực":
	        return "normal"
	    if trend == "Tiêu cực":
	        return "inverse"
	    # Neutral / no-data trends are shown without the positive colouring.
	    return "off"


	def _render_entity_table(records, label, empty_message):
	    """Show key-entity records as a table, or ``empty_message`` when there are none."""
	    if not records:
	        st.write(empty_message)
	        return

	    frame = pd.DataFrame(records)
	    renames = {'entity_text': label}
	    shown = [label, 'count']
	    # Prefer the numeric average when present, else the textual label.
	    if 'avg_sentiment' in frame.columns:
	        renames['avg_sentiment'] = 'Sentiment'
	        shown.append('Sentiment')
	    elif 'sentiment' in frame.columns:
	        shown.append('sentiment')
	    st.dataframe(frame.rename(columns=renames)[shown], use_container_width=True)


	# --- REPORT DISPLAY ---
	def show_report(report_data, summary, stock_code_input):
	    """Render a stock report in the Streamlit page.

	    Args:
	        report_data: Report dict; the keys read here are ``stock_code``,
	            ``report_period``, ``overall_sentiment``, ``key_events``,
	            ``key_risks_mentioned``, ``key_price_actions``,
	            ``top_related_entities`` and ``source_articles``.
	        summary: Text shown in the AI summary box.
	        stock_code_input: Stock code used in the heading when ``report_data``
	            has no ``stock_code`` entry.
	    """
	    import html

	    # These values end up inside raw HTML (unsafe_allow_html=True), so they
	    # are escaped to keep user-supplied text from injecting markup.
	    heading_code = html.escape(str(report_data.get('stock_code', stock_code_input)))
	    period = html.escape(str(report_data.get('report_period', 'N/A')))
	    st.markdown(
	        f"<h3 style='text-align: center; color: #30cfd0; margin-top:2rem;'>Báo cáo Phân tích cho {heading_code}</h3>",
	        unsafe_allow_html=True,
	    )
	    st.markdown(
	        f"<p style='text-align: center; color: #94a3b8;'>Giai đoạn: {period}</p>",
	        unsafe_allow_html=True,
	    )

	    st.markdown("#### 🤖 Tóm tắt từ AI")
	    st.info(summary)

	    # Sentiment overview
	    st.markdown("#### 📊 Tổng quan Cảm xúc")
	    sentiment = report_data['overall_sentiment']
	    score = sentiment['score']
	    st.metric(
	        label="Điểm Cảm xúc (có trọng số thời gian)",
	        value=f"{score:.2f}" if score is not None else "N/A",
	        delta=sentiment['trend'],
	        delta_color=_delta_color_for_trend(sentiment['trend']),
	    )

	    col1, col2, col3 = st.columns(3)
	    col1.metric("👍 Tích cực", sentiment['positive_mentions'])
	    col2.metric("👎 Tiêu cực", sentiment['negative_mentions'])
	    col3.metric("😐 Trung tính", sentiment['neutral_mentions'])

	    # Detail tables
	    st.markdown("---")

	    col_events, col_risks = st.columns(2)
	    with col_events:
	        st.markdown("#### ⚡ Sự kiện Nổi bật")
	        _render_entity_table(
	            report_data["key_events"], 'Sự kiện', "Không có sự kiện nổi bật.")

	    with col_risks:
	        st.markdown("#### ⚠️ Rủi ro được đề cập")
	        _render_entity_table(
	            report_data["key_risks_mentioned"], 'Rủi ro', "Không có rủi ro nổi bật.")

	    st.markdown("#### 📈 Hành động Giá Chính")
	    _render_entity_table(
	        report_data["key_price_actions"], 'Hành động giá', "Không có hành động giá nổi bật.")

	    # Related entities
	    st.markdown("---")
	    st.markdown("#### 🔗 Các Thực thể Liên quan nhiều nhất")
	    related = report_data['top_related_entities']
	    if any(related.values()):
	        for etype, entities in related.items():
	            if entities:
	                st.markdown(
	                    f"{etype.replace('_', ' ').title()}: {', '.join(entities)}")
	    else:
	        st.write("Không tìm thấy thực thể liên quan nổi bật.")

	    # Source articles
	    st.markdown("---")
	    st.markdown("#### 📰 Nguồn Bài viết Tham khảo")
	    if report_data["source_articles"]:
	        for article in report_data["source_articles"]:
	            st.markdown(
	                f"- [{article['title']}]({article['source_url']}) - Cảm xúc: {article['sentiment_label']}")
	    else:
	        st.write("Không có bài viết nào trong khoảng thời gian này.")

	    # NOTE(review): no matching opening <div> is emitted in this function;
	    # presumably the caller opens it — confirm before removing.
	    st.markdown("</div>", unsafe_allow_html=True)