Spaces:

tlong-ds
/

daai_rangdong

Sleeping

App Files Files Community

daai_rangdong / database.py

tlong-ds

update

777f36a 10 months ago

raw

history blame contribute delete

26.4 kB

	import pandas as pd
	import os
	import mysql.connector
	from mysql.connector import Error
	from dotenv import load_dotenv
	from langchain_core.tools import tool
	import decimal
	import numpy as np
	from collections import defaultdict
	from typing import List, Dict
	import json
	load_dotenv()

	class Database:
	def __init__(self):
	self.connection = None
	self.cursor = None
	self.data_dir = "sales_data"

	def connect(self):
	    """Open the MySQL connection described by the MYSQL_* environment variables.

	    Reads MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB and MYSQL_PORT
	    (the port falls back to 3306) and, once connected, stores a fresh cursor
	    on ``self.cursor``.

	    Returns:
	        True when the connection is live; False when the connector raised
	        ``Error`` (the error is printed). If the connector hands back a
	        connection that reports itself as not connected, None is returned.
	    """
	    try:
	        settings = dict(
	            host=os.getenv('MYSQL_HOST'),
	            user=os.getenv('MYSQL_USER'),
	            password=os.getenv('MYSQL_PASSWORD'),
	            database=os.getenv('MYSQL_DB'),
	            port=int(os.getenv('MYSQL_PORT', 3306)),
	        )
	        self.connection = mysql.connector.connect(**settings)
	        if not self.connection.is_connected():
	            return None
	        self.cursor = self.connection.cursor()
	        return True
	    except Error as e:
	        print(f"Error connecting to MySQL: {e}")
	        return False

	def close(self):
	"""Close database connection"""
	if self.connection and self.connection.is_connected():
	if self.cursor:
	self.cursor.close()
	self.connection.close()
	print("MySQL connection closed")

	def create_salein_class(self):
	"""Create and populate salein_class table"""
	try:
	self.cursor.execute("""
	CREATE TABLE IF NOT EXISTS salein_class (
	id INT AUTO_INCREMENT PRIMARY KEY,
	date DATE,
	employee_name VARCHAR(255),
	department VARCHAR(255),
	item_code VARCHAR(50),
	product_name VARCHAR(255),
	quantity INT
	)
	""")

	salein_class_df = pd.read_csv(os.path.join(self.data_dir, "salein_class.csv"))
	for _, row in salein_class_df.iterrows():
	self.cursor.execute("""
	INSERT IGNORE INTO salein_class (date, employee_name, department, item_code, product_name, quantity)
	VALUES (%s, %s, %s, %s, %s, %s)
	""", (row['Date'], row['Employee_Name'], row['Department'], row['item_code'], row['product_name'], row['Số lượng']))
	print("Successfully created salein_class table")
	return True
	except Error as e:
	print(f"Error creating salein_class table: {e}")
	return False

	def create_salein_thuc_xuat(self):
	"""Create and populate salein_thuc_xuat table"""
	try:
	self.cursor.execute("""
	CREATE TABLE IF NOT EXISTS salein_thuc_xuat (
	id INT AUTO_INCREMENT PRIMARY KEY,
	date DATE,
	employee_name VARCHAR(255),
	unit_code VARCHAR(255),
	product_name VARCHAR(255),
	quantity INT,
	province VARCHAR(255)
	)
	""")

	salein_thuc_xuat_df = pd.read_csv(os.path.join(self.data_dir, "salein_thuc_xuat.csv"))
	for _, row in salein_thuc_xuat_df.iterrows():
	self.cursor.execute("""
	INSERT IGNORE INTO salein_thuc_xuat (date, employee_name, unit_code, product_name, quantity, province)
	VALUES (%s, %s, %s, %s, %s, %s)
	""", (row['Date'], row['Employee_Name'], row['unit_code'], row['product_name'], row['Số lượng xuất'], row['province']))
	print("Successfully created salein_thuc_xuat table")
	return True
	except Error as e:
	print(f"Error creating salein_thuc_xuat table: {e}")
	return False

	def create_kpi_thuc_xuat(self):
	"""Create and populate kpi_thuc_xuat table"""
	try:
	self.cursor.execute("""
	CREATE TABLE IF NOT EXISTS kpi_thuc_xuat (
	id INT AUTO_INCREMENT PRIMARY KEY,
	date DATE,
	employee_name VARCHAR(255),
	kpi_code VARCHAR(50),
	region VARCHAR(255),
	product_name VARCHAR(255),
	kpi_score INT
	)
	""")

	kpi_thuc_xuat_df = pd.read_csv(os.path.join(self.data_dir, "kpi_thuc_xuat.csv"))
	for _, row in kpi_thuc_xuat_df.iterrows():
	self.cursor.execute("""
	INSERT IGNORE INTO kpi_thuc_xuat (date, employee_name, kpi_code, region, product_name, kpi_score)
	VALUES (%s, %s, %s, %s, %s, %s)
	""", (row['Date'], row['Employee_Name'], row['KPI_code'], row['region'], row['product_name'], row['Số KPI']))
	print("Successfully created kpi_thuc_xuat table")
	return True
	except Error as e:
	print(f"Error creating kpi_thuc_xuat table: {e}")
	return False

	def create_tables(self):
	"""Create all necessary tables and insert data into them"""
	if not self.connect():
	return

	try:
	if self.create_salein_class():
	print("Salein Class table created and data inserted.")
	if self.create_salein_thuc_xuat():
	print("Salein Thuc Xuat table created and data inserted.")
	if self.create_kpi_thuc_xuat():
	print("KPI Thuc Xuat table created and data inserted.")

	self.connection.commit()
	print("All tables created and data inserted successfully.")

	except Error as e:
	print(f"Error in database creation or data insertion: {e}")

	finally:
	self.close()

	def extract_tables_schemas(self):
	"""Extract schemas of all tables in the database"""
	if not self.connect():
	return {}

	try:
	# Get list of tables
	self.cursor.execute("""
	SELECT table_name
	FROM information_schema.tables
	WHERE table_schema = %s
	""", (os.getenv('MYSQL_DB'),))
	tables = self.cursor.fetchall()

	schemas = {}
	for (table_name,) in tables:
	# Get columns info for each table
	self.cursor.execute("""
	SELECT
	column_name,
	column_type,
	is_nullable,
	column_key,
	extra,
	column_comment
	FROM information_schema.columns
	WHERE table_schema = %s
	AND table_name = %s
	ORDER BY ordinal_position
	""", (os.getenv('MYSQL_DB'), table_name))

	columns = self.cursor.fetchall()

	# Get foreign keys info
	self.cursor.execute("""
	SELECT
	column_name,
	referenced_table_name,
	referenced_column_name
	FROM information_schema.key_column_usage
	WHERE table_schema = %s
	AND table_name = %s
	AND referenced_table_name IS NOT NULL
	""", (os.getenv('MYSQL_DB'), table_name))

	foreign_keys = self.cursor.fetchall()

	# Format schema information
	schema = {
	'columns': [
	{
	'name': col[0],
	'type': col[1],
	'nullable': col[2],
	'key': col[3],
	'extra': col[4],
	'comment': col[5]
	}
	for col in columns
	],
	'foreign_keys': [
	{
	'column': fk[0],
	'references_table': fk[1],
	'references_column': fk[2]
	}
	for fk in foreign_keys
	]
	}

	schemas[table_name] = schema

	return schemas

	except Error as e:
	print(f"Error extracting table schemas: {e}")
	return {}

	finally:
	self.close()

	def get_distinct_values(self, table:str, column: str) -> list:
	"""
	Get distinct values of a column in a table.

	Args:
	table (str): Name of the table.
	column (str): Column name to retrieve distinct values from.

	Returns:
	list: A list of unique values found in the specified column.
	"""
	self.cursor.execute(f"SELECT DISTINCT {column} FROM {table}")
	return [row[0] for row in self.cursor.fetchall()]


	def get_total(self, table: str, value_col: str) -> float:
	"""
	Get the total sum of a numeric column in a table.

	Args:
	table (str): Name of the table.
	value_col (str): Name of the column to sum.

	Returns:
	float: The total sum of the specified column.
	"""
	self.cursor.execute(f"SELECT SUM({value_col}) FROM {table}")
	return self.cursor.fetchone()[0]


	def get_count(self, table: str) -> int:
	"""
	Count total number of records in a table.

	Args:
	table (str): Name of the table.

	Returns:
	int: Total number of rows in the table.
	"""
	self.cursor.execute(f"SELECT COUNT(*) FROM {table}")
	return self.cursor.fetchone()[0]


	def get_total_by_group(self, table: str, group_col: str, value_col: str) -> list:
	"""
	Get total value grouped by a specified column.

	Args:
	table (str): Name of the table.
	group_col (str): Column to group by (e.g. 'region', 'department').
	value_col (str): Column to aggregate using SUM.

	Returns:
	list[tuple]: List of (group_value, total) pairs sorted by total descending.
	e.g: [("North", 5000), ("South", 3000)]
	"""
	self.cursor.execute(f"""
	SELECT {group_col}, SUM({value_col}) AS total
	FROM {table}
	GROUP BY {group_col}
	ORDER BY total DESC
	""")
	return self.cursor.fetchall()


	def get_avg_by_group(self, table: str, group_col: str, value_col: str) -> list:
	"""
	Get average value grouped by a specified column.

	Args:
	table (str): Name of the table.
	group_col (str): Column to group by (e.g. 'product_name').
	value_col (str): Column to aggregate using AVG.

	Returns:
	list[tuple]: List of (group_value, average) pairs sorted by average descending.
	e.g: [("Product A", 105.4), ("Product B", 89.2)]
	"""
	self.cursor.execute(f"""
	SELECT {group_col}, AVG({value_col}) AS avg_value
	FROM {table}
	GROUP BY {group_col}
	ORDER BY avg_value DESC
	""")
	return self.cursor.fetchall()


	def get_total_by_month(self, table: str, date_col: str, value_col: str) -> list:
	"""
	Get total value grouped by month (from a date column).

	Args:
	table (str): Name of the table.
	date_col (str): Column containing date values.
	value_col (str): Column to aggregate using SUM.

	Returns:
	list[tuple]: List of (month, total) pairs in chronological order.
	e.g: [("2024-01", 1200), ("2024-02", 1800)]
	"""
	self.cursor.execute(f"""
	SELECT DATE_FORMAT({date_col}, '%Y-%m') AS month, SUM({value_col}) AS total
	FROM {table}
	GROUP BY month
	ORDER BY month
	""")
	return self.cursor.fetchall()


	def get_entity_trend(self, table: str, entity_col: str, entity_value: str, date_col: str, value_col: str) -> list:
	"""
	Get monthly trend of total value for a specific entity (e.g. employee, product).

	Args:
	table (str): Name of the table.
	entity_col (str): Column to filter by (e.g. 'employee_name').
	entity_value (str): Value of the entity to track.
	date_col (str): Date column to group by month.
	value_col (str): Column to sum.

	Returns:
	list[tuple]: List of (month, total) for the specified entity.
	e.g: [("2024-01", 300), ("2024-02", 500)]
	"""
	self.cursor.execute(f"""
	SELECT DATE_FORMAT({date_col}, '%Y-%m') AS month, SUM({value_col}) AS total
	FROM {table}
	WHERE {entity_col} = %s
	GROUP BY month
	ORDER BY month
	""", (entity_value,))
	return self.cursor.fetchall()


	def compare_plan_vs_actual(self, plan_table: str, actual_table: str, match_col: str, value_col: str) -> list:
	"""
	Compare planned vs actual values for a common attribute (e.g. product_name).

	Args:
	plan_table (str): Table containing planned values.
	actual_table (str): Table containing actual values.
	match_col (str): Column used to join the two tables (e.g. 'product_name').
	value_col (str): Column to compare (e.g. 'quantity').

	Returns:
	list[tuple]: List of (item, planned, actual, difference) sorted by difference.
	e.g: [("Product A", 1000, 800, -200)]
	"""
	self.cursor.execute(f"""
	SELECT
	p.{match_col},
	SUM(p.{value_col}) AS planned,
	SUM(a.{value_col}) AS actual,
	SUM(a.{value_col}) - SUM(p.{value_col}) AS difference
	FROM {plan_table} p
	JOIN {actual_table} a ON p.{match_col} = a.{match_col}
	GROUP BY p.{match_col}
	ORDER BY difference DESC
	""")
	return self.cursor.fetchall()


	def get_monthly_growth(self, table: str, date_col: str, value_col: str) -> list:
	"""
	Calculate monthly growth rate based on summed values.

	Args:
	table (str): Name of the table.
	date_col (str): Date column to group by month.
	value_col (str): Column to aggregate using SUM.

	Returns:
	list[tuple]: List of (month, total, previous_total, growth_rate) for each month.
	e.g: [("2024-02", 1200, 1000, 20.0)]
	"""
	self.cursor.execute(f"""
	SELECT
	month,
	total,
	LAG(total) OVER (ORDER BY month) AS prev_total,
	ROUND((total - LAG(total) OVER (ORDER BY month)) / LAG(total) OVER (ORDER BY month) * 100, 2) AS growth_rate
	FROM (
	SELECT DATE_FORMAT({date_col}, '%Y-%m') AS month, SUM({value_col}) AS total
	FROM {table}
	GROUP BY month
	) AS subquery
	""")
	return self.cursor.fetchall()


	def get_best_employees_by_score(self, kpi_table: str, salein_table: str, top_n: int = 5) -> List[Dict]:
	"""
	Calculate a composite score for employees based on average KPI and total quantity.

	Args:
	kpi_table (str): Name of the KPI table (columns: employee_name, kpi_score).
	salein_table (str): Name of the actual sales table (columns: employee_name, quantity).
	top_n (int): Number of top employees to return.

	Returns:
	list[dict]: List of employees with their average KPI, total quantity, and composite score.
	Example:
	[{"employee_name": "A", "avg_kpi": 85.5, "quantity": 1200, "score": 91.3}]
	"""
	# Join both tables on employee_name, then calculate weighted score
	self.cursor.execute(f"""
	SELECT
	k.employee_name,
	AVG(k.kpi_score) AS avg_kpi,
	SUM(s.quantity) AS total_quantity,
	ROUND(0.6 * AVG(k.kpi_score) + 0.4 * SUM(s.quantity), 2) AS composite_score
	FROM {kpi_table} k
	JOIN {salein_table} s ON k.employee_name = s.employee_name
	GROUP BY k.employee_name
	ORDER BY composite_score DESC
	LIMIT %s
	""", (top_n,))
	return [
	{
	"employee_name": row[0],
	"avg_kpi": float(row[1]),
	"quantity": float(row[2]),
	"score": float(row[3])
	}
	for row in self.cursor.fetchall()
	]


	def get_best_products_by_region(self, table: str, top_n: int = 5) -> List[Dict]:
	"""
	Find the most prominent products across regions based on quantity and regional coverage.

	Args:
	table (str): Table name (columns: product_name, province, quantity).
	top_n (int): Number of top products to return.

	Returns:
	list[dict]: List of products with number of provinces, total quantity, and score.
	Example:
	[{"product_name": "X", "provinces": 8, "quantity": 3200, "score": 1604.0}]
	"""
	# Count distinct provinces and sum quantity, then compute composite score
	self.cursor.execute(f"""
	SELECT
	product_name,
	COUNT(DISTINCT province) AS province_coverage,
	SUM(quantity) AS total_quantity,
	ROUND(0.5 * COUNT(DISTINCT province) + 0.5 * SUM(quantity), 2) AS composite_score
	FROM {table}
	GROUP BY product_name
	ORDER BY composite_score DESC
	LIMIT %s
	""", (top_n,))
	return [
	{
	"product_name": row[0],
	"provinces": int(row[1]),
	"quantity": float(row[2]),
	"score": float(row[3])
	}
	for row in self.cursor.fetchall()
	]

	def get_deliver_by_region_per_month(self, regions: List[str], department: str, year: int) -> List[dict]:
	"""
	Get total deliver value by region per month for a specific department and year.
	"""
	placeholders = ','.join(['%s'] * len(regions))
	query = f"""
	SELECT
	Organization,
	DATE_FORMAT(Date, '%%Y-%%m') AS Month,
	SUM(Deliver) AS Total_Deliver
	FROM kpi_thuc_xuat
	WHERE YEAR(Date) = %s AND Department = %s AND Organization IN ({placeholders})
	GROUP BY Organization, Month
	ORDER BY Organization, Month
	"""
	params = [year, department] + regions
	self.cursor.execute(query, params)
	rows = self.cursor.fetchall()
	return [{"organization": r[0], "month": r[1], "total_deliver": float(r[2])} for r in rows]

	def get_plan_vs_actual_same_day(self, date: str, department: str, regions: List[str]) -> List[dict]:
	"""
	Compare planned vs actual values for a given date, department, and regions.
	"""
	placeholders = ','.join(['%s'] * len(regions))

	self.cursor.execute(f"""
	SELECT Organization, SUM(Deliver)
	FROM kpi_thuc_xuat
	WHERE Date = %s AND Department = %s AND Organization IN ({placeholders})
	GROUP BY Organization
	""", [date, department] + regions)
	actual = {r[0]: float(r[1]) for r in self.cursor.fetchall()}

	self.cursor.execute(f"""
	SELECT Organization, SUM(SaleIn)
	FROM salein_class
	WHERE Date = %s AND Department = %s AND Organization IN ({placeholders})
	GROUP BY Organization
	""", [date, department] + regions)
	plan = {r[0]: float(r[1]) for r in self.cursor.fetchall()}

	all_keys = set(actual) \| set(plan)
	return [{
	"organization": org,
	"planned": plan.get(org, 0),
	"actual": actual.get(org, 0),
	"difference": actual.get(org, 0) - plan.get(org, 0)
	} for org in all_keys]

	def get_completion_rate_by_department_per_month(self, year: int) -> List[dict]:
	"""
	Get monthly completion rate (actual / planned) per department.
	"""
	self.cursor.execute(f"""
	SELECT s.Department, DATE_FORMAT(s.Date, '%%Y-%%m') AS Month,
	ROUND(SUM(t.SaleIn) / SUM(s.SaleIn) * 100, 2) AS Completion_Rate
	FROM salein_class s
	JOIN salein_thuc_xuat t ON s.Department = t.Department AND DATE(s.Date) = DATE(t.Date)
	WHERE YEAR(s.Date) = %s
	GROUP BY s.Department, Month
	""", (year,))
	rows = self.cursor.fetchall()
	return [{"department": r[0], "month": r[1], "completion_rate": float(r[2])} for r in rows]

	def get_avg_kpi_by_month(self, year: int) -> List[dict]:
	"""
	Get average KPI score per month for a given year.
	"""
	self.cursor.execute(f"""
	SELECT DATE_FORMAT(Date, '%%Y-%%m') AS Month, ROUND(AVG(kpi_score), 2)
	FROM kpi_thuc_xuat
	WHERE YEAR(Date) = %s
	GROUP BY Month
	""", (year,))
	return [{"month": r[0], "avg_kpi": float(r[1])} for r in self.cursor.fetchall()]

	def get_salein_comparison_by_region_year(self, years: List[int]) -> List[dict]:
	"""
	Compare sale-in performance by region across multiple years.
	"""
	placeholders = ','.join(['%s'] * len(years))
	self.cursor.execute(f"""
	SELECT Organization, YEAR(Date), SUM(SaleIn)
	FROM salein_thuc_xuat
	WHERE YEAR(Date) IN ({placeholders})
	GROUP BY Organization, YEAR(Date)
	ORDER BY Organization, YEAR(Date)
	""", years)
	return [{"organization": r[0], "year": r[1], "total_salein": float(r[2])} for r in self.cursor.fetchall()]

	def main():
	    """Load the CSV data into MySQL, then run and print every demo query.

	    ``create_tables()`` uses and closes its own connection, so a second
	    connection is opened for the queries. If that connection cannot be
	    opened the demo stops instead of failing on a missing cursor. MySQL
	    errors raised by a query are printed and end the demo; the connection
	    is closed in every case.
	    """
	    db = Database()

	    db.create_tables()

	    try:
	        if not db.connect():
	            # connect() has already printed the reason. Without a cursor the
	            # first query would raise AttributeError, which the handler
	            # below does not catch.
	            return

	        print("\n📌 DISTINCT VALUES IN COLUMN:")
	        regions = db.get_distinct_values("kpi_thuc_xuat", "region")
	        print("Regions:", regions)

	        print("\n📌 TOTAL KPI:")
	        total_kpi = db.get_total("kpi_thuc_xuat", "kpi_score")
	        print("Total KPI Score:", total_kpi)

	        print("\n📌 RECORD COUNT:")
	        row_count = db.get_count("salein_thuc_xuat")
	        print("Rows in salein_thuc_xuat:", row_count)

	        print("\n📌 TOTAL BY GROUP:")
	        quantity_by_province = db.get_total_by_group("salein_thuc_xuat", "province", "quantity")
	        print("Quantity by province:", quantity_by_province)

	        print("\n📌 AVERAGE KPI BY REGION:")
	        avg_kpi = db.get_avg_by_group("kpi_thuc_xuat", "region", "kpi_score")
	        print("Avg KPI per region:", avg_kpi)

	        print("\n📌 TOTAL BY MONTH:")
	        total_monthly = db.get_total_by_month("salein_thuc_xuat", "date", "quantity")
	        print("Monthly totals:", total_monthly)

	        print("\n📌 EMPLOYEE TREND (test name in your dataset):")
	        trend = db.get_entity_trend("salein_thuc_xuat", "employee_name", "Bảo Thế Nguyễn", "date", "quantity")
	        print("Trend for 'Bảo Thế Nguyễn':", trend)

	        print("\n📌 COMPARE PLAN VS ACTUAL:")
	        comparison = db.compare_plan_vs_actual("salein_class", "salein_thuc_xuat", "product_name", "quantity")
	        print("Plan vs Actual:", comparison)

	        print("\n📌 MONTHLY GROWTH:")
	        growth = db.get_monthly_growth("salein_thuc_xuat", "date", "quantity")
	        print("Monthly Growth:", growth)

	        print("\n📌 TOP EMPLOYEES BY KPI + QUANTITY:")
	        top_employees = db.get_best_employees_by_score("kpi_thuc_xuat", "salein_thuc_xuat", top_n=5)
	        print(json.dumps(top_employees, indent=4, ensure_ascii=False))

	        print("\n📌 TOP PRODUCTS BY REGION + QUANTITY:")
	        top_products = db.get_best_products_by_region("salein_thuc_xuat", top_n=5)
	        print(json.dumps(top_products, indent=4, ensure_ascii=False))

	        print("\n📌 DELIVERY BY REGION PER MONTH (BH1 - 2024):")
	        delivery_stats = db.get_deliver_by_region_per_month(["TV01", "TV02", "TV03"], "BH1", 2024)
	        print(json.dumps(delivery_stats, indent=4, ensure_ascii=False))

	        print("\n📌 PLAN VS ACTUAL SAME DAY (1/2/2024):")
	        plan_vs_actual = db.get_plan_vs_actual_same_day("2024-02-01", "BH1", ["TV01", "TV02", "TV03"])
	        print(json.dumps(plan_vs_actual, indent=4, ensure_ascii=False))

	        print("\n📌 COMPLETION RATE PER DEPARTMENT (2024):")
	        completion_rate = db.get_completion_rate_by_department_per_month(2024)
	        print(json.dumps(completion_rate, indent=4, ensure_ascii=False))

	        print("\n📌 AVG KPI PER MONTH (2024):")
	        avg_kpi_month = db.get_avg_kpi_by_month(2024)
	        print(json.dumps(avg_kpi_month, indent=4, ensure_ascii=False))

	        print("\n📌 COMPARE SALEIN BY REGION BY YEAR:")
	        salein_years = db.get_salein_comparison_by_region_year([2023, 2024])
	        print(json.dumps(salein_years, indent=4, ensure_ascii=False))

	    except Error as e:
	        print(f"❌ MySQL Error: {e}")

	    finally:
	        db.close()


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()