# Hugging Face Space: tlong-ds/daai_rangdong (status: Sleeping)
# File size: 26,447 bytes - revision 777f36a

import pandas as pd
import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from langchain_core.tools import tool
import decimal
import numpy as np
from collections import defaultdict
from typing import List, Dict
import json
load_dotenv()

class Database:
    """MySQL access layer for the sales / KPI data set.

    Connection settings are read from the ``MYSQL_HOST``, ``MYSQL_USER``,
    ``MYSQL_PASSWORD``, ``MYSQL_DB`` and ``MYSQL_PORT`` environment variables.
    ``create_tables`` and ``extract_tables_schemas`` open and close their own
    connection; every other query method uses ``self.cursor`` directly and
    therefore needs ``connect()`` to have succeeded first.

    Table and column names given to the generic query helpers have to be
    interpolated into the SQL text (identifiers cannot be bound as
    parameters), so they are validated and backtick-quoted first. A
    ``ValueError`` is raised for anything that is not a plain ASCII
    identifier.
    """

    # DATE_FORMAT pattern producing "YYYY-MM".  It is always sent as a bound
    # parameter so its '%' characters never go through the driver's
    # placeholder handling, whatever escaping rules that driver applies.
    _MONTH_FORMAT = "%Y-%m"

    def __init__(self):
        self.connection = None
        self.cursor = None
        # Directory (relative to the working directory) holding the CSV files.
        self.data_dir = "sales_data"

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _quote_identifier(name, allow_qualified=False):
        """Validate a SQL identifier and return it backtick-quoted.

        Args:
            name (str): Table or column name supplied by the caller.
            allow_qualified (bool): Accept dotted names such as ``db.table``;
                every part is validated and quoted separately.

        Returns:
            str: The quoted identifier, e.g. ``"`kpi_score`"``.

        Raises:
            ValueError: If ``name`` is not a string of ASCII letters, digits
                and underscores that does not start with a digit.
        """
        if not isinstance(name, str):
            raise ValueError(f"Invalid SQL identifier: {name!r}")
        parts = name.split(".") if allow_qualified else [name]
        for part in parts:
            if not (part.isascii() and part.isidentifier()):
                raise ValueError(f"Invalid SQL identifier: {name!r}")
        return ".".join(f"`{part}`" for part in parts)

    @staticmethod
    def _to_float(value, default=None):
        """Convert a DB numeric to ``float``; map SQL NULL (``None``) to ``default``."""
        return default if value is None else float(value)

    def _load_csv_rows(self, filename, columns):
        """Read ``filename`` from ``self.data_dir`` and return insertable row tuples.

        Args:
            filename (str): CSV file name inside ``self.data_dir``.
            columns (Sequence[str]): CSV header names, in the order the INSERT
                statement expects its values.

        Returns:
            list[tuple]: One tuple per CSV row; missing cells become ``None``
            so they are stored as SQL NULL instead of a float NaN.
        """
        frame = pd.read_csv(os.path.join(self.data_dir, filename))
        return [
            tuple(None if pd.isna(cell) else cell for cell in record)
            for record in frame[list(columns)].itertuples(index=False, name=None)
        ]

    # ------------------------------------------------------------------
    # Connection handling
    # ------------------------------------------------------------------
    def connect(self):
        """Establish connection to MySQL database.

        Returns:
            bool: ``True`` when the connection is up and ``self.cursor`` is
            ready, ``False`` otherwise (connection errors are printed).
        """
        try:
            self.connection = mysql.connector.connect(
                host=os.getenv('MYSQL_HOST'),
                user=os.getenv('MYSQL_USER'),
                password=os.getenv('MYSQL_PASSWORD'),
                database=os.getenv('MYSQL_DB'),
                # `or` also covers MYSQL_PORT being set to an empty string.
                port=int(os.getenv('MYSQL_PORT') or 3306)
            )
            if self.connection.is_connected():
                self.cursor = self.connection.cursor()
                return True
        except Error as e:
            print(f"Error connecting to MySQL: {e}")
        # Reached on an error and on a connection that reports "not connected".
        return False

    def close(self):
        """Close the cursor and the connection if the connection is still open."""
        if self.connection and self.connection.is_connected():
            if self.cursor:
                self.cursor.close()
            self.connection.close()
            print("MySQL connection closed")

    # ------------------------------------------------------------------
    # Table creation / CSV import
    #
    # NOTE(review): the tables below have no unique key besides the
    # auto-increment id, so INSERT IGNORE cannot detect duplicates; running
    # the import again appends the CSV rows a second time.
    # ------------------------------------------------------------------
    def create_salein_class(self):
        """Create the salein_class table and load salein_class.csv into it.

        Returns:
            bool: ``True`` on success, ``False`` if MySQL reported an error.
        """
        try:
            self.cursor.execute("""

                CREATE TABLE IF NOT EXISTS salein_class (

                    id INT AUTO_INCREMENT PRIMARY KEY,

                    date DATE,

                    employee_name VARCHAR(255),

                    department VARCHAR(255),

                    item_code VARCHAR(50),

                    product_name VARCHAR(255),

                    quantity INT

                )

            """)

            rows = self._load_csv_rows(
                "salein_class.csv",
                ('Date', 'Employee_Name', 'Department', 'item_code', 'product_name', 'Số lượng'),
            )
            if rows:
                self.cursor.executemany("""

                    INSERT IGNORE INTO salein_class (date, employee_name, department, item_code, product_name, quantity)

                    VALUES (%s, %s, %s, %s, %s, %s)

                """, rows)
            print("Successfully created salein_class table")
            return True
        except Error as e:
            print(f"Error creating salein_class table: {e}")
            return False

    def create_salein_thuc_xuat(self):
        """Create the salein_thuc_xuat table and load salein_thuc_xuat.csv into it.

        Returns:
            bool: ``True`` on success, ``False`` if MySQL reported an error.
        """
        try:
            self.cursor.execute("""

                CREATE TABLE IF NOT EXISTS salein_thuc_xuat (

                    id INT AUTO_INCREMENT PRIMARY KEY,

                    date DATE,

                    employee_name VARCHAR(255),

                    unit_code VARCHAR(255),

                    product_name VARCHAR(255),

                    quantity INT,

                    province VARCHAR(255)

                )

            """)

            rows = self._load_csv_rows(
                "salein_thuc_xuat.csv",
                ('Date', 'Employee_Name', 'unit_code', 'product_name', 'Số lượng xuất', 'province'),
            )
            if rows:
                self.cursor.executemany("""

                    INSERT IGNORE INTO salein_thuc_xuat (date, employee_name, unit_code, product_name, quantity, province)

                    VALUES (%s, %s, %s, %s, %s, %s)

                """, rows)
            print("Successfully created salein_thuc_xuat table")
            return True
        except Error as e:
            print(f"Error creating salein_thuc_xuat table: {e}")
            return False

    def create_kpi_thuc_xuat(self):
        """Create the kpi_thuc_xuat table and load kpi_thuc_xuat.csv into it.

        Returns:
            bool: ``True`` on success, ``False`` if MySQL reported an error.
        """
        try:
            self.cursor.execute("""

                CREATE TABLE IF NOT EXISTS kpi_thuc_xuat (

                    id INT AUTO_INCREMENT PRIMARY KEY,

                    date DATE,

                    employee_name VARCHAR(255),

                    kpi_code VARCHAR(50),

                    region VARCHAR(255),

                    product_name VARCHAR(255),

                    kpi_score INT

                )

            """)

            rows = self._load_csv_rows(
                "kpi_thuc_xuat.csv",
                ('Date', 'Employee_Name', 'KPI_code', 'region', 'product_name', 'Số KPI'),
            )
            if rows:
                self.cursor.executemany("""

                    INSERT IGNORE INTO kpi_thuc_xuat (date, employee_name, kpi_code, region, product_name, kpi_score)

                    VALUES (%s, %s, %s, %s, %s, %s)

                """, rows)
            print("Successfully created kpi_thuc_xuat table")
            return True
        except Error as e:
            print(f"Error creating kpi_thuc_xuat table: {e}")
            return False

    def create_tables(self):
        """Create all tables and import their CSV data on a dedicated connection.

        Opens a connection, runs the three ``create_*`` loaders, commits and
        always closes the connection again. Returns ``None``; progress and
        errors are printed.
        """
        if not self.connect():
            return

        loaders = (
            (self.create_salein_class, "Salein Class table created and data inserted."),
            (self.create_salein_thuc_xuat, "Salein Thuc Xuat table created and data inserted."),
            (self.create_kpi_thuc_xuat, "KPI Thuc Xuat table created and data inserted."),
        )
        try:
            all_ok = True
            for loader, success_message in loaders:
                if loader():
                    print(success_message)
                else:
                    all_ok = False

            self.connection.commit()
            # Only claim overall success when every loader reported success.
            if all_ok:
                print("All tables created and data inserted successfully.")
            else:
                print("Some tables could not be created or populated; see errors above.")

        except Error as e:
            print(f"Error in database creation or data insertion: {e}")

        finally:
            self.close()

    def extract_tables_schemas(self):
        """Extract schemas of all tables in the configured database.

        Opens its own connection and closes it before returning.

        Returns:
            dict: ``{table_name: {"columns": [...], "foreign_keys": [...]}}``
            where each column entry has the keys ``name``, ``type``,
            ``nullable``, ``key``, ``extra`` and ``comment`` and each foreign
            key entry has ``column``, ``references_table`` and
            ``references_column``. An empty dict is returned when the
            connection fails or MySQL reports an error.
        """
        if not self.connect():
            return {}

        db_name = os.getenv('MYSQL_DB')
        try:
            # Get list of tables
            self.cursor.execute("""

                SELECT table_name 

                FROM information_schema.tables 

                WHERE table_schema = %s

            """, (db_name,))
            tables = self.cursor.fetchall()

            schemas = {}
            for (table_name,) in tables:
                # Get columns info for each table
                self.cursor.execute("""

                    SELECT 

                        column_name,

                        column_type,

                        is_nullable,

                        column_key,

                        extra,

                        column_comment

                    FROM information_schema.columns 

                    WHERE table_schema = %s 

                    AND table_name = %s 

                    ORDER BY ordinal_position

                """, (db_name, table_name))
                columns = self.cursor.fetchall()

                # Get foreign keys info
                self.cursor.execute("""

                    SELECT

                        column_name,

                        referenced_table_name,

                        referenced_column_name

                    FROM information_schema.key_column_usage

                    WHERE table_schema = %s

                    AND table_name = %s

                    AND referenced_table_name IS NOT NULL

                """, (db_name, table_name))
                foreign_keys = self.cursor.fetchall()

                # Format schema information
                schemas[table_name] = {
                    'columns': [
                        {
                            'name': col[0],
                            'type': col[1],
                            'nullable': col[2],
                            'key': col[3],
                            'extra': col[4],
                            'comment': col[5]
                        }
                        for col in columns
                    ],
                    'foreign_keys': [
                        {
                            'column': fk[0],
                            'references_table': fk[1],
                            'references_column': fk[2]
                        }
                        for fk in foreign_keys
                    ]
                }

            return schemas

        except Error as e:
            print(f"Error extracting table schemas: {e}")
            return {}

        finally:
            self.close()

    # ------------------------------------------------------------------
    # Generic query helpers (table / column names supplied by the caller)
    # ------------------------------------------------------------------
    def get_distinct_values(self, table: str, column: str) -> list:
        """Get distinct values of a column in a table.

        Args:
            table (str): Name of the table.
            column (str): Column name to retrieve distinct values from.

        Returns:
            list: A list of unique values found in the specified column.

        Raises:
            ValueError: If ``table`` or ``column`` is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        column_sql = self._quote_identifier(column)
        self.cursor.execute(f"SELECT DISTINCT {column_sql} FROM {table_sql}")
        return [row[0] for row in self.cursor.fetchall()]

    def get_total(self, table: str, value_col: str) -> float:
        """Get the total sum of a numeric column in a table.

        Args:
            table (str): Name of the table.
            value_col (str): Name of the column to sum.

        Returns:
            float: The total sum of the specified column; ``0.0`` when the
            sum is NULL (for example on an empty table).

        Raises:
            ValueError: If ``table`` or ``value_col`` is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        value_sql = self._quote_identifier(value_col)
        self.cursor.execute(f"SELECT SUM({value_sql}) FROM {table_sql}")
        return self._to_float(self.cursor.fetchone()[0], default=0.0)

    def get_count(self, table: str) -> int:
        """Count total number of records in a table.

        Args:
            table (str): Name of the table.

        Returns:
            int: Total number of rows in the table.

        Raises:
            ValueError: If ``table`` is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        self.cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
        return self.cursor.fetchone()[0]

    def get_total_by_group(self, table: str, group_col: str, value_col: str) -> list:
        """Get total value grouped by a specified column.

        Args:
            table (str): Name of the table.
            group_col (str): Column to group by (e.g. 'region', 'department').
            value_col (str): Column to aggregate using SUM.

        Returns:
            list[tuple]: List of (group_value, total) pairs sorted by total
            descending, e.g. ``[("North", 5000), ("South", 3000)]``.

        Raises:
            ValueError: If any name is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        group_sql = self._quote_identifier(group_col)
        value_sql = self._quote_identifier(value_col)
        self.cursor.execute(f"""
            SELECT {group_sql}, SUM({value_sql}) AS total
            FROM {table_sql}
            GROUP BY {group_sql}
            ORDER BY total DESC
        """)
        return self.cursor.fetchall()

    def get_avg_by_group(self, table: str, group_col: str, value_col: str) -> list:
        """Get average value grouped by a specified column.

        Args:
            table (str): Name of the table.
            group_col (str): Column to group by (e.g. 'product_name').
            value_col (str): Column to aggregate using AVG.

        Returns:
            list[tuple]: List of (group_value, average) pairs sorted by
            average descending, e.g. ``[("Product A", 105.4)]``.

        Raises:
            ValueError: If any name is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        group_sql = self._quote_identifier(group_col)
        value_sql = self._quote_identifier(value_col)
        self.cursor.execute(f"""
            SELECT {group_sql}, AVG({value_sql}) AS avg_value
            FROM {table_sql}
            GROUP BY {group_sql}
            ORDER BY avg_value DESC
        """)
        return self.cursor.fetchall()

    def get_total_by_month(self, table: str, date_col: str, value_col: str) -> list:
        """Get total value grouped by month (from a date column).

        Args:
            table (str): Name of the table.
            date_col (str): Column containing date values.
            value_col (str): Column to aggregate using SUM.

        Returns:
            list[tuple]: List of (month, total) pairs in chronological order,
            e.g. ``[("2024-01", 1200), ("2024-02", 1800)]``.

        Raises:
            ValueError: If any name is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        date_sql = self._quote_identifier(date_col)
        value_sql = self._quote_identifier(value_col)
        self.cursor.execute(f"""
            SELECT DATE_FORMAT({date_sql}, %s) AS month, SUM({value_sql}) AS total
            FROM {table_sql}
            GROUP BY month
            ORDER BY month
        """, (self._MONTH_FORMAT,))
        return self.cursor.fetchall()

    def get_entity_trend(self, table: str, entity_col: str, entity_value: str, date_col: str, value_col: str) -> list:
        """Get monthly trend of total value for a specific entity (e.g. employee, product).

        Args:
            table (str): Name of the table.
            entity_col (str): Column to filter by (e.g. 'employee_name').
            entity_value (str): Value of the entity to track (bound as a
                query parameter).
            date_col (str): Date column to group by month.
            value_col (str): Column to sum.

        Returns:
            list[tuple]: List of (month, total) for the specified entity,
            e.g. ``[("2024-01", 300), ("2024-02", 500)]``.

        Raises:
            ValueError: If any table/column name is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        entity_sql = self._quote_identifier(entity_col)
        date_sql = self._quote_identifier(date_col)
        value_sql = self._quote_identifier(value_col)
        # Parameter order follows placeholder order: month format, then entity.
        self.cursor.execute(f"""
            SELECT DATE_FORMAT({date_sql}, %s) AS month, SUM({value_sql}) AS total
            FROM {table_sql}
            WHERE {entity_sql} = %s
            GROUP BY month
            ORDER BY month
        """, (self._MONTH_FORMAT, entity_value))
        return self.cursor.fetchall()

    def compare_plan_vs_actual(self, plan_table: str, actual_table: str, match_col: str, value_col: str) -> list:
        """Compare planned vs actual values for a common attribute (e.g. product_name).

        Each table is summed per ``match_col`` value *before* the two are
        joined. Joining the raw rows first would pair every plan row with
        every actual row of the same key and multiply both sums.

        Args:
            plan_table (str): Table containing planned values.
            actual_table (str): Table containing actual values.
            match_col (str): Column used to join the two tables (e.g. 'product_name').
            value_col (str): Column to compare (e.g. 'quantity').

        Returns:
            list[tuple]: List of (item, planned, actual, difference) sorted by
            difference descending, where difference = actual - planned. Only
            items present in both tables are returned,
            e.g. ``[("Product A", 1000, 800, -200)]``.

        Raises:
            ValueError: If any name is not a valid identifier.
        """
        plan_sql = self._quote_identifier(plan_table, allow_qualified=True)
        actual_sql = self._quote_identifier(actual_table, allow_qualified=True)
        match_sql = self._quote_identifier(match_col)
        value_sql = self._quote_identifier(value_col)
        self.cursor.execute(f"""
            SELECT
                p.match_key,
                p.planned,
                a.actual,
                a.actual - p.planned AS difference
            FROM (
                SELECT {match_sql} AS match_key, SUM({value_sql}) AS planned
                FROM {plan_sql}
                GROUP BY {match_sql}
            ) AS p
            JOIN (
                SELECT {match_sql} AS match_key, SUM({value_sql}) AS actual
                FROM {actual_sql}
                GROUP BY {match_sql}
            ) AS a ON p.match_key = a.match_key
            ORDER BY difference DESC
        """)
        return self.cursor.fetchall()

    def get_monthly_growth(self, table: str, date_col: str, value_col: str) -> list:
        """Calculate monthly growth rate based on summed values.

        Args:
            table (str): Name of the table.
            date_col (str): Date column to group by month.
            value_col (str): Column to aggregate using SUM.

        Returns:
            list[tuple]: List of (month, total, previous_total, growth_rate)
            in chronological order, e.g. ``[("2024-02", 1200, 1000, 20.0)]``.
            The first month has no predecessor, so its previous_total and
            growth_rate are NULL (``None``).

        Raises:
            ValueError: If any name is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        date_sql = self._quote_identifier(date_col)
        value_sql = self._quote_identifier(value_col)
        # The outer ORDER BY makes the row order explicit; the window ORDER BY
        # only controls which row LAG() looks at.
        self.cursor.execute(f"""
            SELECT
                month,
                total,
                LAG(total) OVER (ORDER BY month) AS prev_total,
                ROUND((total - LAG(total) OVER (ORDER BY month)) / LAG(total) OVER (ORDER BY month) * 100, 2) AS growth_rate
            FROM (
                SELECT DATE_FORMAT({date_sql}, %s) AS month, SUM({value_sql}) AS total
                FROM {table_sql}
                GROUP BY month
            ) AS subquery
            ORDER BY month
        """, (self._MONTH_FORMAT,))
        return self.cursor.fetchall()

    def get_best_employees_by_score(self, kpi_table: str, salein_table: str, top_n: int = 5) -> List[Dict]:
        """Rank employees by a composite of average KPI and total quantity.

        The score is ``0.6 * avg_kpi + 0.4 * total_quantity`` rounded to two
        decimals. KPI and quantity are aggregated per employee in separate
        subqueries before joining, so the quantity total is not multiplied by
        the number of KPI rows the employee has.

        Args:
            kpi_table (str): Name of the KPI table (columns: employee_name, kpi_score).
            salein_table (str): Name of the actual sales table (columns: employee_name, quantity).
            top_n (int): Number of top employees to return.

        Returns:
            list[dict]: Employees present in both tables, best score first,
            e.g. ``[{"employee_name": "A", "avg_kpi": 85.5, "quantity": 1200, "score": 531.3}]``.
            A numeric field is ``None`` when the database returned NULL.

        Raises:
            ValueError: If a table name is not a valid identifier.
        """
        kpi_sql = self._quote_identifier(kpi_table, allow_qualified=True)
        salein_sql = self._quote_identifier(salein_table, allow_qualified=True)
        self.cursor.execute(f"""
            SELECT
                k.employee_name,
                k.avg_kpi,
                s.total_quantity,
                ROUND(0.6 * k.avg_kpi + 0.4 * s.total_quantity, 2) AS composite_score
            FROM (
                SELECT employee_name, AVG(kpi_score) AS avg_kpi
                FROM {kpi_sql}
                GROUP BY employee_name
            ) AS k
            JOIN (
                SELECT employee_name, SUM(quantity) AS total_quantity
                FROM {salein_sql}
                GROUP BY employee_name
            ) AS s ON k.employee_name = s.employee_name
            ORDER BY composite_score DESC
            LIMIT %s
        """, (top_n,))
        return [
            {
                "employee_name": row[0],
                "avg_kpi": self._to_float(row[1]),
                "quantity": self._to_float(row[2]),
                "score": self._to_float(row[3])
            }
            for row in self.cursor.fetchall()
        ]

    def get_best_products_by_region(self, table: str, top_n: int = 5) -> List[Dict]:
        """Find the most prominent products based on quantity and province coverage.

        The score is ``0.5 * distinct_provinces + 0.5 * total_quantity``
        rounded to two decimals.

        Args:
            table (str): Table name (columns: product_name, province, quantity).
            top_n (int): Number of top products to return.

        Returns:
            list[dict]: Products, best score first,
            e.g. ``[{"product_name": "X", "provinces": 8, "quantity": 3200, "score": 1604.0}]``.
            ``quantity`` / ``score`` are ``None`` when the database returned NULL.

        Raises:
            ValueError: If ``table`` is not a valid identifier.
        """
        table_sql = self._quote_identifier(table, allow_qualified=True)
        self.cursor.execute(f"""
            SELECT
                product_name,
                COUNT(DISTINCT province) AS province_coverage,
                SUM(quantity) AS total_quantity,
                ROUND(0.5 * COUNT(DISTINCT province) + 0.5 * SUM(quantity), 2) AS composite_score
            FROM {table_sql}
            GROUP BY product_name
            ORDER BY composite_score DESC
            LIMIT %s
        """, (top_n,))
        return [
            {
                "product_name": row[0],
                "provinces": int(row[1]),
                "quantity": self._to_float(row[2]),
                "score": self._to_float(row[3])
            }
            for row in self.cursor.fetchall()
        ]

    # ------------------------------------------------------------------
    # Fixed reports
    #
    # NOTE(review): the column names used below (Organization, Deliver,
    # SaleIn, and Department on kpi_thuc_xuat / salein_thuc_xuat) do not
    # appear in the CREATE TABLE statements of this class. Presumably these
    # reports target tables created elsewhere with a different schema --
    # confirm against the live database before relying on them.
    # ------------------------------------------------------------------
    def get_deliver_by_region_per_month(self, regions: List[str], department: str, year: int) -> List[dict]:
        """Get total deliver value by region per month for a specific department and year.

        Args:
            regions (List[str]): Organization codes to include.
            department (str): Department to filter on.
            year (int): Calendar year to filter on.

        Returns:
            List[dict]: ``{"organization", "month", "total_deliver"}`` entries
            ordered by organization then month. An empty ``regions`` yields
            ``[]`` without querying (``IN ()`` is not valid SQL).
        """
        regions = list(regions)
        if not regions:
            return []
        placeholders = ','.join(['%s'] * len(regions))
        query = f"""
            SELECT
                Organization,
                DATE_FORMAT(Date, %s) AS Month,
                SUM(Deliver) AS Total_Deliver
            FROM kpi_thuc_xuat
            WHERE YEAR(Date) = %s AND Department = %s AND Organization IN ({placeholders})
            GROUP BY Organization, Month
            ORDER BY Organization, Month
        """
        params = [self._MONTH_FORMAT, year, department, *regions]
        self.cursor.execute(query, params)
        return [
            {"organization": r[0], "month": r[1], "total_deliver": self._to_float(r[2])}
            for r in self.cursor.fetchall()
        ]

    def get_plan_vs_actual_same_day(self, date: str, department: str, regions: List[str]) -> List[dict]:
        """Compare planned vs actual values for a given date, department, and regions.

        Actual values are read from ``kpi_thuc_xuat`` (SUM of Deliver) and
        planned values from ``salein_class`` (SUM of SaleIn).

        Args:
            date (str): Date to compare, as accepted by MySQL (e.g. '2024-02-01').
            department (str): Department to filter on.
            regions (List[str]): Organization codes to include.

        Returns:
            List[dict]: ``{"organization", "planned", "actual", "difference"}``
            entries sorted by organization, with difference = actual - planned.
            An organization missing on one side (or with a NULL sum) counts as
            0 there. An empty ``regions`` yields ``[]`` without querying.
        """
        regions = list(regions)
        if not regions:
            return []
        placeholders = ','.join(['%s'] * len(regions))
        params = [date, department, *regions]

        self.cursor.execute(f"""

            SELECT Organization, SUM(Deliver)

            FROM kpi_thuc_xuat

            WHERE Date = %s AND Department = %s AND Organization IN ({placeholders})

            GROUP BY Organization

        """, params)
        actual = {r[0]: self._to_float(r[1], default=0.0) for r in self.cursor.fetchall()}

        self.cursor.execute(f"""

            SELECT Organization, SUM(SaleIn)

            FROM salein_class

            WHERE Date = %s AND Department = %s AND Organization IN ({placeholders})

            GROUP BY Organization

        """, params)
        plan = {r[0]: self._to_float(r[1], default=0.0) for r in self.cursor.fetchall()}

        # Sorted so the output order does not depend on set iteration order.
        return [{
            "organization": org,
            "planned": plan.get(org, 0),
            "actual": actual.get(org, 0),
            "difference": actual.get(org, 0) - plan.get(org, 0)
        } for org in sorted(set(actual) | set(plan))]

    def get_completion_rate_by_department_per_month(self, year: int) -> List[dict]:
        """Get monthly completion rate (actual / planned) per department.

        NOTE(review): the query joins the raw rows of both tables on
        department and day before summing, so each side's sum is repeated
        once per matching row of the other side; the ratio looks distorted
        whenever the row counts differ. Left as is because the intended
        schema cannot be confirmed from this file -- verify and pre-aggregate
        if needed.

        Args:
            year (int): Calendar year to report on.

        Returns:
            List[dict]: ``{"department", "month", "completion_rate"}`` entries;
            ``completion_rate`` is a percentage, or ``None`` when the database
            returned NULL (e.g. the planned sum is zero).
        """
        self.cursor.execute("""
            SELECT s.Department, DATE_FORMAT(s.Date, %s) AS Month,
                ROUND(SUM(t.SaleIn) / SUM(s.SaleIn) * 100, 2) AS Completion_Rate
            FROM salein_class s
            JOIN salein_thuc_xuat t ON s.Department = t.Department AND DATE(s.Date) = DATE(t.Date)
            WHERE YEAR(s.Date) = %s
            GROUP BY s.Department, Month
        """, (self._MONTH_FORMAT, year))
        return [
            {"department": r[0], "month": r[1], "completion_rate": self._to_float(r[2])}
            for r in self.cursor.fetchall()
        ]

    def get_avg_kpi_by_month(self, year: int) -> List[dict]:
        """Get average KPI score per month for a given year.

        Args:
            year (int): Calendar year to report on.

        Returns:
            List[dict]: ``{"month", "avg_kpi"}`` entries; ``avg_kpi`` is
            rounded to two decimals, or ``None`` when the average is NULL.
        """
        self.cursor.execute("""
            SELECT DATE_FORMAT(Date, %s) AS Month, ROUND(AVG(kpi_score), 2)
            FROM kpi_thuc_xuat
            WHERE YEAR(Date) = %s
            GROUP BY Month
        """, (self._MONTH_FORMAT, year))
        return [{"month": r[0], "avg_kpi": self._to_float(r[1])} for r in self.cursor.fetchall()]

    def get_salein_comparison_by_region_year(self, years: List[int]) -> List[dict]:
        """Compare sale-in performance by region across multiple years.

        Args:
            years (List[int]): Calendar years to include.

        Returns:
            List[dict]: ``{"organization", "year", "total_salein"}`` entries
            ordered by organization then year; ``total_salein`` is ``None``
            when the sum is NULL. An empty ``years`` yields ``[]`` without
            querying.
        """
        years = list(years)
        if not years:
            return []
        placeholders = ','.join(['%s'] * len(years))
        self.cursor.execute(f"""

            SELECT Organization, YEAR(Date), SUM(SaleIn)

            FROM salein_thuc_xuat

            WHERE YEAR(Date) IN ({placeholders})

            GROUP BY Organization, YEAR(Date)

            ORDER BY Organization, YEAR(Date)

        """, years)
        return [
            {"organization": r[0], "year": r[1], "total_salein": self._to_float(r[2])}
            for r in self.cursor.fetchall()
        ]

def main():
    """Import the CSV data, then run every report once and print the results.

    ``create_tables`` manages its own connection; afterwards a second
    connection is opened for the read-only reports and closed in ``finally``.
    MySQL errors raised by a report are printed and stop the remaining ones.
    """
    db = Database()

    db.create_tables()

    try:
        # Without a live connection db.cursor stays None and every report
        # below would fail with AttributeError, which `except Error` does
        # not catch -- so stop early instead.
        if not db.connect():
            print("❌ Could not connect to MySQL; skipping report queries.")
            return

        print("\n📌 DISTINCT VALUES IN COLUMN:")
        regions = db.get_distinct_values("kpi_thuc_xuat", "region")
        print("Regions:", regions)

        print("\n📌 TOTAL KPI:")
        total_kpi = db.get_total("kpi_thuc_xuat", "kpi_score")
        print("Total KPI Score:", total_kpi)

        print("\n📌 RECORD COUNT:")
        row_count = db.get_count("salein_thuc_xuat")
        print("Rows in salein_thuc_xuat:", row_count)

        print("\n📌 TOTAL BY GROUP:")
        quantity_by_province = db.get_total_by_group("salein_thuc_xuat", "province", "quantity")
        print("Quantity by province:", quantity_by_province)

        print("\n📌 AVERAGE KPI BY REGION:")
        avg_kpi = db.get_avg_by_group("kpi_thuc_xuat", "region", "kpi_score")
        print("Avg KPI per region:", avg_kpi)

        print("\n📌 TOTAL BY MONTH:")
        total_monthly = db.get_total_by_month("salein_thuc_xuat", "date", "quantity")
        print("Monthly totals:", total_monthly)

        print("\n📌 EMPLOYEE TREND (test name in your dataset):")
        trend = db.get_entity_trend("salein_thuc_xuat", "employee_name", "Bảo Thế Nguyễn", "date", "quantity")
        print("Trend for 'Bảo Thế Nguyễn':", trend)

        print("\n📌 COMPARE PLAN VS ACTUAL:")
        comparison = db.compare_plan_vs_actual("salein_class", "salein_thuc_xuat", "product_name", "quantity")
        print("Plan vs Actual:", comparison)

        print("\n📌 MONTHLY GROWTH:")
        growth = db.get_monthly_growth("salein_thuc_xuat", "date", "quantity")
        print("Monthly Growth:", growth)

        print("\n📌 TOP EMPLOYEES BY KPI + QUANTITY:")
        top_employees = db.get_best_employees_by_score("kpi_thuc_xuat", "salein_thuc_xuat", top_n=5)
        print(json.dumps(top_employees, indent=4, ensure_ascii=False))

        print("\n📌 TOP PRODUCTS BY REGION + QUANTITY:")
        top_products = db.get_best_products_by_region("salein_thuc_xuat", top_n=5)
        print(json.dumps(top_products, indent=4, ensure_ascii=False))

        print("\n📌 DELIVERY BY REGION PER MONTH (BH1 - 2024):")
        delivery_stats = db.get_deliver_by_region_per_month(["TV01", "TV02", "TV03"], "BH1", 2024)
        print(json.dumps(delivery_stats, indent=4, ensure_ascii=False))

        print("\n📌 PLAN VS ACTUAL SAME DAY (1/2/2024):")
        plan_vs_actual = db.get_plan_vs_actual_same_day("2024-02-01", "BH1", ["TV01", "TV02", "TV03"])
        print(json.dumps(plan_vs_actual, indent=4, ensure_ascii=False))

        print("\n📌 COMPLETION RATE PER DEPARTMENT (2024):")
        completion_rate = db.get_completion_rate_by_department_per_month(2024)
        print(json.dumps(completion_rate, indent=4, ensure_ascii=False))

        print("\n📌 AVG KPI PER MONTH (2024):")
        avg_kpi_month = db.get_avg_kpi_by_month(2024)
        print(json.dumps(avg_kpi_month, indent=4, ensure_ascii=False))

        print("\n📌 COMPARE SALEIN BY REGION BY YEAR:")
        salein_years = db.get_salein_comparison_by_region_year([2023, 2024])
        print(json.dumps(salein_years, indent=4, ensure_ascii=False))

    except Error as e:
        print(f"❌ MySQL Error: {e}")

    finally:
        db.close()


# Run the import + report demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()