File size: 3,501 Bytes
98a466d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# app/entity_detector.py
import pandas as pd
from typing import Tuple

# Entity-specific canonical schemas, keyed by entity type.
#
# Each entry holds:
#   "indicators"       - strings that detect_entity_type() looks for as
#                        substrings of the lower-cased, stripped column names.
#   "required_matches" - divisor used to turn the number of matched indicators
#                        into a confidence value (capped at 1.0).
#   "aliases"          - canonical field name -> list of alternative column
#                        names. Not read by detect_entity_type(); presumably
#                        consumed by column-mapping code elsewhere (TODO confirm).
#
# NOTE(review): dict order matters to detect_entity_type(), which iterates the
# entries in insertion order, so "sales" is evaluated first.
ENTITY_SCHEMAS = {
    "sales": {
        "indicators": ["timestamp", "total", "amount", "qty", "quantity", "sale_date", "transaction_id"],
        "required_matches": 2,
        "aliases": {
            "timestamp": ["timestamp", "date", "sale_date", "created_at", "transaction_time"],
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "qty": ["qty", "quantity", "units", "pieces", "item_count"],
            "total": ["total", "amount", "line_total", "sales_amount", "price"],
            "store_id": ["store_id", "branch", "location", "outlet_id", "branch_code"],
        }
    },
    "inventory": {
        "indicators": ["stock", "quantity_on_hand", "reorder", "inventory", "current_stock", "warehouse_qty"],
        "required_matches": 2,
        "aliases": {
            # Same product_id alias list as the "sales" and "product" entries.
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "current_stock": ["stock", "quantity_on_hand", "qty_available", "current_quantity"],
            "reorder_point": ["reorder_level", "min_stock", "reorder_point", "threshold"],
            "supplier_id": ["supplier", "supplier_id", "vendor", "vendor_code"],
            "last_stock_date": ["last_stock_date", "last_receipt", "last_updated"],
        }
    },
    "customer": {
        "indicators": ["customer_id", "email", "phone", "customer_name", "client_id", "loyalty_number"],
        "required_matches": 2,
        "aliases": {
            # NOTE(review): "phone" is listed under both customer_id and phone,
            # so an alias lookup on it is ambiguous - confirm intended priority.
            "customer_id": ["customer_id", "client_id", "member_id", "loyalty_number", "phone"],
            "full_name": ["customer_name", "full_name", "name", "client_name"],
            "email": ["email", "email_address", "e_mail"],
            "phone": ["phone", "phone_number", "mobile", "contact"],
        }
    },
    "product": {
        "indicators": ["product_name", "product_id", "sku", "category", "price", "cost", "unit_of_measure"],
        "required_matches": 2,
        "aliases": {
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "product_name": ["product_name", "name", "description", "item_name"],
            "category": ["category", "department", "cat", "family", "classification"],
            "unit_price": ["price", "unit_price", "selling_price", "retail_price"],
            "cost_price": ["cost", "cost_price", "purchase_price", "wholesale_price"],
        }
    }
}

def detect_entity_type(df: pd.DataFrame) -> Tuple[str, float]:
    """
    Auto-detect the entity type of a DataFrame from its column names.

    Every entity in ENTITY_SCHEMAS is scored by counting how many of its
    indicator strings occur as a substring of at least one normalised
    (lower-cased, stripped) column name. The count is divided by the
    entity's "required_matches" and capped at 1.0 to give a confidence.

    The entity with the highest confidence wins. When several entities share
    the same confidence (typically because they all hit the 1.0 cap), the one
    with the larger raw number of matched indicators wins; if that is also
    equal, the entity listed first in ENTITY_SCHEMAS is kept.

    Args:
        df: DataFrame whose column labels are inspected. Cell values are
            never read.

    Returns:
        (entity_type, confidence_score). Falls back to ("sales", 0.0) when no
        entity reaches a confidence above 0.3.
    """
    columns = {str(col).lower().strip() for col in df.columns}

    best_entity = None
    # Ranking key is (confidence, raw match count). Tuples compare
    # element-wise, so the raw count only matters when confidences are equal.
    best_rank = (-1.0, -1)
    for entity_type, config in ENTITY_SCHEMAS.items():
        # Count indicators that appear inside any column name.
        matches = sum(
            1 for indicator in config["indicators"]
            if any(indicator in col for col in columns)
        )

        # Confidence is clamped to the 0.0-1.0 range.
        confidence = min(matches / config["required_matches"], 1.0)

        rank = (confidence, matches)
        # Strict comparison keeps the earlier entity on a complete tie.
        if rank > best_rank:
            best_entity, best_rank = entity_type, rank

    # 30% threshold. With required_matches == 2 a single matched indicator
    # yields 0.5 and therefore already passes.
    if best_entity is not None and best_rank[0] > 0.3:
        return best_entity, best_rank[0]

    # Default to sales if uncertain (most common)
    return "sales", 0.0