petermutwiri committed on
Commit
9de8307
·
verified ·
1 Parent(s): 3ee7700

Create app/entity_detector.py

Browse files
Files changed (1) hide show
  1. app/entity_detector.py +80 -0
app/entity_detector.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/entity_detector.py
2
+ import pandas as pd
3
+ from typing import Tuple
4
+
5
# Entity-specific canonical schemas.
#
# Each top-level key is an entity type; its value holds:
#   "indicators"       - lowercase substrings that detect_entity_type() looks
#                        for inside the DataFrame's column names.
#   "required_matches" - number of indicator hits at which
#                        detect_entity_type() reports full (1.0) confidence.
#   "aliases"          - canonical field name -> list of source column names.
#                        Not read by detect_entity_type(); presumably consumed
#                        by a column-normalisation step elsewhere (TODO confirm).
#
# NOTE(review): "phone" is listed under both customer["aliases"]["customer_id"]
# and customer["aliases"]["phone"]; confirm how the consumer resolves that.
ENTITY_SCHEMAS = {
    "sales": {
        "indicators": [
            "timestamp",
            "total",
            "amount",
            "qty",
            "quantity",
            "sale_date",
            "transaction_id",
        ],
        "required_matches": 2,
        "aliases": {
            "timestamp": ["timestamp", "date", "sale_date", "created_at", "transaction_time"],
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "qty": ["qty", "quantity", "units", "pieces", "item_count"],
            "total": ["total", "amount", "line_total", "sales_amount", "price"],
            "store_id": ["store_id", "branch", "location", "outlet_id", "branch_code"],
        },
    },
    "inventory": {
        "indicators": [
            "stock",
            "quantity_on_hand",
            "reorder",
            "inventory",
            "current_stock",
            "warehouse_qty",
        ],
        "required_matches": 2,
        "aliases": {
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "current_stock": ["stock", "quantity_on_hand", "qty_available", "current_quantity"],
            "reorder_point": ["reorder_level", "min_stock", "reorder_point", "threshold"],
            "supplier_id": ["supplier", "supplier_id", "vendor", "vendor_code"],
            "last_stock_date": ["last_stock_date", "last_receipt", "last_updated"],
        },
    },
    "customer": {
        "indicators": [
            "customer_id",
            "email",
            "phone",
            "customer_name",
            "client_id",
            "loyalty_number",
        ],
        "required_matches": 2,
        "aliases": {
            "customer_id": ["customer_id", "client_id", "member_id", "loyalty_number", "phone"],
            "full_name": ["customer_name", "full_name", "name", "client_name"],
            "email": ["email", "email_address", "e_mail"],
            "phone": ["phone", "phone_number", "mobile", "contact"],
        },
    },
    "product": {
        "indicators": [
            "product_name",
            "product_id",
            "sku",
            "category",
            "price",
            "cost",
            "unit_of_measure",
        ],
        "required_matches": 2,
        "aliases": {
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "product_name": ["product_name", "name", "description", "item_name"],
            "category": ["category", "department", "cat", "family", "classification"],
            "unit_price": ["price", "unit_price", "selling_price", "retail_price"],
            "cost_price": ["cost", "cost_price", "purchase_price", "wholesale_price"],
        },
    },
}
51
+
52
def detect_entity_type(
    df: pd.DataFrame,
    schemas: "dict | None" = None,
) -> Tuple[str, float]:
    """Guess which entity type a DataFrame holds from its column names.

    Column names are stringified, lower-cased and stripped. For every entity
    in *schemas*, an indicator counts as matched when it occurs as a
    substring of at least one column name. The confidence of an entity is
    ``matches / required_matches`` capped at 1.0.

    Entities are ranked by confidence first and by raw match count second,
    so that an entity matching many indicators wins over one that merely
    reached ``required_matches``. Exact ties go to the entity that appears
    first in *schemas*.

    Args:
        df: DataFrame whose ``columns`` are inspected; its rows are not read.
        schemas: Mapping of entity type to a config dict providing
            ``"indicators"`` (iterable of lowercase substrings) and
            ``"required_matches"`` (non-zero number). Defaults to the
            module-level ``ENTITY_SCHEMAS``.

    Returns:
        ``(entity_type, confidence)`` for the best-ranked entity when its
        confidence exceeds 0.3, otherwise the fallback ``("sales", 0.0)``.
    """
    if schemas is None:
        schemas = ENTITY_SCHEMAS

    # Below this confidence the guess is discarded in favour of the default.
    min_confidence = 0.3

    columns = {str(col).lower().strip() for col in df.columns}

    best_entity = None
    best_rank = (0.0, 0)
    for entity_type, config in schemas.items():
        # Substring test: "stock" also matches a column like "current_stock".
        matches = sum(
            1
            for indicator in config["indicators"]
            if any(indicator in col for col in columns)
        )
        confidence = min(matches / config["required_matches"], 1.0)

        # The capped confidence alone cannot separate entities that all hit
        # 1.0, so the raw match count acts as the tie-breaker. The strict
        # ">" keeps the earlier entity on an exact tie.
        rank = (confidence, matches)
        if best_entity is None or rank > best_rank:
            best_entity, best_rank = entity_type, rank

    if best_entity is not None and best_rank[0] > min_confidence:
        return best_entity, best_rank[0]

    # Default to sales if uncertain (most common)
    return "sales", 0.0