Create app/entity_detector.py
Browse files- app/entity_detector.py +80 -0
app/entity_detector.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/entity_detector.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from typing import Tuple
|
| 4 |
+
|
| 5 |
+
# Entity-specific canonical schemas.
#
# Each top-level key is an entity type. Every entry holds:
#   "indicators"       - substrings looked for in the (lower-cased) column
#                        names by detect_entity_type().
#   "required_matches" - number of matched indicators that yields a
#                        confidence of 1.0 in detect_entity_type().
#   "aliases"          - canonical field name -> accepted source column names.
#                        Not read by detect_entity_type(); presumably consumed
#                        by a column-mapping step elsewhere (verify with caller).
#
# Declaration order matters: detect_entity_type() iterates this dict in
# insertion order when scoring entities.
ENTITY_SCHEMAS = {
    "sales": {
        "indicators": [
            "timestamp",
            "total",
            "amount",
            "qty",
            "quantity",
            "sale_date",
            "transaction_id",
        ],
        "required_matches": 2,
        "aliases": {
            "timestamp": ["timestamp", "date", "sale_date", "created_at", "transaction_time"],
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "qty": ["qty", "quantity", "units", "pieces", "item_count"],
            "total": ["total", "amount", "line_total", "sales_amount", "price"],
            "store_id": ["store_id", "branch", "location", "outlet_id", "branch_code"],
        },
    },
    "inventory": {
        "indicators": [
            "stock",
            "quantity_on_hand",
            "reorder",
            "inventory",
            "current_stock",
            "warehouse_qty",
        ],
        "required_matches": 2,
        "aliases": {
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "current_stock": ["stock", "quantity_on_hand", "qty_available", "current_quantity"],
            "reorder_point": ["reorder_level", "min_stock", "reorder_point", "threshold"],
            "supplier_id": ["supplier", "supplier_id", "vendor", "vendor_code"],
            "last_stock_date": ["last_stock_date", "last_receipt", "last_updated"],
        },
    },
    "customer": {
        "indicators": [
            "customer_id",
            "email",
            "phone",
            "customer_name",
            "client_id",
            "loyalty_number",
        ],
        "required_matches": 2,
        "aliases": {
            # NOTE(review): "phone" is listed under both "customer_id" and
            # "phone" below, so a source column named "phone" is ambiguous
            # for any alias resolver - confirm this overlap is intentional.
            "customer_id": ["customer_id", "client_id", "member_id", "loyalty_number", "phone"],
            "full_name": ["customer_name", "full_name", "name", "client_name"],
            "email": ["email", "email_address", "e_mail"],
            "phone": ["phone", "phone_number", "mobile", "contact"],
        },
    },
    "product": {
        "indicators": [
            "product_name",
            "product_id",
            "sku",
            "category",
            "price",
            "cost",
            "unit_of_measure",
        ],
        "required_matches": 2,
        "aliases": {
            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
            "product_name": ["product_name", "name", "description", "item_name"],
            "category": ["category", "department", "cat", "family", "classification"],
            "unit_price": ["price", "unit_price", "selling_price", "retail_price"],
            "cost_price": ["cost", "cost_price", "purchase_price", "wholesale_price"],
        },
    },
}
|
| 51 |
+
|
| 52 |
+
def detect_entity_type(df: pd.DataFrame, schemas=None) -> Tuple[str, float]:
    """Auto-detect the entity type of a DataFrame from its column names.

    Column names are stringified, lower-cased and stripped. An indicator
    counts as matched when it occurs as a substring of at least one column
    name. Each entity's confidence is ``matches / required_matches`` capped
    at 1.0.

    Args:
        df: DataFrame whose ``columns`` are inspected; cell data is not read.
        schemas: Optional mapping of entity type to a config holding
            ``"indicators"`` (list of str) and ``"required_matches"`` (int).
            Defaults to the module-level ``ENTITY_SCHEMAS``.

    Returns:
        ``(entity_type, confidence)`` for the best-scoring entity when its
        confidence exceeds 0.3, otherwise ``("sales", 0.0)``.
    """
    if schemas is None:
        schemas = ENTITY_SCHEMAS

    columns = {str(col).lower().strip() for col in df.columns}

    best_entity = None
    best_rank = (0.0, 0)
    for entity_type, config in schemas.items():
        # Count indicators that appear inside any column name.
        matches = sum(
            1
            for indicator in config["indicators"]
            if any(indicator in col for col in columns)
        )

        # Confidence is capped at 1.0, so several entities can tie on it.
        confidence = min(matches / config["required_matches"], 1.0)

        # Rank by confidence first and raw match count second: without the
        # second key, every entity that reaches its required matches ties at
        # 1.0 and the first-declared one would win regardless of how many
        # indicators actually matched. Strict ">" keeps the first-declared
        # entity on an exact tie.
        rank = (confidence, matches)
        if best_entity is None or rank > best_rank:
            best_entity, best_rank = entity_type, rank

    # Return the best match only if confident enough (30% threshold).
    if best_entity is not None and best_rank[0] > 0.3:
        return best_entity, best_rank[0]

    # Default to sales if uncertain (most common).
    return "sales", 0.0
|