# Business_Chatbot/src/entity_extractor.py
# Ancastal's picture
# Upload folder using huggingface_hub
# 401b16c verified
import re
import spacy
from typing import Optional, Dict, Any
from datetime import datetime
from dateutil import parser as date_parser
from models import EntityExtraction
class EntityExtractor:
    """Heuristic extractor for purchase/sale details in free-form text.

    Extraction is regex-first; when a spaCy pipeline could be loaded it is
    used as a fallback for product, supplier and customer names. All
    ``_extract_*`` helpers return ``None`` when nothing could be found.
    """

    # Integer or decimal literal such as "20" or "2.5".
    _NUMBER = r"\d+(?:\.\d+)?"

    # Unit words accepted directly after a number. Kept in one place so the
    # product, quantity and unit patterns cannot drift apart.
    # "box(?:es)?" is deliberate: "boxes?" would not match the singular "box".
    _UNITS = (
        r"(?:tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?"
        r"|box(?:es)?|liters?|gallons?)"
    )

    # Canonical spelling for every unit form the pattern above can capture.
    _UNIT_ALIASES = {
        'ton': 'tons', 'tons': 'tons',
        'kg': 'kg', 'kilogram': 'kg', 'kilograms': 'kg',
        'pound': 'lbs', 'pounds': 'lbs', 'lb': 'lbs', 'lbs': 'lbs',
        'piece': 'pieces', 'pieces': 'pieces',
        'unit': 'units', 'units': 'units',
        'item': 'items', 'items': 'items',
        'box': 'boxes', 'boxes': 'boxes',
        'liter': 'liters', 'liters': 'liters',
        'gallon': 'gallons', 'gallons': 'gallons',
    }

    # Keyword stems (regex fragments) counted as whole words only, so that
    # e.g. "to" does not fire inside "tons" or "sale" inside "wholesale".
    _PURCHASE_KEYWORDS = (
        r"purchas(?:e[sd]?|ing)", r"buy(?:s|ing)?", r"bought",
        r"order(?:s|ed|ing)?", r"from", r"suppliers?",
    )
    _SALE_KEYWORDS = (
        r"sales?", r"sell(?:s|ing)?", r"sold",
        r"to", r"customers?", r"clients?",
    )

    _TRANSACTION_VERBS = r"(?:bought|purchased|buy|purchase|sold|sale|sell)"

    # Case-insensitive product patterns, tried in order.
    _PRODUCT_PATTERNS = (
        # "<n> <unit> of <product>", e.g. "20 tons of Apples"
        rf"\d+\s*{_UNITS}\s+of\s+([a-zA-Z\s]+?)(?:\s+from|\s+at|\s+for|\s*€|\s*\$|$)",
        # "<verb> [<n> [<unit>]] [of] <product>", e.g. "bought 5 Apples from"
        rf"{_TRANSACTION_VERBS}\s+(?:\d+\s*{_UNITS}?\s+)?(?:of\s+)?([a-zA-Z\s]+?)(?:\s+from|\s+to|\s+at|\s+for|\s*€|\s*\$)",
        # "<n> [x] <product>", e.g. "3 x Widgets at"
        r"\d+\s*(?:x\s+)?([a-zA-Z\s]+?)(?:\s+from|\s+at|\s+for|\s*€|\s*\$)",
    )

    # Capitalized word run. Must be matched case-SENSITIVELY, otherwise
    # "[A-Z]" accepts every word and the pattern loses its meaning.
    _CAPITALIZED_PRODUCT_PATTERN = (
        r"\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*)\b(?!\s+(?:from|at|for|€|\$))"
    )

    # Words that are never a product on their own.
    _NON_PRODUCT_WORDS = frozenset(
        ['from', 'at', 'for', 'to', 'we', 'i', 'you', 'the', 'a', 'an', 'and', 'or']
    )

    _QUANTITY_PATTERNS = (
        # Number with an explicit unit: "20 tons"
        rf"({_NUMBER})\s*{_UNITS}",
        # Number followed by "of" or "x": "3 x ", "12 of "
        rf"({_NUMBER})\s*(?:of|x)\s+",
        # Number right after a transaction verb: "bought 20"
        rf"{_TRANSACTION_VERBS}\s+(?:of\s+)?({_NUMBER})",
        # Number at the very start of the text
        rf"^({_NUMBER})\s+",
    )

    _UNIT_PATTERN = rf"{_NUMBER}\s*({_UNITS})"

    # Currency symbols are matched through the character class "[€$]";
    # a bare "$" outside a class would be the end-of-string anchor.
    _PRICE_PATTERNS = (
        # "at 5", "for €2.50", "at $5", "$5", "€ 3"
        rf"(?:\b(?:at|for)\s*[€$]?|[€$])\s*({_NUMBER})",
        # "5€", "4.20 $"
        rf"({_NUMBER})\s*[€$]",
        # "price of 5", "cost 12.5"
        rf"\b(?:price|cost)\s*(?:of\s*)?({_NUMBER})",
    )

    def __init__(self):
        """Load the small English spaCy pipeline, if it is installed.

        When the model is missing a warning is printed and ``self.nlp`` is
        set to ``None``; extraction then relies on the regex rules alone.
        """
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

    def extract_entities(self, text: str) -> "EntityExtraction":
        """Extract all transaction entities from user input text.

        Args:
            text: Raw user message describing a purchase or a sale.

        Returns:
            An ``EntityExtraction`` filled with whatever could be detected.
            ``supplier`` is only looked up for purchases and ``customer``
            only for sales; the original text is stored in ``notes``.
        """
        transaction_type = self._detect_transaction_type(text.lower())

        quantity = self._extract_quantity(text)
        unit_price = self._extract_unit_price(text)

        return EntityExtraction(
            product=self._extract_product(text),
            quantity=quantity,
            unit=self._extract_unit(text),
            supplier=self._extract_supplier(text) if transaction_type == "purchase" else None,
            customer=self._extract_customer(text) if transaction_type == "sale" else None,
            unit_price=unit_price,
            total_amount=self._calculate_total(quantity, unit_price),
            transaction_type=transaction_type,
            notes=text
        )

    @staticmethod
    def _count_keywords(text: str, keywords) -> int:
        """Return how many of the keyword stems occur in text as whole words."""
        return sum(
            1
            for keyword in keywords
            if re.search(rf"\b(?:{keyword})\b", text, re.IGNORECASE)
        )

    def _detect_transaction_type(self, text: str) -> str:
        """Detect if this is a purchase or sale.

        Each keyword family present in the text scores one point for its
        side. Ties (including "no keyword at all") resolve to "purchase".

        Returns:
            Either ``"purchase"`` or ``"sale"``.
        """
        purchase_score = self._count_keywords(text, self._PURCHASE_KEYWORDS)
        sale_score = self._count_keywords(text, self._SALE_KEYWORDS)
        return "purchase" if purchase_score >= sale_score else "sale"

    def _first_entity(self, text: str, labels, min_length: int = 0) -> Optional[str]:
        """Return the first spaCy entity whose label is in labels, if any.

        Returns ``None`` when no spaCy pipeline is loaded or when no entity
        with a matching label and at least min_length characters is found.
        """
        if not self.nlp:
            return None
        for ent in self.nlp(text).ents:
            if ent.label_ in labels and len(ent.text) >= min_length:
                return ent.text
        return None

    def _extract_product(self, text: str) -> Optional[str]:
        """Extract product name from text.

        The regex patterns are tried in order and the first non-empty
        capture that is not a stop word wins. If none matches, the first
        spaCy PRODUCT/ORG entity longer than two characters is used.
        """
        attempts = [(pattern, re.IGNORECASE) for pattern in self._PRODUCT_PATTERNS]
        # The capitalized-name pattern only makes sense case-sensitively.
        attempts.append((self._CAPITALIZED_PRODUCT_PATTERN, 0))

        for pattern, flags in attempts:
            match = re.search(pattern, text, flags)
            if match is None:
                continue
            product = match.group(1).strip()
            if product and product.lower() not in self._NON_PRODUCT_WORDS:
                return product

        return self._first_entity(text, ("PRODUCT", "ORG"), min_length=3)

    def _extract_quantity(self, text: str) -> Optional[int]:
        """Extract quantity from text.

        Returns:
            The first number found by the quantity patterns, rounded to the
            nearest integer with the built-in ``round``, or ``None``.
        """
        for pattern in self._QUANTITY_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # The capture is always a plain decimal literal, so float()
                # cannot fail here.
                return int(round(float(match.group(1))))
        return None

    def _extract_unit(self, text: str) -> Optional[str]:
        """Extract unit from text (tons, kg, pieces, etc.).

        Returns:
            The normalized unit name (e.g. "kilograms" -> "kg",
            "box" -> "boxes"), or ``None`` when no "<number> <unit>" pair
            is present.
        """
        match = re.search(self._UNIT_PATTERN, text, re.IGNORECASE)
        if match is None:
            return None
        unit = match.group(1).lower()
        return self._UNIT_ALIASES.get(unit, unit)

    def _extract_party(self, text: str, markers, entity_label: str) -> Optional[str]:
        """Return the name that follows one of the marker words.

        The marker must be a whole word ("to" must not match the tail of
        "potato"). The name runs up to " at", " for", a currency symbol or
        the end of the text. Falls back to the first spaCy entity labelled
        entity_label.
        """
        for marker in markers:
            pattern = rf"\b{marker}\s+([A-Za-z\s]+?)(?:\s+at|\s+for|\s*€|\s*\$|$)"
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                name = match.group(1).strip()
                if name:
                    return name
        return self._first_entity(text, (entity_label,))

    def _extract_supplier(self, text: str) -> Optional[str]:
        """Extract supplier name from text ("from <name>" / "supplier <name>")."""
        return self._extract_party(text, ("from", "supplier"), "ORG")

    def _extract_customer(self, text: str) -> Optional[str]:
        """Extract customer name from text ("to <name>" / "customer <name>")."""
        return self._extract_party(text, ("to", "customer"), "PERSON")

    def _extract_unit_price(self, text: str) -> Optional[float]:
        """Extract unit price from text.

        Recognizes a number introduced by "at"/"for" and/or a currency
        symbol ("at $5", "for €2.50", "$5"), a number followed by a
        currency symbol ("5€"), and "price/cost [of] <number>".

        Returns:
            The price as a float, or ``None`` when no price was found.
        """
        for pattern in self._PRICE_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return float(match.group(1))
        return None

    def _calculate_total(self, quantity: Optional[int], unit_price: Optional[float]) -> Optional[float]:
        """Calculate total amount.

        Returns:
            ``quantity * unit_price``, or ``None`` when either value is
            missing. A quantity or price of zero is a valid input and
            yields a zero total rather than ``None``.
        """
        if quantity is None or unit_price is None:
            return None
        return quantity * unit_price