import re
from datetime import datetime
from typing import Any, Dict, Optional

import spacy
from dateutil import parser as date_parser

from models import EntityExtraction

# Regex building blocks shared by the extractors, so that the product,
# quantity and unit patterns agree on what a number or a unit looks like.
_NUMBER = r"\d+(?:\.\d+)?"
_UNIT = (
    r"(?:tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?"
    r"|box(?:es)?|liters?|gallons?)"  # "box(?:es)?" so the singular "box" matches too
)
_VERB = r"(?:bought|purchased|buy|purchase|sold|sale|sell)"
# Words/symbols that end a product name.  The trailing \b stops " to" from
# firing on " tomatoes" or " at" on " atlantic".
_PRODUCT_END = r"(?:\s+(?:from|to|at|for)\b|\s*[€$])"
# Words/symbols (or end of text) that end a supplier/customer name.
_PARTY_END = r"(?:\s+(?:at|for)\b|\s*[€$]|$)"

# Each entry counts at most once towards the purchase/sale score.  Content
# words match as word prefixes ("purchased", "selling"); the prepositions
# must be whole words so "to" does not fire on "tons" or "customer".
_PURCHASE_SIGNALS = tuple(
    re.compile(p, re.IGNORECASE)
    for p in (r"\bpurchas", r"\bbuy", r"\bbought\b", r"\border", r"\bfrom\b", r"\bsupplier")
)
_SALE_SIGNALS = tuple(
    re.compile(p, re.IGNORECASE)
    for p in (r"\bsale", r"\bsell", r"\bsold\b", r"\bto\b", r"\bcustomer", r"\bclient")
)

_PRODUCT_PATTERNS = (
    # "<n> <unit> of <product>", e.g. "20 tons of Apples"
    re.compile(
        r"\d+\s*" + _UNIT + r"\s+of\s+([a-zA-Z\s]+?)(?:" + _PRODUCT_END + r"|$)",
        re.IGNORECASE,
    ),
    # "<verb> [<n> [<unit>]] [of] <product> <terminator>"
    re.compile(
        _VERB + r"\s+(?:\d+\s*" + _UNIT + r"?\s+)?(?:of\s+)?([a-zA-Z\s]+?)" + _PRODUCT_END,
        re.IGNORECASE,
    ),
    # "<n> [x] <product> <terminator>"
    re.compile(r"\d+\s*(?:x\s+)?([a-zA-Z\s]+?)" + _PRODUCT_END, re.IGNORECASE),
    # Run of capitalized words.  Deliberately case-sensitive: with IGNORECASE
    # "[A-Z]" would match any lowercase word as well.
    re.compile(r"\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*)\b(?!\s+(?:from|at|for|€|\$))"),
)

# Candidates that are never a product on their own.
_NON_PRODUCT_WORDS = frozenset({
    "from", "at", "for", "to", "we", "i", "you", "the", "a", "an", "and", "or",
    "bought", "purchased", "buy", "purchase", "sold", "sale", "sell",
})

_QUANTITY_PATTERNS = (
    # Number with an explicit unit: "20 tons", "5kg"
    re.compile("(" + _NUMBER + r")\s*" + _UNIT, re.IGNORECASE),
    # Number followed by "of" or "x": "3 x widgets"
    re.compile("(" + _NUMBER + r")\s*(?:of|x)\s+", re.IGNORECASE),
    # Number right after a transaction verb: "bought 20"
    re.compile(_VERB + r"\s+(?:of\s+)?(" + _NUMBER + ")", re.IGNORECASE),
    # Number at the very start of the text
    re.compile("^(" + _NUMBER + r")\s+"),
)

_UNIT_PATTERN = re.compile(_NUMBER + r"\s*(" + _UNIT + ")", re.IGNORECASE)

# Canonical spelling for every unit form the pattern above can capture.
_UNIT_ALIASES = {
    "ton": "tons", "tons": "tons",
    "kg": "kg", "kilogram": "kg", "kilograms": "kg",
    "pound": "lbs", "pounds": "lbs", "lb": "lbs", "lbs": "lbs",
    "piece": "pieces", "pieces": "pieces",
    "unit": "units", "units": "units",
    "item": "items", "items": "items",
    "box": "boxes", "boxes": "boxes",
    "liter": "liters", "liters": "liters",
    "gallon": "gallons", "gallons": "gallons",
}

_SUPPLIER_PATTERNS = (
    re.compile(r"\bfrom\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
    re.compile(r"\bsupplier\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
)
_CUSTOMER_PATTERNS = (
    re.compile(r"\bto\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
    re.compile(r"\bcustomer\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
)

# NOTE: "$" is written inside a character class ("[€$]") everywhere; a bare
# "$" in a regex is the end-of-string anchor, not a dollar sign.
_PRICE_PATTERNS = (
    # "at 5", "for 5.50", "at $5", "for €5", "$5"
    re.compile(r"(?:\b(?:at|for)\s*[€$]?|[€$])\s*(" + _NUMBER + ")", re.IGNORECASE),
    # "5€ each", "5 $ per unit"
    re.compile("(" + _NUMBER + r")\s*[€$]\s*(?:each|per|unit)", re.IGNORECASE),
    # "price 5", "cost of $5"
    re.compile(r"\b(?:price|cost)\s*(?:of\s*)?[€$]?\s*(" + _NUMBER + ")", re.IGNORECASE),
    # "5€", "5 $"
    re.compile("(" + _NUMBER + r")\s*[€$]"),
)


class EntityExtractor:
    """Pull transaction entities (product, quantity, price, ...) out of free text.

    Extraction is regex-first.  When the spaCy model could be loaded, its
    named entities are used as a fallback for product, supplier and customer.
    """

    def __init__(self):
        """Load the spaCy model; fall back to regex-only mode if it is missing."""
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

    def extract_entities(self, text: str) -> EntityExtraction:
        """Extract entities from user input text.

        The supplier is only looked for in purchases and the customer only in
        sales.  The original text is stored unchanged in ``notes``.
        """
        transaction_type = self._detect_transaction_type(text.lower())
        is_purchase = transaction_type == "purchase"
        is_sale = transaction_type == "sale"

        quantity = self._extract_quantity(text)
        unit_price = self._extract_unit_price(text)

        return EntityExtraction(
            product=self._extract_product(text),
            quantity=quantity,
            unit=self._extract_unit(text),
            supplier=self._extract_supplier(text) if is_purchase else None,
            customer=self._extract_customer(text) if is_sale else None,
            unit_price=unit_price,
            total_amount=self._calculate_total(quantity, unit_price),
            transaction_type=transaction_type,
            notes=text,
        )

    def _detect_transaction_type(self, text: str) -> str:
        """Return ``"purchase"`` or ``"sale"``, whichever has more keyword hits.

        Each keyword counts once however often it occurs.  Ties, including
        "no keyword at all", resolve to ``"purchase"``.
        """
        purchase_score = sum(1 for signal in _PURCHASE_SIGNALS if signal.search(text))
        sale_score = sum(1 for signal in _SALE_SIGNALS if signal.search(text))
        return "purchase" if purchase_score >= sale_score else "sale"

    def _first_entity(self, text: str, labels, min_length: int = 0) -> Optional[str]:
        """Return the text of the first spaCy entity carrying one of ``labels``.

        Returns ``None`` when no model is loaded or when no entity with a
        matching label and at least ``min_length`` characters is found.
        """
        if self.nlp is None:
            return None
        for ent in self.nlp(text).ents:
            if ent.label_ in labels and len(ent.text) >= min_length:
                return ent.text
        return None

    def _extract_product(self, text: str) -> Optional[str]:
        """Extract the product name, or ``None`` if nothing plausible is found.

        Patterns are tried from most to least specific.  Within a pattern the
        first match that is neither empty nor a stop word wins.
        """
        for pattern in _PRODUCT_PATTERNS:
            for match in pattern.finditer(text):
                candidate = match.group(1).strip()
                # The capture class allows whitespace, so a match can be blank.
                if candidate and candidate.lower() not in _NON_PRODUCT_WORDS:
                    return candidate
        # Entities of three or more characters only, as a last resort.
        return self._first_entity(text, ("PRODUCT", "ORG"), min_length=3)

    def _extract_quantity(self, text: str) -> Optional[int]:
        """Extract the quantity as an integer, or ``None`` if there is none.

        Decimal quantities are rounded with the built-in ``round`` because the
        return type is an integer.
        """
        for pattern in _QUANTITY_PATTERNS:
            match = pattern.search(text)
            if match:
                return int(round(float(match.group(1))))
        return None

    def _extract_unit(self, text: str) -> Optional[str]:
        """Extract the unit following a number, normalized to a canonical form.

        For example "kilograms" becomes "kg" and "box" becomes "boxes".
        Returns ``None`` when no number-plus-unit is present.
        """
        match = _UNIT_PATTERN.search(text)
        if match is None:
            return None
        unit = match.group(1).lower()
        return _UNIT_ALIASES.get(unit, unit)

    def _extract_party(self, text: str, patterns, entity_label: str) -> Optional[str]:
        """Return the first non-empty name captured by ``patterns``.

        Falls back to the first spaCy entity labelled ``entity_label``.
        """
        for pattern in patterns:
            match = pattern.search(text)
            if match:
                name = match.group(1).strip()
                if name:
                    return name
        return self._first_entity(text, (entity_label,))

    def _extract_supplier(self, text: str) -> Optional[str]:
        """Extract the supplier name ("from X" / "supplier X"), else a spaCy ORG."""
        return self._extract_party(text, _SUPPLIER_PATTERNS, "ORG")

    def _extract_customer(self, text: str) -> Optional[str]:
        """Extract the customer name ("to X" / "customer X"), else a spaCy PERSON."""
        return self._extract_party(text, _CUSTOMER_PATTERNS, "PERSON")

    def _extract_unit_price(self, text: str) -> Optional[float]:
        """Extract the unit price, or ``None`` if no price expression is found.

        Recognizes "at 5", "for 5.50", "$5", "5€ each", "price of 5" and
        "5 €".  Thousands separators are not handled.
        """
        for pattern in _PRICE_PATTERNS:
            match = pattern.search(text)
            if match:
                return float(match.group(1))
        return None

    def _calculate_total(self, quantity: Optional[int], unit_price: Optional[float]) -> Optional[float]:
        """Return ``quantity * unit_price``, or ``None`` if either is missing.

        Zero is a valid operand and yields a zero total rather than ``None``.
        """
        if quantity is None or unit_price is None:
            return None
        return quantity * unit_price