# Hugging Face Space: Ancastal/Business_Chatbot (status: Sleeping)
# File size: 8,400 bytes
# Revision: 401b16c

import re
import spacy
from typing import Optional, Dict, Any
from datetime import datetime
from dateutil import parser as date_parser
from models import EntityExtraction

class EntityExtractor:
    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None
    
    def extract_entities(self, text: str) -> EntityExtraction:
        """Extract entities from user input text"""
        text_lower = text.lower()
        
        # Determine transaction type
        transaction_type = self._detect_transaction_type(text_lower)
        
        # Extract entities
        product = self._extract_product(text)
        quantity = self._extract_quantity(text)
        unit = self._extract_unit(text)
        supplier = self._extract_supplier(text) if transaction_type == "purchase" else None
        customer = self._extract_customer(text) if transaction_type == "sale" else None
        unit_price = self._extract_unit_price(text)
        total_amount = self._calculate_total(quantity, unit_price)
        
        return EntityExtraction(
            product=product,
            quantity=quantity,
            unit=unit,
            supplier=supplier,
            customer=customer,
            unit_price=unit_price,
            total_amount=total_amount,
            transaction_type=transaction_type,
            notes=text
        )
    
    def _detect_transaction_type(self, text: str) -> str:
        """Detect if this is a purchase or sale"""
        purchase_keywords = ["purchase", "buy", "bought", "order", "from", "supplier"]
        sale_keywords = ["sale", "sell", "sold", "to", "customer", "client"]
        
        purchase_score = sum(1 for keyword in purchase_keywords if keyword in text)
        sale_score = sum(1 for keyword in sale_keywords if keyword in text)
        
        return "purchase" if purchase_score >= sale_score else "sale"
    
    def _extract_product(self, text: str) -> Optional[str]:
        """Extract product name from text"""
        # Enhanced product patterns to handle various formats
        product_patterns = [
            # Pattern for "X units of Y" format (e.g., "20 tons of Apples")
            r"(?:\d+)\s*(?:tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?|boxes?)\s+of\s+([a-zA-Z\s]+?)(?:\s+from|\s+at|\s+for|\s*€|\s*\$|$)",
            
            # Pattern for "bought/purchased X Y" format
            r"(?:bought|purchased|buy|purchase|sold|sale|sell)\s+(?:\d+\s*(?:tons?|kg|pieces?|units?)?\s+)?(?:of\s+)?([a-zA-Z\s]+?)(?:\s+from|\s+to|\s+at|\s+for|\s*€|\s*\$)",
            
            # Pattern for quantity followed by product
            r"(?:\d+)\s*(?:x\s+)?([a-zA-Z\s]+?)(?:\s+from|\s+at|\s+for|\s*€|\s*\$)",
            
            # Pattern for standalone capitalized product names
            r"\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*)\b(?!\s+(?:from|at|for|€|\$))",
        ]
        
        for pattern in product_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                product = match.group(1).strip()
                # Filter out common non-product words
                if product.lower() not in ['from', 'at', 'for', 'to', 'we', 'i', 'you', 'the', 'a', 'an', 'and', 'or']:
                    return product
        
        # Use spaCy for named entity recognition if available
        if self.nlp:
            doc = self.nlp(text)
            for ent in doc.ents:
                if ent.label_ in ["PRODUCT", "ORG"] and len(ent.text) > 2:
                    return ent.text
        
        return None
    
    def _extract_quantity(self, text: str) -> Optional[int]:
        """Extract quantity from text"""
        # Enhanced quantity patterns to handle various units
        quantity_patterns = [
            # Numbers with explicit units
            r"(\d+(?:\.\d+)?)\s*(?:tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?|boxes?)",
            # Numbers followed by "of" or "x"
            r"(\d+(?:\.\d+)?)\s*(?:of|x)\s+",
            # Numbers in transaction context
            r"(?:bought|purchased|buy|purchase|sold|sale|sell)\s+(?:of\s+)?(\d+(?:\.\d+)?)",
            # Standalone numbers at start
            r"^(\d+(?:\.\d+)?)\s+",
        ]
        
        for pattern in quantity_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    # Convert to int, handling decimal quantities
                    quantity = float(match.group(1))
                    return int(quantity) if quantity.is_integer() else int(round(quantity))
                except (ValueError, AttributeError):
                    continue
        
        return None
    
    def _extract_unit(self, text: str) -> Optional[str]:
        """Extract unit from text (tons, kg, pieces, etc.)"""
        # Common unit patterns
        unit_patterns = [
            r"\d+(?:\.\d+)?\s*(tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?|boxes?|liters?|gallons?)",
        ]
        
        for pattern in unit_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                unit = match.group(1).lower()
                # Normalize units
                unit_mapping = {
                    'ton': 'tons', 'kg': 'kg', 'kilogram': 'kg', 'kilograms': 'kg',
                    'pound': 'lbs', 'pounds': 'lbs', 'lb': 'lbs', 'lbs': 'lbs',
                    'piece': 'pieces', 'pieces': 'pieces',
                    'unit': 'units', 'units': 'units',
                    'item': 'items', 'items': 'items',
                    'box': 'boxes', 'boxes': 'boxes',
                    'liter': 'liters', 'liters': 'liters',
                    'gallon': 'gallons', 'gallons': 'gallons'
                }
                return unit_mapping.get(unit, unit)
        
        return None
    
    def _extract_supplier(self, text: str) -> Optional[str]:
        """Extract supplier name from text"""
        # Look for "from [supplier]" patterns
        supplier_patterns = [
            r"from\s+([A-Za-z\s]+?)(?:\s+at|\s+for|\s*€|\s*\$|$)",
            r"supplier\s+([A-Za-z\s]+?)(?:\s+at|\s+for|\s*€|\s*\$|$)",
        ]
        
        for pattern in supplier_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        
        # Use spaCy for organization detection
        if self.nlp:
            doc = self.nlp(text)
            for ent in doc.ents:
                if ent.label_ == "ORG":
                    return ent.text
        
        return None
    
    def _extract_customer(self, text: str) -> Optional[str]:
        """Extract customer name from text"""
        # Look for "to [customer]" patterns
        customer_patterns = [
            r"to\s+([A-Za-z\s]+?)(?:\s+at|\s+for|\s*€|\s*\$|$)",
            r"customer\s+([A-Za-z\s]+?)(?:\s+at|\s+for|\s*€|\s*\$|$)",
        ]
        
        for pattern in customer_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        
        # Use spaCy for person detection
        if self.nlp:
            doc = self.nlp(text)
            for ent in doc.ents:
                if ent.label_ == "PERSON":
                    return ent.text
        
        return None
    
    def _extract_unit_price(self, text: str) -> Optional[float]:
        """Extract unit price from text"""
        # Look for price patterns
        price_patterns = [
            r"(?:at|for|€|$)\s*(\d+(?:\.\d{2})?)\s*(?:each|per|unit)?",
            r"(\d+(?:\.\d{2})?)\s*(?:€|$)\s*(?:each|per|unit)",
            r"(?:price|cost)?\s*(?:of)?\s*(\d+(?:\.\d{2})?)\s*(?:€|$)",
        ]
        
        for pattern in price_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return float(match.group(1))
        
        return None
    
    def _calculate_total(self, quantity: Optional[int], unit_price: Optional[float]) -> Optional[float]:
        """Calculate total amount"""
        if quantity and unit_price:
            return quantity * unit_price
        return None