File size: 8,400 Bytes
401b16c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import re
import spacy
from typing import Optional, Dict, Any
from datetime import datetime
from dateutil import parser as date_parser
from models import EntityExtraction

class EntityExtractor:
    """Rule-based extractor for purchase/sale details in free-form text.

    Regular expressions do most of the work. When a spaCy pipeline could be
    loaded it is used as a fallback for product, supplier and customer names.
    ``self.nlp`` is ``None`` when the model is unavailable, in which case
    only the regex rules are applied.
    """

    # --- shared regex fragments -------------------------------------------
    # Integer or decimal number.
    _NUMBER = r"\d+(?:\.\d+)?"
    # Every unit word the extractor understands. One shared list keeps the
    # product, quantity and unit rules in agreement.
    _UNITS = (
        r"(?:tons?|kgs?|kilograms?|pounds?|lbs?|pieces?|units?|items?"
        r"|box(?:es)?|liters?|gallons?)"
    )
    # Verbs that introduce a transaction.
    _VERBS = r"\b(?:bought|purchased|buy|purchase|sold|sale|sell)"
    # Look-aheads marking where a free-text product / party name stops.
    _PRODUCT_END = r"(?=\s+(?:from|to|at|for)\b|\s*[€$]|\s*$)"
    _PARTY_END = r"(?=\s+(?:at|for)\b|\s*[€$]|\s*$)"

    # --- transaction type cues (whole words; one point per word family) ----
    _PURCHASE_CUES = (
        r"\bpurchas(?:e|es|ed|ing)\b",
        r"\bbuy(?:s|ing)?\b",
        r"\bbought\b",
        r"\border(?:s|ed|ing)?\b",
        r"\bfrom\b",
        r"\bsuppliers?\b",
    )
    _SALE_CUES = (
        r"\bsales?\b",
        r"\bsell(?:s|ing)?\b",
        r"\bsold\b",
        r"\bto\b",
        r"\bcustomers?\b",
        r"\bclients?\b",
    )

    # --- product ----------------------------------------------------------
    _PRODUCT_PATTERNS = (
        # "<n> <unit> of <product>", e.g. "20 tons of Apples"
        re.compile(
            r"\d+\s*" + _UNITS + r"\s+of\s+([a-zA-Z\s]+?)" + _PRODUCT_END,
            re.IGNORECASE,
        ),
        # "<verb> [<n> [<unit>]] [of] <product>", e.g. "bought 20 apples"
        re.compile(
            _VERBS + r"\s+(?:" + _NUMBER + r"\s*" + _UNITS + r"?\s+)?(?:of\s+)?"
            r"([a-zA-Z\s]+?)" + _PRODUCT_END,
            re.IGNORECASE,
        ),
        # "<n> [x] [<unit>] <product>", e.g. "20 x apples", "20kg apples"
        re.compile(
            r"\d+\s*(?:x\s+)?(?:" + _UNITS + r"\s+)?([a-zA-Z\s]+?)" + _PRODUCT_END,
            re.IGNORECASE,
        ),
        # Last-resort "standalone capitalized name" rule.
        # NOTE(review): IGNORECASE makes [A-Z] match lowercase letters too, so
        # this effectively returns the first run of alphabetic words. Kept
        # as-is because the intended behaviour is unclear - confirm.
        re.compile(
            r"\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*)\b(?!\s+(?:from|at|for|€|\$))",
            re.IGNORECASE,
        ),
    )
    # Candidates that are never a product on their own.
    _NON_PRODUCT_WORDS = frozenset(
        ['from', 'at', 'for', 'to', 'we', 'i', 'you', 'the', 'a', 'an', 'and', 'or']
    )

    # --- quantity / unit --------------------------------------------------
    _QUANTITY_PATTERNS = (
        # number followed by a unit word
        re.compile("(" + _NUMBER + r")\s*" + _UNITS + r"\b", re.IGNORECASE),
        # number followed by "of" or "x"
        re.compile("(" + _NUMBER + r")\s*(?:of|x)\s+", re.IGNORECASE),
        # number right after a transaction verb
        re.compile(_VERBS + r"\s+(?:of\s+)?(" + _NUMBER + ")", re.IGNORECASE),
        # number at the very start of the text
        re.compile(r"^\s*(" + _NUMBER + r")\s+"),
    )
    _UNIT_PATTERN = re.compile(
        _NUMBER + r"\s*(" + _UNITS + r")\b", re.IGNORECASE
    )
    # Spelling variants -> canonical unit name.
    _UNIT_ALIASES = {
        'ton': 'tons', 'tons': 'tons',
        'kg': 'kg', 'kgs': 'kg', 'kilogram': 'kg', 'kilograms': 'kg',
        'pound': 'lbs', 'pounds': 'lbs', 'lb': 'lbs', 'lbs': 'lbs',
        'piece': 'pieces', 'pieces': 'pieces',
        'unit': 'units', 'units': 'units',
        'item': 'items', 'items': 'items',
        'box': 'boxes', 'boxes': 'boxes',
        'liter': 'liters', 'liters': 'liters',
        'gallon': 'gallons', 'gallons': 'gallons',
    }

    # --- supplier / customer ----------------------------------------------
    _SUPPLIER_PATTERNS = (
        re.compile(r"\bfrom\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
        re.compile(r"\bsupplier\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
    )
    _CUSTOMER_PATTERNS = (
        re.compile(r"\bto\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
        re.compile(r"\bcustomer\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
    )

    # --- price ------------------------------------------------------------
    _PRICE_PATTERNS = (
        # "at 5.50", "for $5.50", "€5", "$ 5"
        re.compile(
            r"(?:\b(?:at|for)\s*[€$]?|[€$])\s*(" + _NUMBER + ")", re.IGNORECASE
        ),
        # "price of 5", "cost 5.50"
        re.compile(
            r"\b(?:price|cost)\s*(?:of\s*)?[€$]?\s*(" + _NUMBER + ")",
            re.IGNORECASE,
        ),
        # "5.50€", "5 $"
        re.compile("(" + _NUMBER + r")\s*[€$]"),
    )

    def __init__(self, model_name: str = "en_core_web_sm") -> None:
        """Load the spaCy pipeline used for the named-entity fallbacks.

        Args:
            model_name: spaCy model to load. If loading raises ``OSError`` a
                warning is printed and ``self.nlp`` is set to ``None``.
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            print(f"Warning: spaCy model not found. Install with: python -m spacy download {model_name}")
            self.nlp = None

    def extract_entities(self, text: str) -> "EntityExtraction":
        """Extract all transaction entities from user input text.

        The supplier is only looked up for purchases and the customer only
        for sales; the other field is ``None``. The original ``text`` is
        stored in ``notes``.

        Args:
            text: Free-form description of a transaction.

        Returns:
            An ``EntityExtraction`` populated with whatever could be found.
        """
        transaction_type = self._detect_transaction_type(text)

        quantity = self._extract_quantity(text)
        unit_price = self._extract_unit_price(text)
        supplier = self._extract_supplier(text) if transaction_type == "purchase" else None
        customer = self._extract_customer(text) if transaction_type == "sale" else None

        return EntityExtraction(
            product=self._extract_product(text),
            quantity=quantity,
            unit=self._extract_unit(text),
            supplier=supplier,
            customer=customer,
            unit_price=unit_price,
            total_amount=self._calculate_total(quantity, unit_price),
            transaction_type=transaction_type,
            notes=text,
        )

    def _detect_transaction_type(self, text: str) -> str:
        """Classify the text as ``"purchase"`` or ``"sale"``.

        Each cue family found as a whole word scores one point for its side.
        Matching whole words (case-insensitively) keeps "to" from firing
        inside "tons" or "sale" inside "wholesale". Ties, including no cue
        at all, resolve to ``"purchase"``.
        """
        purchase_score = sum(
            1 for cue in self._PURCHASE_CUES if re.search(cue, text, re.IGNORECASE)
        )
        sale_score = sum(
            1 for cue in self._SALE_CUES if re.search(cue, text, re.IGNORECASE)
        )
        return "purchase" if purchase_score >= sale_score else "sale"

    def _named_entities(self, text: str):
        """Return spaCy's entities for *text*, or ``()`` without a pipeline."""
        if not self.nlp:
            return ()
        return self.nlp(text).ents

    def _extract_product(self, text: str) -> Optional[str]:
        """Extract the product name from text.

        The regex rules are tried in order; the first candidate that is not
        a bare function word wins. If none matches, the first spaCy entity
        labelled PRODUCT or ORG with more than two characters is returned.

        Returns:
            The product name, or ``None`` if nothing was recognised.
        """
        for pattern in self._PRODUCT_PATTERNS:
            match = pattern.search(text)
            if not match:
                continue
            product = match.group(1).strip()
            if product and product.lower() not in self._NON_PRODUCT_WORDS:
                return product

        for ent in self._named_entities(text):
            if ent.label_ in ("PRODUCT", "ORG") and len(ent.text) > 2:
                return ent.text

        return None

    def _extract_quantity(self, text: str) -> Optional[int]:
        """Extract the quantity from text as an integer.

        Decimal quantities are rounded with the built-in ``round`` because
        the return type is an integer.

        Returns:
            The quantity, or ``None`` if no number in a quantity position
            was found.
        """
        for pattern in self._QUANTITY_PATTERNS:
            match = pattern.search(text)
            if match:
                return int(round(float(match.group(1))))
        return None

    def _extract_unit(self, text: str) -> Optional[str]:
        """Extract the unit following a number and normalise its spelling.

        Returns:
            The canonical unit name (e.g. ``"kg"``, ``"boxes"``), or ``None``
            if no number is followed by a known unit.
        """
        match = self._UNIT_PATTERN.search(text)
        if not match:
            return None
        unit = match.group(1).lower()
        return self._UNIT_ALIASES.get(unit, unit)

    def _match_party(self, text: str, patterns, entity_label: str) -> Optional[str]:
        """Return the first name matched by *patterns*, else a spaCy entity.

        Args:
            text: Text to search.
            patterns: Compiled patterns whose group 1 captures the name,
                tried in order.
            entity_label: spaCy entity label used for the fallback lookup.
        """
        for pattern in patterns:
            match = pattern.search(text)
            if match:
                name = match.group(1).strip()
                if name:
                    return name

        for ent in self._named_entities(text):
            if ent.label_ == entity_label:
                return ent.text

        return None

    def _extract_supplier(self, text: str) -> Optional[str]:
        """Extract the supplier name ("from X" / "supplier X").

        Falls back to the first spaCy ORG entity when no pattern matches.
        """
        return self._match_party(text, self._SUPPLIER_PATTERNS, "ORG")

    def _extract_customer(self, text: str) -> Optional[str]:
        """Extract the customer name ("to X" / "customer X").

        Falls back to the first spaCy PERSON entity when no pattern matches.
        """
        return self._match_party(text, self._CUSTOMER_PATTERNS, "PERSON")

    def _extract_unit_price(self, text: str) -> Optional[float]:
        """Extract the unit price from text.

        Recognises an amount after "at"/"for" (optionally with a currency
        sign), after a bare "€"/"$", after "price"/"cost", or directly
        followed by "€"/"$". The dollar sign is matched literally.

        Returns:
            The price as a float, or ``None`` if no price was recognised.
        """
        for pattern in self._PRICE_PATTERNS:
            match = pattern.search(text)
            if match:
                return float(match.group(1))
        return None

    def _calculate_total(self, quantity: Optional[int], unit_price: Optional[float]) -> Optional[float]:
        """Return ``quantity * unit_price``, or ``None`` if either is missing.

        Only ``None`` counts as missing, so a quantity or price of zero
        yields a total of zero.
        """
        if quantity is None or unit_price is None:
            return None
        return quantity * unit_price