Spaces:
Sleeping
Sleeping
| import re | |
| import spacy | |
| from typing import Optional, Dict, Any | |
| from datetime import datetime | |
| from dateutil import parser as date_parser | |
| from models import EntityExtraction | |
class EntityExtractor:
    """Heuristic extractor for purchase/sale details in free-form text.

    Regular expressions do most of the work. A spaCy pipeline (loaded in
    ``__init__`` when the ``en_core_web_sm`` model is installed) is used only
    as a fallback for product, supplier and customer names.
    """

    # Unit words accepted right after a number. ``box(?:es)?`` is used instead
    # of ``boxes?`` because the latter never matches the singular "box".
    _UNIT_WORDS = (
        r"tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?"
        r"|box(?:es)?|liters?|gallons?"
    )

    # Canonical spelling for every unit word the regex above can capture.
    _UNIT_ALIASES = {
        'ton': 'tons', 'tons': 'tons',
        'kg': 'kg', 'kilogram': 'kg', 'kilograms': 'kg',
        'pound': 'lbs', 'pounds': 'lbs', 'lb': 'lbs', 'lbs': 'lbs',
        'piece': 'pieces', 'pieces': 'pieces',
        'unit': 'units', 'units': 'units',
        'item': 'items', 'items': 'items',
        'box': 'boxes', 'boxes': 'boxes',
        'liter': 'liters', 'liters': 'liters',
        'gallon': 'gallons', 'gallons': 'gallons',
    }

    # Cue patterns, matched against lower-cased text. A leading ``\b`` keeps
    # cues from firing inside unrelated words; the short prepositions are
    # matched as whole words so "to" no longer fires on "tons"/"tomatoes".
    _PURCHASE_CUES = (
        r"\bpurchase", r"\bbuy", r"\bbought", r"\border",
        r"\bfrom\b", r"\bsupplier",
    )
    _SALE_CUES = (
        r"\bsale", r"\bsell", r"\bsold", r"\bto\b",
        r"\bcustomer", r"\bclient",
    )

    # Candidates that are never accepted as a product name.
    _NON_PRODUCT_WORDS = frozenset({
        'from', 'at', 'for', 'to', 'we', 'i', 'you', 'the', 'a', 'an',
        'and', 'or',
        'bought', 'purchased', 'buy', 'purchase', 'sold', 'sale', 'sell',
    })

    # In every terminator below, the ``\b`` after "from"/"to"/"at"/"for"
    # prevents a name from being cut short at words such as "Forest".
    _PRODUCT_PATTERNS = (
        # "<n> <unit> of <product>", e.g. "20 tons of Apples"
        re.compile(
            r"\d+\s*(?:" + _UNIT_WORDS + r")\s+of\s+([a-zA-Z\s]+?)"
            r"(?:\s+from\b|\s+at\b|\s+for\b|\s*€|\s*\$|$)",
            re.IGNORECASE,
        ),
        # "<verb> [<n> [<unit>]] [of] <product>"
        re.compile(
            r"(?:bought|purchased|buy|purchase|sold|sale|sell)\s+"
            r"(?:\d+\s*(?:" + _UNIT_WORDS + r")?\s+)?(?:of\s+)?([a-zA-Z\s]+?)"
            r"(?:\s+from\b|\s+to\b|\s+at\b|\s+for\b|\s*€|\s*\$)",
            re.IGNORECASE,
        ),
        # "<n> [x] <product>"
        re.compile(
            r"\d+\s*(?:x\s+)?([a-zA-Z\s]+?)"
            r"(?:\s+from\b|\s+at\b|\s+for\b|\s*€|\s*\$)",
            re.IGNORECASE,
        ),
    )

    # Run of Capitalized Words. Deliberately case-sensitive: with IGNORECASE
    # "[A-Z]" matches any letter and the pattern swallows whole sentences.
    _CAPITALIZED_PRODUCT = re.compile(
        r"\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*)\b"
        r"(?!\s+(?:from|at|for|€|\$))"
    )

    _QUANTITY_PATTERNS = (
        # Number with an explicit unit
        re.compile(r"(\d+(?:\.\d+)?)\s*(?:" + _UNIT_WORDS + r")", re.IGNORECASE),
        # Number followed by "of" or "x"
        re.compile(r"(\d+(?:\.\d+)?)\s*(?:of|x)\s+", re.IGNORECASE),
        # Number right after a transaction verb
        re.compile(
            r"(?:bought|purchased|buy|purchase|sold|sale|sell)\s+(?:of\s+)?"
            r"(\d+(?:\.\d+)?)",
            re.IGNORECASE,
        ),
        # Number at the very start of the text
        re.compile(r"^(\d+(?:\.\d+)?)\s+"),
    )

    _UNIT_PATTERN = re.compile(
        r"\d+(?:\.\d+)?\s*(" + _UNIT_WORDS + r")", re.IGNORECASE
    )

    _SUPPLIER_PATTERNS = (
        re.compile(
            r"\bfrom\s+([A-Za-z\s]+?)(?:\s+at\b|\s+for\b|\s*€|\s*\$|$)",
            re.IGNORECASE,
        ),
        re.compile(
            r"\bsupplier\s+([A-Za-z\s]+?)(?:\s+at\b|\s+for\b|\s*€|\s*\$|$)",
            re.IGNORECASE,
        ),
    )

    _CUSTOMER_PATTERNS = (
        re.compile(
            r"\bto\s+([A-Za-z\s]+?)(?:\s+at\b|\s+for\b|\s*€|\s*\$|$)",
            re.IGNORECASE,
        ),
        re.compile(
            r"\bcustomer\s+([A-Za-z\s]+?)(?:\s+at\b|\s+for\b|\s*€|\s*\$|$)",
            re.IGNORECASE,
        ),
    )

    # NOTE: "$" is escaped everywhere below. Unescaped it is the
    # end-of-string anchor, which made dollar prices undetectable.
    _PRICE_PATTERNS = (
        # "at 5", "for 5", "€5", "$5"
        re.compile(
            r"(?:\bat\b|\bfor\b|€|\$)\s*(\d+(?:\.\d+)?)\s*(?:each|per|unit)?",
            re.IGNORECASE,
        ),
        # "5 € each", "5$ per"
        re.compile(
            r"(\d+(?:\.\d+)?)\s*(?:€|\$)\s*(?:each|per|unit)", re.IGNORECASE
        ),
        # "5€", "price of 5 $"
        re.compile(
            r"(?:price|cost)?\s*(?:of)?\s*(\d+(?:\.\d+)?)\s*(?:€|\$)",
            re.IGNORECASE,
        ),
        # "price 5", "cost of 5" (no currency symbol)
        re.compile(
            r"\b(?:price|cost)\s*(?:of|:)?\s*(\d+(?:\.\d+)?)", re.IGNORECASE
        ),
    )

    def __init__(self):
        """Load the spaCy English model; fall back to regex-only mode."""
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

    def extract_entities(self, text: str) -> "EntityExtraction":
        """Extract all transaction entities from user input text.

        Args:
            text: Raw user message describing a purchase or a sale.

        Returns:
            An ``EntityExtraction`` populated with whatever could be found.
            ``supplier`` is only looked up for purchases and ``customer``
            only for sales. The original text is stored in ``notes``.
        """
        transaction_type = self._detect_transaction_type(text.lower())

        quantity = self._extract_quantity(text)
        unit_price = self._extract_unit_price(text)

        supplier = None
        customer = None
        if transaction_type == "purchase":
            supplier = self._extract_supplier(text)
        elif transaction_type == "sale":
            customer = self._extract_customer(text)

        return EntityExtraction(
            product=self._extract_product(text),
            quantity=quantity,
            unit=self._extract_unit(text),
            supplier=supplier,
            customer=customer,
            unit_price=unit_price,
            total_amount=self._calculate_total(quantity, unit_price),
            transaction_type=transaction_type,
            notes=text,
        )

    def _detect_transaction_type(self, text: str) -> str:
        """Classify the text as ``"purchase"`` or ``"sale"``.

        Each cue counts at most once. The side with more distinct cues wins,
        and a tie (including no cues at all) is treated as a purchase.
        """
        lowered = text.lower()
        purchase_score = sum(
            1 for cue in self._PURCHASE_CUES if re.search(cue, lowered)
        )
        sale_score = sum(
            1 for cue in self._SALE_CUES if re.search(cue, lowered)
        )
        return "purchase" if purchase_score >= sale_score else "sale"

    def _extract_product(self, text: str) -> Optional[str]:
        """Extract the product name from text.

        Tries the structured patterns first, then the first run of
        capitalized words that is not a known non-product word, and finally
        a spaCy ``PRODUCT``/``ORG`` entity. The capitalized-word fallback is
        a heuristic and can still pick up a sentence-initial word.

        Returns:
            The product name, or ``None`` when nothing plausible is found.
        """
        for pattern in self._PRODUCT_PATTERNS:
            match = pattern.search(text)
            if match:
                candidate = match.group(1).strip()
                if candidate and candidate.lower() not in self._NON_PRODUCT_WORDS:
                    return candidate

        for match in self._CAPITALIZED_PRODUCT.finditer(text):
            candidate = match.group(1).strip()
            if candidate and candidate.lower() not in self._NON_PRODUCT_WORDS:
                return candidate

        # Same length filter as before: ignore entities of 1-2 characters.
        return self._first_entity(text, ("PRODUCT", "ORG"), min_length=2)

    def _extract_quantity(self, text: str) -> Optional[int]:
        """Extract the quantity from text as an integer.

        Decimal quantities are rounded with the built-in ``round`` because
        the declared return type is ``int``.

        Returns:
            The quantity, or ``None`` when no number in a quantity-like
            position is found.
        """
        captured = self._first_group(self._QUANTITY_PATTERNS, text)
        if captured is None:
            return None
        return int(round(float(captured)))

    def _extract_unit(self, text: str) -> Optional[str]:
        """Extract and normalize the unit (tons, kg, pieces, ...) from text.

        Returns:
            The canonical unit name, or ``None`` when no number is followed
            by a recognised unit word.
        """
        match = self._UNIT_PATTERN.search(text)
        if match is None:
            return None
        unit = match.group(1).lower()
        return self._UNIT_ALIASES.get(unit, unit)

    def _extract_supplier(self, text: str) -> Optional[str]:
        """Extract the supplier name from text.

        Looks for "from <name>" / "supplier <name>", then falls back to the
        first spaCy ``ORG`` entity.
        """
        captured = self._first_group(self._SUPPLIER_PATTERNS, text)
        if captured is not None:
            return captured.strip()
        return self._first_entity(text, ("ORG",))

    def _extract_customer(self, text: str) -> Optional[str]:
        """Extract the customer name from text.

        Looks for "to <name>" / "customer <name>", then falls back to the
        first spaCy ``PERSON`` entity.
        """
        captured = self._first_group(self._CUSTOMER_PATTERNS, text)
        if captured is not None:
            return captured.strip()
        return self._first_entity(text, ("PERSON",))

    def _extract_unit_price(self, text: str) -> Optional[float]:
        """Extract the unit price from text.

        Recognises a number introduced by "at"/"for"/"€"/"$", a number
        followed by a currency symbol, or a number after "price"/"cost".

        Returns:
            The price as a float, or ``None`` when none is found.
        """
        captured = self._first_group(self._PRICE_PATTERNS, text)
        return float(captured) if captured is not None else None

    def _calculate_total(self, quantity: Optional[int], unit_price: Optional[float]) -> Optional[float]:
        """Return ``quantity * unit_price``, or ``None`` if either is missing.

        Zero is a valid value for both arguments and yields a zero total.
        """
        if quantity is None or unit_price is None:
            return None
        return quantity * unit_price

    def _first_group(self, patterns, text: str) -> Optional[str]:
        """Return group 1 of the first pattern that matches, else ``None``."""
        for pattern in patterns:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    def _first_entity(self, text: str, labels, min_length: int = 0) -> Optional[str]:
        """Return the first spaCy entity with one of ``labels``.

        Only entities whose text is longer than ``min_length`` characters
        qualify. Returns ``None`` when spaCy is unavailable or nothing fits.
        """
        if not self.nlp:
            return None
        for ent in self.nlp(text).ents:
            if ent.label_ in labels and len(ent.text) > min_length:
                return ent.text
        return None