Spaces:
Sleeping
Sleeping
File size: 8,400 Bytes
import re
import spacy
from typing import Optional, Dict, Any
from datetime import datetime
from dateutil import parser as date_parser
from models import EntityExtraction
class EntityExtractor:
    """Heuristic extractor for purchase/sale details in free-form text.

    Regular expressions do most of the work. When the spaCy model
    ``en_core_web_sm`` could be loaded, its named entities are used as a
    fallback for the product, supplier and customer fields.
    """

    # Unit words that may follow a number; shared by every pattern so that
    # quantity, unit and product extraction agree on what counts as a unit.
    _UNIT_ALTERNATIVES = (
        r"tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?|boxes?"
        r"|liters?|gallons?"
    )
    _UNIT_WORDS = "(?:" + _UNIT_ALTERNATIVES + ")"
    _NUMBER = r"\d+(?:\.\d+)?"

    # What ends a product name: a whole-word preposition or a currency sign.
    # The trailing \b keeps "to" from matching the start of e.g. "total".
    _PRODUCT_END = r"(?:\s+(?:from|to|at|for)\b|\s*[€$])"
    # What ends a supplier/customer name (end of text is allowed too).
    _PARTY_END = r"(?:\s+(?:at|for)\b|\s*[€$]|$)"

    _PRODUCT_PATTERNS = (
        # "20 tons of Apples ..."
        re.compile(
            r"\d+\s*" + _UNIT_WORDS + r"\s+of\s+([a-zA-Z\s]+?)"
            + "(?:" + _PRODUCT_END + "|$)",
            re.IGNORECASE,
        ),
        # "bought [20 [tons]] [of] Apples from ..."
        re.compile(
            r"\b(?:bought|purchased|buy|purchase|sold|sale|sell)\s+"
            + "(?:" + _NUMBER + r"\s*" + _UNIT_WORDS + r"?\s+)?"
            + r"(?:of\s+)?([a-zA-Z\s]+?)" + _PRODUCT_END,
            re.IGNORECASE,
        ),
        # "20 [x] Apples from ..."
        re.compile(
            r"\d+\s*(?:x\s+)?([a-zA-Z\s]+?)" + _PRODUCT_END,
            re.IGNORECASE,
        ),
    )
    # Deliberately case-sensitive: with IGNORECASE "[A-Z]" would match any
    # letter and the pattern would accept every leading run of words.
    _CAPITALIZED_NAME = re.compile(
        r"\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*)\b(?!\s+(?:from|at|for|€|\$))"
    )
    _NON_PRODUCT_WORDS = frozenset(
        ['from', 'at', 'for', 'to', 'we', 'i', 'you', 'the', 'a', 'an', 'and', 'or']
    )

    _QUANTITY_PATTERNS = (
        # Number with an explicit unit
        re.compile("(" + _NUMBER + r")\s*" + _UNIT_WORDS, re.IGNORECASE),
        # Number followed by "of" or "x"
        re.compile("(" + _NUMBER + r")\s*(?:of|x)\s+", re.IGNORECASE),
        # Number right after a transaction verb
        re.compile(
            r"(?:bought|purchased|buy|purchase|sold|sale|sell)\s+(?:of\s+)?("
            + _NUMBER + ")",
            re.IGNORECASE,
        ),
        # Number at the very start of the text
        re.compile("^(" + _NUMBER + r")\s+", re.IGNORECASE),
    )

    _UNIT_PATTERN = re.compile(
        r"\d+(?:\.\d+)?\s*(" + _UNIT_ALTERNATIVES + ")", re.IGNORECASE
    )
    _UNIT_MAPPING = {
        'ton': 'tons', 'kg': 'kg', 'kilogram': 'kg', 'kilograms': 'kg',
        'pound': 'lbs', 'pounds': 'lbs', 'lb': 'lbs', 'lbs': 'lbs',
        'piece': 'pieces', 'pieces': 'pieces',
        'unit': 'units', 'units': 'units',
        'item': 'items', 'items': 'items',
        'box': 'boxes', 'boxes': 'boxes',
        'liter': 'liters', 'liters': 'liters',
        'gallon': 'gallons', 'gallons': 'gallons',
    }

    # Currency signs are written as a character class: a bare "$" inside an
    # alternation is the end-of-string anchor, not a dollar sign.
    _PRICE_PATTERNS = (
        # "at 5.50", "for 5.50", "$5.50", "€5.50"
        re.compile(r"(?:\b(?:at|for)\b|[€$])\s*(" + _NUMBER + ")", re.IGNORECASE),
        # "5.50€ each"
        re.compile(
            "(" + _NUMBER + r")\s*[€$]\s*(?:each|per|unit)", re.IGNORECASE
        ),
        # "5.50€"
        re.compile("(" + _NUMBER + r")\s*[€$]", re.IGNORECASE),
    )

    # Transaction cues. Verbs/nouns match as word prefixes ("buy" also hits
    # "buying"); the prepositions must be whole words so that "tons" or
    # "tomatoes" no longer count as the sale cue "to".
    _PURCHASE_CUES = tuple(
        re.compile(cue, re.IGNORECASE)
        for cue in (r"\bpurchase", r"\bbuy", r"\bbought", r"\border",
                    r"\bfrom\b", r"\bsupplier")
    )
    _SALE_CUES = tuple(
        re.compile(cue, re.IGNORECASE)
        for cue in (r"\bsale", r"\bsell", r"\bsold", r"\bto\b",
                    r"\bcustomer", r"\bclient")
    )

    def __init__(self):
        """Load the spaCy model; fall back to regex-only mode if it is missing."""
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

    def extract_entities(self, text: str) -> "EntityExtraction":
        """Extract all transaction fields from ``text``.

        The supplier is only looked up for purchases and the customer only
        for sales; the original text is stored unchanged in ``notes``.
        """
        text_lower = text.lower()
        # Determine transaction type
        transaction_type = self._detect_transaction_type(text_lower)
        # Extract entities
        product = self._extract_product(text)
        quantity = self._extract_quantity(text)
        unit = self._extract_unit(text)
        supplier = self._extract_supplier(text) if transaction_type == "purchase" else None
        customer = self._extract_customer(text) if transaction_type == "sale" else None
        unit_price = self._extract_unit_price(text)
        total_amount = self._calculate_total(quantity, unit_price)
        return EntityExtraction(
            product=product,
            quantity=quantity,
            unit=unit,
            supplier=supplier,
            customer=customer,
            unit_price=unit_price,
            total_amount=total_amount,
            transaction_type=transaction_type,
            notes=text
        )

    def _detect_transaction_type(self, text: str) -> str:
        """Return ``"purchase"`` or ``"sale"`` by counting cue words.

        Each cue counts at most once. Ties (including no cue at all)
        resolve to ``"purchase"``.
        """
        purchase_score = sum(1 for cue in self._PURCHASE_CUES if cue.search(text))
        sale_score = sum(1 for cue in self._SALE_CUES if cue.search(text))
        return "purchase" if purchase_score >= sale_score else "sale"

    def _first_entity(self, text: str, labels, min_length: int = 0) -> Optional[str]:
        """Return the first spaCy entity with one of ``labels``, or ``None``.

        Returns ``None`` immediately when no spaCy model is loaded.
        """
        if not self.nlp:
            return None
        for ent in self.nlp(text).ents:
            if ent.label_ in labels and len(ent.text) >= min_length:
                return ent.text
        return None

    def _extract_product(self, text: str) -> Optional[str]:
        """Extract the product name from ``text``.

        Tries the phrase patterns in order, then a capitalised name, then
        spaCy ``PRODUCT``/``ORG`` entities longer than two characters.
        Returns ``None`` when nothing usable is found.
        """
        for pattern in self._PRODUCT_PATTERNS:
            match = pattern.search(text)
            if match:
                product = match.group(1).strip()
                # Skip empty captures and common non-product words.
                if product and product.lower() not in self._NON_PRODUCT_WORDS:
                    return product
        # Look at every capitalised run, not only the first, so that a
        # leading "We" or "I" does not hide a later product name.
        for match in self._CAPITALIZED_NAME.finditer(text):
            product = match.group(1)
            if product.lower() not in self._NON_PRODUCT_WORDS:
                return product
        return self._first_entity(text, ("PRODUCT", "ORG"), min_length=3)

    def _extract_quantity(self, text: str) -> Optional[int]:
        """Extract the quantity from ``text`` as an integer.

        Decimal quantities are rounded with the built-in ``round``.
        Returns ``None`` when no pattern matches.
        """
        for pattern in self._QUANTITY_PATTERNS:
            match = pattern.search(text)
            if match:
                return int(round(float(match.group(1))))
        return None

    def _extract_unit(self, text: str) -> Optional[str]:
        """Extract the unit following a number, normalised (e.g. ``ton`` -> ``tons``)."""
        match = self._UNIT_PATTERN.search(text)
        if not match:
            return None
        unit = match.group(1).lower()
        # Unknown spellings (e.g. "tons") pass through unchanged.
        return self._UNIT_MAPPING.get(unit, unit)

    def _extract_counterparty(self, text: str, keywords, entity_label: str) -> Optional[str]:
        """Return the name following one of ``keywords``, else a spaCy entity.

        Keywords are matched as whole words, so the "to" inside "potato"
        does not start a customer name, and the name only ends at a
        whole-word "at"/"for", a currency sign, or the end of the text.
        """
        for keyword in keywords:
            pattern = r"\b" + keyword + r"\s+([A-Za-z\s]+?)" + self._PARTY_END
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                name = match.group(1).strip()
                if name:
                    return name
        return self._first_entity(text, (entity_label,))

    def _extract_supplier(self, text: str) -> Optional[str]:
        """Extract the supplier: text after "from"/"supplier", else a spaCy ``ORG``."""
        return self._extract_counterparty(text, ("from", "supplier"), "ORG")

    def _extract_customer(self, text: str) -> Optional[str]:
        """Extract the customer: text after "to"/"customer", else a spaCy ``PERSON``."""
        return self._extract_counterparty(text, ("to", "customer"), "PERSON")

    def _extract_unit_price(self, text: str) -> Optional[float]:
        """Extract the unit price from ``text``.

        Recognises a number after "at"/"for" or a currency sign, and a
        number directly followed by a currency sign. Returns ``None`` when
        no price is found.
        """
        for pattern in self._PRICE_PATTERNS:
            match = pattern.search(text)
            if match:
                return float(match.group(1))
        return None

    def _calculate_total(self, quantity: Optional[int], unit_price: Optional[float]) -> Optional[float]:
        """Return ``quantity * unit_price``, or ``None`` if either is missing.

        Only ``None`` counts as missing, so a quantity or price of zero
        yields a total of zero.
        """
        if quantity is None or unit_price is None:
            return None
        return quantity * unit_price