Spaces:

Ancastal
/

Business_Chatbot

Sleeping

App Files Files Community

Business_Chatbot / src /entity_extractor.py

Ancastal

Upload folder using huggingface_hub

401b16c verified 8 months ago

raw

history blame contribute delete

8.4 kB

	import re
	import spacy
	from typing import Optional, Dict, Any
	from datetime import datetime
	from dateutil import parser as date_parser
	from models import EntityExtraction

	class EntityExtractor:
	    """Rule-based extractor for purchase/sale details in free-form text.

	    Regular expressions do most of the work; a spaCy pipeline, when the
	    ``en_core_web_sm`` model is installed, is used only as a fallback for
	    product, supplier and customer names. All patterns are compiled once
	    at class-definition time.
	    """

	    # --- shared regex fragments -------------------------------------------
	    _NUMBER = r"\d+(?:\.\d+)?"
	    # One unit vocabulary for quantity, unit and product patterns so they
	    # cannot drift apart. "box(?:es)?" also accepts the singular "box".
	    _UNIT_WORDS = (
	        r"(?:tons?|kg|kilograms?|pounds?|lbs?|pieces?|units?|items?"
	        + r"|box(?:es)?|liters?|gallons?)"
	    )
	    _VERBS = r"(?:bought|purchased|buy|purchase|sold|sale|sell)"
	    # What may follow a product name: a preposition or a currency sign.
	    _PRODUCT_END = r"(?:\s+(?:from|to|at|for)\b|\s*[€$])"
	    # What may follow a supplier/customer name.
	    _PARTY_END = r"(?:\s+(?:at|for)\b|\s*[€$]|$)"
	    # Monetary amount; allows thousands separators and any decimal length.
	    _AMOUNT = r"(\d+(?:,\d{3})*(?:\.\d+)?)"

	    # --- transaction-type cues --------------------------------------------
	    # Anchored at word starts so that e.g. "to" is not found inside "tons".
	    _PURCHASE_CUES = tuple(
	        re.compile(cue, re.IGNORECASE)
	        for cue in (
	            r"\bpurchas", r"\bbuy", r"\bbought\b",
	            r"\border", r"\bfrom\b", r"\bsupplier",
	        )
	    )
	    _SALE_CUES = tuple(
	        re.compile(cue, re.IGNORECASE)
	        for cue in (
	            r"\bsale", r"\bsell", r"\bsold\b",
	            r"\bto\b", r"\bcustomer", r"\bclient",
	        )
	    )

	    # --- product patterns (tried in order) --------------------------------
	    _PRODUCT_PATTERNS = (
	        # "<n> <unit> of <product>", e.g. "20 tons of Apples"
	        re.compile(
	            _NUMBER + r"\s*" + _UNIT_WORDS + r"\s+of\s+([a-zA-Z\s]+?)"
	            + r"(?:\s+(?:from|to|at|for)\b|\s*[€$]|$)",
	            re.IGNORECASE,
	        ),
	        # "<verb> [<n> [<unit>]] [of] <product>"
	        re.compile(
	            r"\b" + _VERBS + r"\s+(?:" + _NUMBER + r"\s*" + _UNIT_WORDS
	            + r"?\s+)?(?:of\s+)?([a-zA-Z\s]+?)" + _PRODUCT_END,
	            re.IGNORECASE,
	        ),
	        # "<n> [x] <product>"
	        re.compile(
	            _NUMBER + r"\s*(?:x\s+)?([a-zA-Z\s]+?)" + _PRODUCT_END,
	            re.IGNORECASE,
	        ),
	        # Standalone Capitalized Names. Deliberately case-SENSITIVE:
	        # with IGNORECASE this would match any word at all.
	        re.compile(
	            r"\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*)\b"
	            + r"(?!\s+(?:from|at|for|€|\$))"
	        ),
	    )
	    # Candidates that are never a product: function words and the
	    # transaction verbs themselves (a sentence-initial "Bought").
	    _NON_PRODUCTS = frozenset({
	        'from', 'at', 'for', 'to', 'we', 'i', 'you', 'the', 'a', 'an',
	        'and', 'or', 'bought', 'purchased', 'buy', 'purchase', 'sold',
	        'sale', 'sell',
	    })

	    # --- quantity / unit patterns -----------------------------------------
	    _QUANTITY_PATTERNS = (
	        # number with an explicit unit
	        re.compile(r"(" + _NUMBER + r")\s*" + _UNIT_WORDS, re.IGNORECASE),
	        # number followed by "of" or "x"
	        re.compile(r"(" + _NUMBER + r")\s*(?:of|x)\s+", re.IGNORECASE),
	        # number right after a transaction verb
	        re.compile(
	            r"\b" + _VERBS + r"\s+(?:of\s+)?(" + _NUMBER + r")",
	            re.IGNORECASE,
	        ),
	        # number at the very start of the text
	        re.compile(r"^(" + _NUMBER + r")\s+"),
	    )
	    _UNIT_PATTERN = re.compile(
	        _NUMBER + r"\s*(" + _UNIT_WORDS + r")", re.IGNORECASE
	    )
	    # Canonical spelling for every unit form the pattern can capture.
	    _UNIT_ALIASES = {
	        'ton': 'tons', 'tons': 'tons',
	        'kg': 'kg', 'kilogram': 'kg', 'kilograms': 'kg',
	        'pound': 'lbs', 'pounds': 'lbs', 'lb': 'lbs', 'lbs': 'lbs',
	        'piece': 'pieces', 'pieces': 'pieces',
	        'unit': 'units', 'units': 'units',
	        'item': 'items', 'items': 'items',
	        'box': 'boxes', 'boxes': 'boxes',
	        'liter': 'liters', 'liters': 'liters',
	        'gallon': 'gallons', 'gallons': 'gallons',
	    }

	    # --- party (supplier / customer) patterns -----------------------------
	    _SUPPLIER_PATTERNS = (
	        re.compile(r"\bfrom\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
	        re.compile(r"\bsupplier\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
	    )
	    _CUSTOMER_PATTERNS = (
	        re.compile(r"\bto\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
	        re.compile(r"\bcustomer\s+([A-Za-z\s]+?)" + _PARTY_END, re.IGNORECASE),
	    )

	    # --- price patterns (tried in order) ----------------------------------
	    _PRICE_PATTERNS = (
	        # "at 5", "for 5", "$5", "€ 5"
	        re.compile(r"(?:\b(?:at|for)\b|[€$])\s*" + _AMOUNT, re.IGNORECASE),
	        # "5€ each", "5 $ per"
	        re.compile(_AMOUNT + r"\s*[€$]\s*(?:each|per|unit)", re.IGNORECASE),
	        # "5€", "cost of 5 $" (any leading words are irrelevant to the capture)
	        re.compile(_AMOUNT + r"\s*[€$]"),
	    )

	    def __init__(self):
	        """Load the small English spaCy model; fall back to regex-only mode."""
	        try:
	            self.nlp = spacy.load("en_core_web_sm")
	        except OSError:
	            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
	            self.nlp = None

	    def extract_entities(self, text: str) -> "EntityExtraction":
	        """Extract all transaction entities from user input text.

	        The supplier is only looked up for purchases and the customer only
	        for sales. The original text is stored unchanged in ``notes``.
	        """
	        transaction_type = self._detect_transaction_type(text.lower())

	        quantity = self._extract_quantity(text)
	        unit_price = self._extract_unit_price(text)

	        return EntityExtraction(
	            product=self._extract_product(text),
	            quantity=quantity,
	            unit=self._extract_unit(text),
	            supplier=self._extract_supplier(text) if transaction_type == "purchase" else None,
	            customer=self._extract_customer(text) if transaction_type == "sale" else None,
	            unit_price=unit_price,
	            total_amount=self._calculate_total(quantity, unit_price),
	            transaction_type=transaction_type,
	            notes=text
	        )

	    def _detect_transaction_type(self, text: str) -> str:
	        """Classify the text as ``"purchase"`` or ``"sale"``.

	        Each cue counts once, however often it occurs. Cues are matched at
	        word boundaries, so "to" inside "tons" or "potatoes" is not a sale
	        cue. Ties (including no cue at all) resolve to ``"purchase"``.
	        """
	        purchase_score = sum(1 for cue in self._PURCHASE_CUES if cue.search(text))
	        sale_score = sum(1 for cue in self._SALE_CUES if cue.search(text))
	        return "purchase" if purchase_score >= sale_score else "sale"

	    def _extract_product(self, text: str) -> Optional[str]:
	        """Return the product name found in ``text``, or ``None``.

	        Patterns are tried from most to least specific; within a pattern the
	        first candidate that is not a function word or transaction verb
	        wins. If no pattern yields a product and spaCy is available, the
	        first PRODUCT/ORG entity longer than two characters is returned.
	        """
	        for pattern in self._PRODUCT_PATTERNS:
	            for match in pattern.finditer(text):
	                product = match.group(1).strip()
	                if product and product.lower() not in self._NON_PRODUCTS:
	                    return product

	        if self.nlp is not None:
	            for ent in self.nlp(text).ents:
	                if ent.label_ in ["PRODUCT", "ORG"] and len(ent.text) > 2:
	                    return ent.text

	        return None

	    def _extract_quantity(self, text: str) -> Optional[int]:
	        """Return the quantity as an integer, or ``None`` if none is found.

	        Decimal quantities are rounded with the built-in ``round`` (halves
	        go to the nearest even integer), so "2.6 kg" yields 3.
	        """
	        for pattern in self._QUANTITY_PATTERNS:
	            match = pattern.search(text)
	            if match:
	                # The pattern only captures digits with an optional decimal
	                # part, so float() cannot fail here.
	                return int(round(float(match.group(1))))
	        return None

	    def _extract_unit(self, text: str) -> Optional[str]:
	        """Return the normalized unit that follows a number, or ``None``.

	        Singular and plural spellings map to one canonical form, e.g.
	        "ton" -> "tons", "kilograms" -> "kg", "box" -> "boxes".
	        """
	        match = self._UNIT_PATTERN.search(text)
	        if match is None:
	            return None
	        unit = match.group(1).lower()
	        return self._UNIT_ALIASES.get(unit, unit)

	    def _extract_supplier(self, text: str) -> Optional[str]:
	        """Return the supplier named after "from" or "supplier", or ``None``.

	        Falls back to the first spaCy ORG entity when no pattern matches.
	        """
	        for pattern in self._SUPPLIER_PATTERNS:
	            match = pattern.search(text)
	            if match:
	                return match.group(1).strip()

	        if self.nlp is not None:
	            for ent in self.nlp(text).ents:
	                if ent.label_ == "ORG":
	                    return ent.text

	        return None

	    def _extract_customer(self, text: str) -> Optional[str]:
	        """Return the customer named after "to" or "customer", or ``None``.

	        Falls back to the first spaCy PERSON entity when no pattern matches.
	        """
	        for pattern in self._CUSTOMER_PATTERNS:
	            match = pattern.search(text)
	            if match:
	                return match.group(1).strip()

	        if self.nlp is not None:
	            for ent in self.nlp(text).ents:
	                if ent.label_ == "PERSON":
	                    return ent.text

	        return None

	    def _extract_unit_price(self, text: str) -> Optional[float]:
	        """Return the unit price found in ``text``, or ``None``.

	        Accepts "at 5", "for 5", "$5.5", "5€ each" and amounts with
	        thousands separators such as "$1,200.50".
	        """
	        for pattern in self._PRICE_PATTERNS:
	            match = pattern.search(text)
	            if match:
	                # Drop thousands separators before converting.
	                return float(match.group(1).replace(",", ""))
	        return None

	    def _calculate_total(self, quantity: Optional[int], unit_price: Optional[float]) -> Optional[float]:
	        """Return ``quantity * unit_price``, or ``None`` if either is missing.

	        Only ``None`` counts as missing; a quantity or price of 0 gives 0.
	        """
	        if quantity is None or unit_price is None:
	            return None
	        return quantity * unit_price