Spaces:

Akash076
/

ocr_product_scanner

Sleeping

ocr_product_scanner / src /text_cleaner.py

Upload 2 files

c4029c4 verified 8 months ago

1.31 kB

	import re

	def clean_product_text(text):
	    """Strip quantities, punctuation, numbers and label boilerplate from text.

	    The cleaning steps run in a fixed order: quantity expressions first
	    (while their decimal points and unit dots are still present), then
	    punctuation, then stand-alone numbers, then stop phrases, and finally
	    whitespace normalisation.

	    Args:
	        text: Raw text of a product label. ``None`` or an empty string is
	            accepted and yields an empty string.

	    Returns:
	        The cleaned text with single spaces between words and no leading
	        or trailing whitespace.
	    """
	    if not text:
	        return ""

	    # Remove quantities such as "500ml", "1.5 L", "45 g" or "12 fl. oz".
	    # The unit alternatives must be separated by an unescaped "|"; an
	    # escaped "\|" would only match a literal pipe character.
	    text = re.sub(
	        r'\d+(?:[.,]\d+)?\s*(?:fl\.?\s*oz|fl\.?|ml|kg|g|oz|lbs?|l)\b',
	        '',
	        text,
	        flags=re.IGNORECASE,
	    )

	    # Remove special characters (anything that is not a word char or space).
	    text = re.sub(r'[^\w\s]', '', text)

	    # Remove numbers that stand alone as a word.
	    text = re.sub(r'\b\d+\b', '', text)

	    # Remove non-product words. Matching is case-insensitive and tolerant of
	    # repeated whitespace because label text is frequently upper-case and
	    # irregularly spaced.
	    text = re.sub(
	        r'\b(?:net\s+wt|net\s+weight|volume|best\s+before|expiry)\b',
	        '',
	        text,
	        flags=re.IGNORECASE,
	    )

	    # Collapse runs of whitespace into single spaces.
	    return re.sub(r'\s+', ' ', text).strip()

	def extract_keywords(text):
	    """Split cleaned label text into a brand and a product phrase.

	    The first whitespace-separated word is taken as the brand. The product
	    phrase is the longest (by character count) run of consecutive following
	    words that are either all upper-case or longer than three characters.
	    If no word qualifies, up to three words after the brand are used.

	    Args:
	        text: Label text. ``None`` or an empty/blank string is accepted.

	    Returns:
	        A ``(brand, product_phrase)`` tuple of strings; both are empty when
	        the input contains no words.
	    """
	    words = text.split() if text else []
	    if not words:
	        return "", ""

	    # The first word is assumed to be the brand.
	    brand, *rest = words

	    def _longer(candidate_words, best_so_far):
	        """Return whichever phrase is longer, preferring the earlier one."""
	        candidate = " ".join(candidate_words)
	        return candidate if len(candidate) > len(best_so_far) else best_so_far

	    # Find the longest run of qualifying words.
	    product_phrase = ""
	    current_run = []
	    for word in rest:
	        if word.isupper() or len(word) > 3:
	            current_run.append(word)
	        else:
	            # A short word ends the current run.
	            product_phrase = _longer(current_run, product_phrase)
	            current_run = []

	    # Final check: the run still open at the end of the text must compete
	    # on length too, otherwise a longer trailing phrase would be dropped.
	    product_phrase = _longer(current_run, product_phrase)

	    # Fall back to the first few words after the brand when nothing matched.
	    if not product_phrase:
	        product_phrase = " ".join(rest[:3])

	    return brand, product_phrase