# ocr_product_scanner/src/text_cleaner.py
# Uploaded by Akash076 ("Upload 2 files", commit c4029c4, verified).
import re
# Quantity tokens such as "500ml", "45 g", "2 lbs" or "12 fl. oz".
# "fl. oz" is listed first so the trailing "oz" is consumed with it.
_QUANTITY_RE = re.compile(
    r'\d+\s*(?:fl\.?\s*oz|ml|kg|lbs?|oz|fl|g|l)\b',
    flags=re.IGNORECASE,
)
# Anything that is neither a word character nor whitespace.
_SPECIAL_CHARS_RE = re.compile(r'[^\w\s]')
# Stand-alone integers left over after the quantity pass.
_BARE_NUMBER_RE = re.compile(r'\b\d+\b')
_WHITESPACE_RE = re.compile(r'\s+')

# Label boilerplate that is not part of a product name.
_STOP_PHRASES = ('net wt', 'net weight', 'volume', 'best before', 'expiry')
# Whole-phrase, case-insensitive match; any run of whitespace is accepted
# between the words of a phrase because whitespace is only collapsed at the
# very end of the cleaning pipeline.
_STOP_PHRASES_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        r'\s+'.join(re.escape(part) for part in phrase.split())
        for phrase in _STOP_PHRASES
    )
    + r')\b',
    flags=re.IGNORECASE,
)


def clean_product_text(text):
    """Strip quantities, punctuation, numbers and label boilerplate from text.

    The steps run in this order:

    1. remove quantity tokens (a number followed by a unit such as ``ml``,
       ``g``, ``kg``, ``oz``, ``l``, ``lb``/``lbs``, ``fl`` or ``fl oz``);
    2. remove every character that is not a word character or whitespace;
    3. remove stand-alone integers;
    4. remove the stop phrases ("net wt", "net weight", "volume",
       "best before", "expiry") regardless of letter case;
    5. collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: Raw text to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text as a single-spaced string.
    """
    if not text:
        return ""

    text = _QUANTITY_RE.sub('', text)
    text = _SPECIAL_CHARS_RE.sub('', text)
    text = _BARE_NUMBER_RE.sub('', text)
    # Case-insensitive so "Net Wt" / "BEST BEFORE" are removed as well.
    text = _STOP_PHRASES_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()
def extract_keywords(text):
    """Split text into a brand and a product phrase.

    The first whitespace-separated word is taken as the brand. The remaining
    words are grouped into runs of consecutive "significant" words (a word is
    significant when it is all upper-case or longer than three characters).
    The longest run, measured in characters, becomes the product phrase; on a
    tie the earliest run wins. If no word is significant, up to three words
    following the brand are used instead.

    Args:
        text: Text to analyse. ``None`` or blank text yields ``("", "")``.

    Returns:
        A ``(brand, product_phrase)`` tuple of strings.
    """
    words = text.split() if text else []
    if not words:
        return "", ""

    brand, *rest = words

    # Collect every run of consecutive significant words.
    runs = []
    current = []
    for word in rest:
        if word.isupper() or len(word) > 3:
            current.append(word)
        elif current:
            runs.append(current)
            current = []
    # A run that reaches the end of the text must compete too; otherwise a
    # shorter, earlier run would win over it.
    if current:
        runs.append(current)

    if runs:
        # max() returns the first maximal item, so ties keep the earliest run.
        best = max(runs, key=lambda run: len(" ".join(run)))
        product_phrase = " ".join(best)
    else:
        # Final fallback: no significant words, take up to three after brand.
        product_phrase = " ".join(rest[:3])

    return brand, product_phrase