Akash076 committed on
Commit
c4029c4
·
verified ·
1 Parent(s): ecfb52e

Upload 2 files

Browse files
Files changed (2) hide show
  1. src/text_cleaner.py +46 -0
  2. src/utils.py +5 -0
src/text_cleaner.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Quantity tokens such as "500ml", "1.5 kg", "12 fl. oz" or "2 lbs".
# "fl" optionally swallows a following "oz" so fluid ounces do not leave a
# stray "oz" behind.
_QUANTITY_RE = re.compile(
    r'\d+(?:[.,]\d+)?\s*(?:fl\.?(?:\s*oz)?|ml|kg|oz|lbs?|g|l)\b',
    flags=re.IGNORECASE,
)

# Label boilerplate that is never part of a product name.
_STOP_PHRASES = ('net wt', 'net weight', 'volume', 'best before', 'expiry')

# Whole-phrase, case-insensitive match; words inside a phrase may be separated
# by any amount of whitespace because whitespace is only collapsed afterwards.
_STOP_PHRASE_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        r'\s+'.join(re.escape(word) for word in phrase.split())
        for phrase in _STOP_PHRASES
    )
    + r')\b',
    flags=re.IGNORECASE,
)


def clean_product_text(text):
    """Strip quantities, punctuation, numbers and label boilerplate from text.

    The steps run in a fixed order: quantity tokens are removed first (while
    their punctuation is still present), then every non-word/non-space
    character, then standalone numbers, then the stop phrases, and finally
    runs of whitespace are collapsed to a single space.

    Args:
        text: Raw product text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or ``""`` when *text* is empty or ``None``.
    """
    if not text:
        return ""

    # Remove quantities (number + unit).
    text = _QUANTITY_RE.sub('', text)

    # Remove special characters (anything that is not a word char or space).
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers that stand alone as a token.
    text = re.sub(r'\b\d+\b', '', text)

    # Remove non-product phrases regardless of letter case, and only when they
    # appear as whole words so that e.g. "volumes" is not cut down to "s".
    text = _STOP_PHRASE_RE.sub('', text)

    # Collapse whitespace left behind by the removals.
    return re.sub(r'\s+', ' ', text).strip()
21
+
22
def extract_keywords(text):
    """Split cleaned product text into a brand and a product phrase.

    The first whitespace-separated word is taken as the brand. The remaining
    words are grouped into runs of consecutive "significant" words (all
    upper-case, or longer than three characters); the longest run by character
    length is the product phrase, with the earliest run winning ties. When no
    word qualifies, up to the first three words after the brand are used.

    Args:
        text: Product text. ``None`` or an empty string is accepted.

    Returns:
        A ``(brand, product_phrase)`` tuple of strings; ``("", "")`` when
        *text* contains no words.
    """
    words = text.split() if text else []
    if not words:
        return "", ""

    # The first word is assumed to be the brand.
    brand, *rest = words

    # Collect every run of consecutive significant words.
    runs = []
    current = []
    for word in rest:
        if word.isupper() or len(word) > 3:
            current.append(word)
        elif current:
            runs.append(" ".join(current))
            current = []
    # The trailing run must compete with earlier ones too; otherwise a longer
    # phrase at the end of the text loses to a shorter one seen earlier.
    if current:
        runs.append(" ".join(current))

    # max() returns the first maximal element, so the earliest run wins ties.
    product_phrase = max(runs, key=len, default="")

    # Final check: fall back to the first few words after the brand.
    if not product_phrase:
        product_phrase = " ".join(rest[:3])

    return brand, product_phrase.strip()
src/utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import os
2
+
3
def clean_temp_files(path="temp_img.jpg"):
    """Delete the temporary image file if it exists.

    Args:
        path: File to remove. Defaults to ``"temp_img.jpg"`` in the current
            working directory.

    A missing file is not an error. Other ``OSError`` failures (for example
    a permission problem) still propagate to the caller.
    """
    # Attempt the removal directly instead of checking existence first: the
    # file could disappear between the check and the delete.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass