Akash076 committed on
Commit
b8e8ab6
·
verified ·
1 Parent(s): 3dbd7f5

Upload 5 files

Browse files
Files changed (5) hide show
  1. api_client.py +39 -0
  2. ocr_processor.py +23 -0
  3. temp_img.jpg +0 -0
  4. text_cleaner.py +46 -0
  5. utils.py +5 -0
api_client.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
def fetch_product(info):
    """Look up a product, trying the most specific search strategy first.

    Strategies are attempted in this order and the first hit wins:
    every detected barcode, then "brand + product name", then the product
    name alone, and finally the brand alone.

    Args:
        info: Mapping that may contain ``'barcodes'`` (an iterable of
            barcodes), ``'brand'`` and ``'product_name'``. A missing, empty
            or ``None`` entry simply skips the strategies that need it.

    Returns:
        The result of the first ``fetch_by_barcode`` / ``fetch_by_query``
        call that produced something truthy, the result of the brand-only
        query if that is the last strategy tried, or ``None`` when no
        strategy could be attempted.
    """
    # Tolerant lookups: a partially filled mapping must not raise KeyError.
    barcodes = info.get('barcodes') or []
    brand = info.get('brand')
    product_name = info.get('product_name')

    # Barcode search first: it identifies a product unambiguously.
    for barcode in barcodes:
        product = fetch_by_barcode(barcode)
        if product:
            return product

    # Otherwise try the brand together with the product name.
    if brand and product_name:
        product = fetch_by_query(f"{brand} {product_name}")
        if product:
            return product

    # Then the product name on its own.
    if product_name:
        product = fetch_by_query(product_name)
        if product:
            return product

    # Last resort: search by brand only.
    if brand:
        return fetch_by_query(brand)

    return None
28
+
29
def fetch_by_barcode(barcode, timeout=10):
    """Fetch a product from the Open Food Facts API by barcode.

    Args:
        barcode: Barcode to look up; it is interpolated into the request URL.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The ``'product'`` entry of the JSON response when its ``'status'``
        field equals 1, otherwise ``None``. ``None`` is also returned when
        the response body is not a JSON object.

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    url = f"https://world.openfoodfacts.org/api/v0/product/{barcode}.json"
    response = requests.get(url, timeout=timeout)
    try:
        data = response.json()
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) carries no product.
        return None
    if not isinstance(data, dict):
        return None
    return data.get('product') if data.get('status') == 1 else None
34
+
35
def fetch_by_query(query, timeout=10):
    """Search Open Food Facts by free text and return the top result.

    Args:
        query: Search terms. They are sent via ``params`` so that spaces
            and reserved characters such as ``&`` or ``#`` are URL-encoded
            instead of corrupting the query string.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The first element of the response's ``'products'`` list (results are
        requested sorted by ``unique_scans_n``), or ``None`` when the list
        is empty or the response body is not a JSON object.

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    url = "https://world.openfoodfacts.org/cgi/search.pl"
    params = {
        "search_terms": query,
        "sort_by": "unique_scans_n",
        "json": 1,
    }
    response = requests.get(url, params=params, timeout=timeout)
    try:
        payload = response.json()
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) carries no results.
        return None
    if not isinstance(payload, dict):
        return None
    products = payload.get('products', [])
    return products[0] if products else None
ocr_processor.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import easyocr
2
+ from text_cleaner import clean_product_text, extract_keywords
3
+ import re
4
+
5
# Shared OCR reader; created lazily by _get_reader().
_OCR_READER = None


def _get_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['en'])
    return _OCR_READER


def extract_info(image_path):
    """Run OCR on an image and extract product-related information.

    Args:
        image_path: Image to read; passed straight to EasyOCR's
            ``readtext``.

    Returns:
        A dict with these keys:

        * ``"raw_text"``: all recognised text fragments joined by spaces.
        * ``"clean_text"``: ``raw_text`` after ``clean_product_text``.
        * ``"brand"`` / ``"product_name"``: the pair returned by
          ``extract_keywords`` for the cleaned text.
        * ``"barcodes"``: distinct 12- or 13-digit numbers found in the raw
          text, in order of first appearance.
    """
    # Reuse one reader across calls instead of rebuilding it for every image.
    results = _get_reader().readtext(image_path, detail=0)
    raw_text = " ".join(results)

    # Clean the text, then derive brand and product name from it.
    clean_text = clean_product_text(raw_text)
    brand, product_name = extract_keywords(clean_text)

    # Barcode detection runs on the raw text because cleaning strips digits.
    # dict.fromkeys drops repeats while preserving order, so the same code
    # is not looked up twice.
    barcodes = list(dict.fromkeys(re.findall(r'\b(\d{12,13})\b', raw_text)))

    return {
        "raw_text": raw_text,
        "clean_text": clean_text,
        "brand": brand,
        "product_name": product_name,
        "barcodes": barcodes,
    }
temp_img.jpg ADDED
text_cleaner.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def clean_product_text(text):
    """Strip quantities, punctuation, numbers and label boilerplate from text.

    Args:
        text: Raw text, e.g. OCR output from a product label.

    Returns:
        The text with quantity expressions (such as ``500g`` or
        ``12 fl oz``), non-word characters, standalone numbers and a fixed
        list of label phrases removed, and whitespace collapsed to single
        spaces with no leading or trailing space.
    """
    # Remove quantities. "fl oz" is listed first so the whole unit is
    # consumed; otherwise "12 fl oz" would leave a stray "oz" behind.
    text = re.sub(
        r'\d+\s*(?:fl\.?\s*oz|ml|g|kg|oz|l|lb|fl\.?)\b',
        '',
        text,
        flags=re.IGNORECASE,
    )

    # Remove special characters (anything that is not a word char or space).
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers that stand on their own.
    text = re.sub(r'\b\d+\b', '', text)

    # Remove non-product phrases. Labels usually print these in upper or
    # title case, so the match must ignore case like the quantity regex.
    stop_phrases = ['net wt', 'net weight', 'volume', 'best before', 'expiry']
    for phrase in stop_phrases:
        text = re.sub(re.escape(phrase), '', text, flags=re.IGNORECASE)

    # Collapse runs of whitespace left behind by the removals.
    text = re.sub(r'\s+', ' ', text).strip()

    return text
21
+
22
def extract_keywords(text):
    """Split cleaned label text into a brand and a product name.

    The first word is taken as the brand. The product name is the longest
    (by character count) run of consecutive "significant" words after the
    brand, where a word is significant if it is all upper case or longer
    than three characters. On ties the earliest run wins. If no word is
    significant, up to three words following the brand are used instead.

    Args:
        text: Whitespace-separated text, typically already cleaned.

    Returns:
        A ``(brand, product_name)`` tuple of strings; both are empty when
        ``text`` contains no words.
    """
    words = text.split()
    if not words:
        return "", ""

    # The first word is assumed to be the brand.
    brand = words[0]

    longest_phrase = ""
    current_words = []

    for word in words[1:]:
        if word.isupper() or len(word) > 3:
            current_words.append(word)
            continue
        # An insignificant word ends the current run.
        candidate = " ".join(current_words)
        if len(candidate) > len(longest_phrase):
            longest_phrase = candidate
        current_words = []

    # The last run has no terminating word, so compare it explicitly;
    # otherwise a shorter earlier phrase would beat a longer trailing one.
    candidate = " ".join(current_words)
    if len(candidate) > len(longest_phrase):
        longest_phrase = candidate

    # Final check: fall back to the words right after the brand.
    product_phrase = longest_phrase or " ".join(words[1:4])

    return brand, product_phrase
utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import os
2
+
3
def clean_temp_files(path="temp_img.jpg"):
    """Delete the temporary image file if it exists.

    Args:
        path: File to remove. Defaults to ``"temp_img.jpg"`` relative to
            the current working directory.

    Raises:
        OSError: If the file exists but cannot be removed (for example
            because of missing permissions). A missing file is not an error.
    """
    # Attempt the removal directly rather than checking first: the file may
    # disappear between an existence check and the delete.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass