Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- api_client.py +39 -0
- ocr_processor.py +23 -0
- temp_img.jpg +0 -0
- text_cleaner.py +46 -0
- utils.py +5 -0
api_client.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
|
| 3 |
+
def fetch_product(info):
    """Look up a product using a multi-step search strategy.

    Lookups run from most to least specific:

    1. every barcode in ``info['barcodes']``, in order;
    2. brand and product name combined into one query;
    3. product name alone;
    4. brand alone.

    Steps 1-3 return as soon as a lookup yields a truthy product. Step 4 is
    the last resort, so its result is returned as-is.

    Args:
        info: Mapping that may contain ``'barcodes'`` (iterable of barcode
            strings), ``'brand'`` and ``'product_name'``. Missing keys and
            ``None``/empty values are treated as "nothing to search with".

    Returns:
        The result of the first successful lookup, or ``None`` when no step
        finds anything.
    """
    # Read fields tolerantly: a partial dict (or explicit None values) must
    # not raise before the remaining fallbacks get a chance to run.
    barcodes = info.get('barcodes') or []
    brand = info.get('brand')
    product_name = info.get('product_name')

    # Barcode search first: it is the most precise identifier available.
    for barcode in barcodes:
        product = fetch_by_barcode(barcode)
        if product:
            return product

    # In case no barcode matched, try brand + product name together.
    if brand and product_name:
        product = fetch_by_query(f"{brand} {product_name}")
        if product:
            return product

    # Then try the product name on its own.
    if product_name:
        product = fetch_by_query(product_name)
        if product:
            return product

    # Fall back to a brand-only search.
    if brand:
        return fetch_by_query(brand)

    return None
|
| 28 |
+
|
| 29 |
+
def fetch_by_barcode(barcode, timeout=10):
    """Fetch a single product from Open Food Facts by its barcode.

    Args:
        barcode: Barcode to look up; it is interpolated into the product
            endpoint URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``'product'`` entry of the JSON response when the response
        reports ``status == 1``; otherwise ``None``. ``None`` is also
        returned for HTTP error responses.

    Raises:
        requests.RequestException: On connection failures or timeouts.
        ValueError: If a successful response does not contain valid JSON.
    """
    url = f"https://world.openfoodfacts.org/api/v0/product/{barcode}.json"
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole lookup chain.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # An HTTP error page is not a product record; treat it as "not found"
        # rather than trying to parse it.
        return None
    data = response.json()
    return data.get('product') if data.get('status') == 1 else None
|
| 34 |
+
|
| 35 |
+
def fetch_by_query(query, timeout=10):
    """Search Open Food Facts by free text and return the top hit.

    Results are requested sorted by ``unique_scans_n``, and only the first
    entry of the returned ``'products'`` list is used.

    Args:
        query: Free-text search terms.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first product in the response, or ``None`` when the search
        returns no products or the server answers with an HTTP error.

    Raises:
        requests.RequestException: On connection failures or timeouts.
        ValueError: If a successful response does not contain valid JSON.
    """
    # Pass the terms through ``params`` so requests URL-encodes them; pasting
    # the raw query into the URL breaks on characters such as '&', '#' or '+'.
    response = requests.get(
        "https://world.openfoodfacts.org/cgi/search.pl",
        params={
            "search_terms": query,
            "sort_by": "unique_scans_n",
            "json": 1,
        },
        timeout=timeout,
    )
    if not response.ok:
        return None
    products = response.json().get('products', [])
    return products[0] if products else None
|
ocr_processor.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import easyocr
|
| 2 |
+
from text_cleaner import clean_product_text, extract_keywords
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
# Shared OCR reader, created lazily by _get_reader().
_OCR_READER = None


def _get_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _OCR_READER
    if _OCR_READER is None:
        # Building a Reader is the expensive part of OCR, so it is done once
        # and reused instead of being repeated for every image.
        _OCR_READER = easyocr.Reader(['en'])
    return _OCR_READER


def extract_info(image_path):
    """Run OCR on an image and derive product search hints from the text.

    Args:
        image_path: Image to read, passed straight to ``Reader.readtext``.

    Returns:
        A dict with the keys:

        * ``"raw_text"``: all recognised text fragments joined by spaces;
        * ``"clean_text"``: ``raw_text`` after ``clean_product_text``;
        * ``"brand"`` / ``"product_name"``: the pair returned by
          ``extract_keywords`` for the cleaned text;
        * ``"barcodes"``: every standalone 12- or 13-digit run found in
          ``raw_text``.
    """
    fragments = _get_reader().readtext(image_path, detail=0)
    raw_text = " ".join(fragments)

    # Clean the text and extract the brand / product name from it.
    clean_text = clean_product_text(raw_text)
    brand, product_name = extract_keywords(clean_text)

    # Barcode detection runs on the raw text, because cleaning strips
    # standalone numbers.
    barcodes = re.findall(r'\b(\d{12,13})\b', raw_text)

    return {
        "raw_text": raw_text,
        "clean_text": clean_text,
        "brand": brand,
        "product_name": product_name,
        "barcodes": barcodes,
    }
|
temp_img.jpg
ADDED
|
text_cleaner.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def clean_product_text(text):
    """Strip quantities, punctuation, numbers and label boilerplate from text.

    Steps, in order:

    1. remove quantities such as ``500ml``, ``100 g`` or ``12 fl oz``;
    2. remove every character that is neither a word character nor
       whitespace;
    3. remove standalone numbers;
    4. remove boilerplate phrases ("net wt", "best before", ...), in any
       letter case;
    5. collapse runs of whitespace and trim the ends.

    Args:
        text: Raw label text.

    Returns:
        The cleaned text; an empty string when nothing remains.
    """
    # Remove quantities. "fl oz" is listed before the bare "fl" alternative so
    # the whole unit is consumed instead of leaving a stray "oz" behind.
    text = re.sub(
        r'\d+\s*(?:fl\.?\s*oz|ml|g|kg|oz|l|lb|fl\.?)\b',
        '',
        text,
        flags=re.IGNORECASE,
    )

    # Remove special characters.
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers that stand on their own.
    text = re.sub(r'\b\d+\b', '', text)

    # Remove non-product words. Matching ignores case because label text is
    # frequently printed in capitals ("NET WT", "Best Before").
    stop_phrases = ['net wt', 'net weight', 'volume', 'best before', 'expiry']
    for phrase in stop_phrases:
        text = re.sub(re.escape(phrase), '', text, flags=re.IGNORECASE)

    # Normalise whitespace left behind by the removals above.
    return re.sub(r'\s+', ' ', text).strip()
|
| 21 |
+
|
| 22 |
+
def extract_keywords(text):
    """Split label text into a brand and a product phrase.

    The first word is taken as the brand. The product phrase is the longest
    run of consecutive "significant" words that follow it, where a word is
    significant when it is all upper-case or longer than three characters.
    If several runs share the greatest length, the earliest one wins. When
    there is no significant word at all, up to three words following the
    brand are used instead.

    Args:
        text: Whitespace-separated label text.

    Returns:
        A ``(brand, product_phrase)`` tuple of strings; ``("", "")`` when
        ``text`` contains no words.
    """
    words = text.split()
    if not words:
        return "", ""

    # The first word is assumed to be the brand.
    brand, rest = words[0], words[1:]

    # Find the longest run of significant words.
    longest = ""
    run = []
    for word in rest:
        if word.isupper() or len(word) > 3:
            run.append(word)
            continue
        candidate = " ".join(run)
        if len(candidate) > len(longest):
            longest = candidate
        run = []

    # Final check: the text may end in the middle of a run, so the trailing
    # run has to compete with the best one found so far as well.
    candidate = " ".join(run)
    if len(candidate) > len(longest):
        longest = candidate

    product_phrase = longest or " ".join(rest[:3])
    return brand, product_phrase
|
utils.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
def clean_temp_files():
    """Delete ``temp_img.jpg`` from the current working directory, if present.

    A missing file is not an error. Any other failure to delete it (for
    example a permissions problem) still raises ``OSError``.
    """
    try:
        # Remove directly instead of checking existence first: the file could
        # vanish between the check and the removal.
        os.remove("temp_img.jpg")
    except FileNotFoundError:
        # Nothing to clean up.
        pass
|