Upload 21 files
Browse files- .gitattributes +1 -0
- app/__init__.py +7 -0
- app/__pycache__/__init__.cpython-39.pyc +0 -0
- app/__pycache__/routes.cpython-39.pyc +0 -0
- app/data/COSING_Cleaned_Normalized_v7(1).csv +0 -0
- app/data/bert_training_data.csv +0 -0
- app/data/brend_cleaned.csv +3 -0
- app/product_embeddings.pt +3 -0
- app/routes.py +101 -0
- app/utils/__pycache__/helper.cpython-39.pyc +0 -0
- app/utils/__pycache__/ner.cpython-39.pyc +0 -0
- app/utils/__pycache__/ocr.cpython-39.pyc +0 -0
- app/utils/__pycache__/predict_afteruse.cpython-39.pyc +0 -0
- app/utils/__pycache__/prediction.cpython-39.pyc +0 -0
- app/utils/__pycache__/recommendation.cpython-39.pyc +0 -0
- app/utils/clean.py +11 -0
- app/utils/helper.py +21 -0
- app/utils/ner.py +12 -0
- app/utils/ocr.py +20 -0
- app/utils/predict_afteruse.py +103 -0
- app/utils/prediction.py +56 -0
- app/utils/recommendation.py +21 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
app/data/brend_cleaned.csv filter=lfs diff=lfs merge=lfs -text
|
app/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from flask import Flask
from app.routes import analyze_blueprint


def create_app():
    """Application factory: build the Flask app and register all blueprints."""
    application = Flask(__name__)
    application.register_blueprint(analyze_blueprint)
    return application
app/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (399 Bytes). View file
|
|
|
app/__pycache__/routes.cpython-39.pyc
ADDED
|
Binary file (3.86 kB). View file
|
|
|
app/data/COSING_Cleaned_Normalized_v7(1).csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/data/bert_training_data.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/data/brend_cleaned.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80f4cc1b0e837c5126c92a36dfc013a4fdec1b2dc5d861ebdb668d2c3b102d9b
|
| 3 |
+
size 11649428
|
app/product_embeddings.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5819991edf37061272f0359a8686954c7d985eeea309af75806a47a65fa40c62
|
| 3 |
+
size 26899667
|
app/routes.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging
import re
from flask import Blueprint, request, jsonify
from concurrent.futures import ThreadPoolExecutor
from app.utils.ocr import extract_text_from_image
from app.utils.ner import extract_ingredients
from app.utils.prediction import predict_with_description
from app.utils.recommendation import recommend_similar_products
from app.utils.helper import correct_spelling, load_data
from app.utils.predict_afteruse import predict_after_use, generate_afteruse_sentence_en, predict_after_use_with_probs

analyze_blueprint = Blueprint('analyze', __name__)
logging.basicConfig(level=logging.INFO)

# Load data + model only once, at import time, so requests don't pay the cost.
df_cosing, df_brand, product_embeddings = load_data()


@analyze_blueprint.route("/analyze", methods=["POST"])
def analyze_ingredients():
    """Analyze cosmetic ingredients supplied as an image and/or text.

    Accepts, in priority order:
      * an uploaded file under ``ingredients`` (OCR'd with EasyOCR),
      * an ``ingredients`` field in the JSON body or form data,
      * a fallback uploaded file under ``image`` (OCR'd with pytesseract).

    Returns a JSON payload with per-ingredient analysis, product
    recommendations, and predicted combined after-use effects.
    Responds 400 when no ingredients could be recognized, 500 on error.
    """
    try:
        logging.info("Start analyzing ingredients")
        ingredients_input = []

        # OCR from image (optional)
        if 'ingredients' in request.files:
            logging.info("Extracting ingredients from uploaded image using OCR")
            text = extract_text_from_image(request.files['ingredients'])
            logging.info(f"OCR text result: {text}")
            if text.strip():
                extracted = extract_ingredients(text)
                logging.info(f"Extracted ingredients from OCR: {extracted}")
                ingredients_input.extend(extracted)

        # From JSON or form
        data = request.get_json(silent=True) or {}
        text_input = data.get('ingredients') or request.form.get('ingredients')
        logging.info(f"Text input from JSON/form: {text_input}")

        # Fallback OCR from form field 'image' (pytesseract is an optional
        # dependency, so it is imported lazily here).
        if not text_input and 'image' in request.files:
            from PIL import Image
            import pytesseract
            logging.info("Fallback OCR from 'image' field")
            image_file = request.files['image']
            image = Image.open(image_file.stream)
            text_input = pytesseract.image_to_string(image)
            logging.info(f"Fallback OCR text result: {text_input}")

        # Parse text input: prefer a simple comma/semicolon split; fall back
        # to NER extraction when the text has no separators.
        if isinstance(text_input, str):
            manual_split = [i.strip() for i in re.split(r',|;', text_input) if i.strip()]
            parsed = manual_split or extract_ingredients(text_input)
            logging.info(f"Parsed ingredients from string input: {parsed}")
            ingredients_input.extend(parsed)
        elif isinstance(text_input, list):
            cleaned_list = [i.strip().lower() for i in text_input if i.strip()]
            logging.info(f"Parsed ingredients from list input: {cleaned_list}")
            ingredients_input.extend(cleaned_list)

        if not ingredients_input:
            logging.warning("No ingredients recognized after processing input.")
            return jsonify({"error": "No ingredients recognized."}), 400

        # Deduplicate while preserving first-seen order (list(set(...)) made
        # the response ordering nondeterministic between identical requests).
        ingredients_input = list(dict.fromkeys(ingredients_input))
        logging.info(f"Unique ingredients before spell check: {ingredients_input}")
        corrected = [correct_spelling(ing, df_cosing) for ing in ingredients_input]
        logging.info(f"Corrected ingredients: {corrected}")

        # Predict individual ingredient effects concurrently.
        logging.info("Predicting individual ingredient effects")
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(lambda ing: predict_with_description(ing, df_cosing), corrected))
        logging.info(f"Prediction results: {results}")

        # Recommend products
        logging.info("Generating product recommendations")
        recommendations = recommend_similar_products(corrected, df_brand, product_embeddings)
        logging.info(f"Product recommendations: {recommendations}")

        # Predict combined after-use effects.
        # BUG FIX: predict_after_use_with_probs returns a (labels, probs) tuple;
        # the original code passed the whole tuple as "skor" and additionally
        # called predict_after_use, running the model twice for the same input.
        logging.info("Predicting combined after-use effects")
        combined_ingredients_str = " ".join(corrected)
        after_use_predictions, after_use_probs = predict_after_use_with_probs(combined_ingredients_str)
        after_use_sentence = generate_afteruse_sentence_en(after_use_predictions)
        logging.info(f"After-use predictions: {after_use_predictions}")

        return jsonify({
            "Ingredient Analysis": results,
            "Product Recommendations": recommendations,
            "Predicted After Use Effects": {
                "labels": after_use_predictions,
                "description": after_use_sentence,
                "skor": after_use_probs
            }
        })

    except Exception as e:
        logging.exception(f"Error in analyze_ingredients: {e}")
        return jsonify({"error": str(e)}), 500
app/utils/__pycache__/helper.cpython-39.pyc
ADDED
|
Binary file (1.04 kB). View file
|
|
|
app/utils/__pycache__/ner.cpython-39.pyc
ADDED
|
Binary file (606 Bytes). View file
|
|
|
app/utils/__pycache__/ocr.cpython-39.pyc
ADDED
|
Binary file (1.05 kB). View file
|
|
|
app/utils/__pycache__/predict_afteruse.cpython-39.pyc
ADDED
|
Binary file (3.11 kB). View file
|
|
|
app/utils/__pycache__/prediction.cpython-39.pyc
ADDED
|
Binary file (1.64 kB). View file
|
|
|
app/utils/__pycache__/recommendation.cpython-39.pyc
ADDED
|
Binary file (922 Bytes). View file
|
|
|
app/utils/clean.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""One-off data-prep script: build the BERT training CSV from the COSING table."""
from pathlib import Path

import pandas as pd

# Resolve paths relative to this file instead of the current working
# directory (the original "../../data/..." only worked from one CWD);
# this matches the convention used by app/utils/helper.load_data().
DATA_DIR = Path(__file__).resolve().parent.parent / "data"


def main():
    """Binarize the risk labels and write the training data CSV."""
    df = pd.read_csv(DATA_DIR / "COSING_Cleaned_Normalized_v7(1).csv")

    # Turn the label into binary: 1 = low risk, 0 = anything else.
    df['Label'] = df['Risk Level'].apply(lambda x: 1 if str(x).lower() == 'low risk' else 0)

    # Drop rows without a description (the description is the BERT input text).
    df = df.dropna(subset=['Description'])

    df[['Description', 'Label']].to_csv(DATA_DIR / "bert_training_data.csv", index=False)


if __name__ == "__main__":
    main()
app/utils/helper.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import torch
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
def correct_spelling(ingredient, df):
    """Fuzzy-correct *ingredient* against the COSING 'INCI name' column.

    Returns the closest lowercase INCI name when a match scores at least
    0.7 similarity; otherwise returns the input unchanged.
    """
    from difflib import get_close_matches
    candidates = df['INCI name'].dropna().str.lower().tolist()
    hits = get_close_matches(ingredient.lower(), candidates, n=1, cutoff=0.7)
    if hits:
        return hits[0]
    return ingredient
| 10 |
+
|
| 11 |
+
def load_data():
    """Load the COSING reference table, brand catalogue, and product embeddings.

    Paths are resolved relative to this file, so the app works regardless
    of the current working directory. Returns (df_cosing, df_brand,
    product_embeddings).
    """
    app_root = Path(__file__).resolve().parent.parent
    data_dir = app_root / "data"

    cosing = pd.read_csv(data_dir / "COSING_Cleaned_Normalized_v7(1).csv")
    brands = pd.read_csv(data_dir / "brend_cleaned.csv")
    # Combined name + ingredient text used for product similarity search
    # ('ingridients' is the CSV's own spelling of the column).
    brands['text'] = brands['name'].astype(str) + " " + brands['ingridients'].astype(str)

    embeddings = torch.load(app_root / "product_embeddings.pt", map_location='cpu')
    return cosing, brands, embeddings
app/utils/ner.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers import pipeline

# General-purpose English NER model; ingredient-like tokens tend to surface
# as MISC entities. aggregation_strategy="simple" is the current API for the
# deprecated grouped_entities=True flag and groups sub-word tokens the same way.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")


def extract_ingredients(text):
    """Extract candidate ingredient names from free text via NER.

    Keeps MISC entities that are purely alphabetic and longer than two
    characters; returns a deduplicated lowercase list (order not guaranteed).
    """
    entities = ner_pipeline(text)
    ingredients = []
    for ent in entities:
        word = ent['word'].lower().strip()
        if ent['entity_group'] == 'MISC' and word.isalpha() and len(word) > 2:
            ingredients.append(word)
    return list(set(ingredients))
app/utils/ocr.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np
from PIL import Image
from io import BytesIO
import re
import logging
import easyocr

# Shared EasyOCR reader (English, CPU-only); constructing it is expensive,
# so it is created once at import time.
ocr_reader = easyocr.Reader(['en'], gpu=False)

# Header words commonly preceding ingredient lists, stripped from OCR output.
_LABEL_WORDS = re.compile(r'\b(Ingredients|Komposisi|Composition|Bahan|Daftar Bahan)\b', re.IGNORECASE)


def extract_text_from_image(image_file):
    """OCR an uploaded image file and return cleaned ingredient text.

    Strips list-header words and any character that is not a letter or
    list punctuation. Returns "" when OCR fails for any reason.
    """
    try:
        pil_image = Image.open(BytesIO(image_file.read())).convert("RGB")
        detections = ocr_reader.readtext(np.array(pil_image))
        raw_text = " ".join(detection[1] for detection in detections)
        without_labels = _LABEL_WORDS.sub('', raw_text)
        return re.sub(r'[^A-Za-z,.\s-]', '', without_labels).strip()
    except Exception as exc:
        logging.error(f"OCR error: {exc}")
        return ""
app/utils/predict_afteruse.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import os

HF_TOKEN = os.getenv("HF_TOKEN")

# Load the fine-tuned multi-label model and tokenizer once, at import time.
tokenizer = BertTokenizer.from_pretrained("Maulidaaa/bert-safe-multilabel", token=HF_TOKEN)
model = BertForSequenceClassification.from_pretrained("Maulidaaa/bert-safe-multilabel", token=HF_TOKEN)

model.eval()

# After-use effect labels (must match the order used during training).
afteruse_labels = [
    "acne fighting", "acne trigger", "anti aging", "brightening", "moisturizing",
    "redness reducing", "skin texture", "soothing", "unknown", "whitening"
]

# English sentence fragments used to verbalize each predicted label.
afteruse_descriptions_en = {
    "acne fighting": "helps fight acne",
    "acne trigger": "may trigger acne",
    "anti aging": "reduces signs of aging",
    "brightening": "brightens the skin",
    "moisturizing": "moisturizes the skin",
    "redness reducing": "reduces redness",
    "skin texture": "improves skin texture",
    "soothing": "soothes the skin",
    "unknown": "has unknown effects",
    "whitening": "whitens the skin"
}


def _predict_probs(input_ingredients):
    """Run the model on an ingredient string; return per-label sigmoid probs.

    Shared by predict_after_use and predict_after_use_with_probs, which
    previously duplicated this whole inference path. Always returns a 1-D
    numpy array (a scalar output is promoted to length 1).
    """
    inputs = tokenizer(input_ingredients, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.sigmoid(logits).squeeze().cpu().numpy()
    if probs.ndim == 0:
        probs = np.atleast_1d(probs)
    # Guard against a mismatch between model head size and the label list.
    logging.debug("After-use model outputs: %d values for %d labels", len(probs), len(afteruse_labels))
    return probs


def predict_after_use(input_ingredients):
    """Return the after-use labels whose probability exceeds 0.5 ([] for empty input)."""
    if not input_ingredients:
        return []
    probs = _predict_probs(input_ingredients)
    limit = min(len(probs), len(afteruse_labels))
    return [afteruse_labels[i] for i in range(limit) if probs[i] > 0.5]


def generate_afteruse_sentence_en(predicted_labels):
    """Render predicted labels as a single readable English sentence."""
    if not predicted_labels:
        return "No effects were detected based on the provided ingredients."

    descriptions = [afteruse_descriptions_en.get(label, label) for label in predicted_labels]

    if len(descriptions) == 1:
        return f"This product {descriptions[0]}."
    elif len(descriptions) == 2:
        return f"This product {descriptions[0]} and {descriptions[1]}."
    else:
        return f"This product {', '.join(descriptions[:-1])}, and {descriptions[-1]}."


def predict_after_use_with_probs(input_ingredients):
    """Return (predicted_labels, {label: probability}) for an ingredient string.

    Labels are those with probability > 0.5; the probability dict covers
    every label the model produced an output for.
    """
    if not input_ingredients:
        return [], []

    probs = _predict_probs(input_ingredients)
    limit = min(len(probs), len(afteruse_labels))
    predicted_labels = [afteruse_labels[i] for i in range(limit) if probs[i] > 0.5]
    label_probs = {afteruse_labels[i]: float(probs[i]) for i in range(limit)}

    return predicted_labels, label_probs
app/utils/prediction.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
from collections import OrderedDict
from transformers import BertTokenizer, BertForSequenceClassification
import os

HF_TOKEN = os.getenv("HF_TOKEN")

# Binary safety classifier, loaded once at import time.
tokenizer = BertTokenizer.from_pretrained("Maulidaaa/bert-safe-model", token=HF_TOKEN)
model = BertForSequenceClassification.from_pretrained("Maulidaaa/bert-safe-model", token=HF_TOKEN)


def predict(desc):
    """Classify an ingredient description as "Safe" or "Not Safe".

    Empty/missing descriptions are treated as "Not Safe".
    """
    if not desc:
        return "Not Safe"
    encoded = tokenizer(desc, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return "Safe" if predicted_class == 1 else "Not Safe"
| 21 |
+
|
| 22 |
+
def predict_with_description(ingredient, df):
    """Look up *ingredient* in the COSING table and return its details with a safety prediction.

    Matches case-insensitively against either the 'INCI name' or
    'IUPAC Name' column; unknown ingredients fall back to placeholder
    values. Returns an OrderedDict so the JSON response keeps field order.
    """
    df_match = df.copy()
    df_match['INCI name_lower'] = df_match['INCI name'].str.lower()
    df_match['IUPAC Name_lower'] = df_match['IUPAC Name'].str.lower()

    ingredient_lower = ingredient.lower()
    match_row = df_match[(df_match['INCI name_lower'] == ingredient_lower) |
                         (df_match['IUPAC Name_lower'] == ingredient_lower)]

    if not match_row.empty:
        row = match_row.iloc[0]
        inci_name = row['INCI name'].title()
        desc = row.get('Description', '')
        func = row.get('Function', '')
        restriction = row.get('Restriction')
        risk_lvl = row.get('Risk Level', '')
        risk_desc = row.get('Risk Description', '')
    else:
        inci_name = ingredient.title()
        desc = "Description not found"
        func = "Function not found"
        restriction = "Restriction not found"
        risk_lvl = "Unknown"
        risk_desc = "Risk info not available"

    result = predict(desc)

    # BUG FIX: the original contained a set literal {"code Restriction ", Restriction}
    # with no trailing comma, so Python parsed the next line as a *call* on that
    # set, raising TypeError every time this function ran. It is now a proper
    # ("Restriction", ...) key/value pair.
    return OrderedDict([
        ("Ingredient Name", inci_name),
        ("Description", desc),
        ("Function", func),
        ("Risk Level", risk_lvl),
        ("Restriction", restriction),
        ("Risk Description", risk_desc),
        ("Prediction", result)
    ])
app/utils/recommendation.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model shared by all requests; loaded once at import time.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def recommend_similar_products(input_ingredients, df_brand, product_embeddings, top_n=3):
    """Return up to *top_n* catalogue products most similar to the ingredient list.

    Encodes the joined ingredient string, ranks products by cosine
    similarity against the precomputed embeddings, and returns the top
    matches as plain dicts ready for JSON serialization.
    """
    query_text = " ".join(input_ingredients)
    query_vector = embedding_model.encode(query_text, convert_to_tensor=True)
    similarities = util.cos_sim(query_vector, product_embeddings)[0]
    best = similarities.topk(k=min(top_n, len(similarities)))

    matches = []
    for index in best.indices:
        product = df_brand.iloc[int(index)]
        matches.append({
            "brand": product.get('brand', ''),
            "name": product.get('name', ''),
            "type": product.get('type', ''),
            # 'ingridients' / 'afterUse' are the catalogue CSV's own column names
            "ingredients": product.get('ingridients', ''),
            "description": product.get('afterUse', ''),
        })
    return matches