Maulidaaa committed on
Commit
030432c
·
verified ·
1 Parent(s): 54b9f12

Upload 21 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ app/data/brend_cleaned.csv filter=lfs diff=lfs merge=lfs -text
app/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from flask import Flask

from app.routes import analyze_blueprint


def create_app():
    """Application factory: build the Flask app and register the
    /analyze blueprint. Returns the configured Flask instance."""
    flask_app = Flask(__name__)
    flask_app.register_blueprint(analyze_blueprint)
    return flask_app
app/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (399 Bytes). View file
 
app/__pycache__/routes.cpython-39.pyc ADDED
Binary file (3.86 kB). View file
 
app/data/COSING_Cleaned_Normalized_v7(1).csv ADDED
The diff for this file is too large to render. See raw diff
 
app/data/bert_training_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
app/data/brend_cleaned.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80f4cc1b0e837c5126c92a36dfc013a4fdec1b2dc5d861ebdb668d2c3b102d9b
3
+ size 11649428
app/product_embeddings.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5819991edf37061272f0359a8686954c7d985eeea309af75806a47a65fa40c62
3
+ size 26899667
app/routes.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from flask import Blueprint, request, jsonify
from concurrent.futures import ThreadPoolExecutor
from app.utils.ocr import extract_text_from_image
from app.utils.ner import extract_ingredients
from app.utils.prediction import predict_with_description
from app.utils.recommendation import recommend_similar_products
from app.utils.helper import correct_spelling, load_data
from app.utils.predict_afteruse import predict_after_use, generate_afteruse_sentence_en, predict_after_use_with_probs

analyze_blueprint = Blueprint('analyze', __name__)
logging.basicConfig(level=logging.INFO)

# Load data + model only once at import time; shared read-only across requests.
df_cosing, df_brand, product_embeddings = load_data()

@analyze_blueprint.route("/analyze", methods=["POST"])
def analyze_ingredients():
    """POST /analyze — analyze a cosmetic product's ingredient list.

    Accepted inputs (any combination):
      * file field 'ingredients' — image, OCR'd with EasyOCR then NER-parsed
      * JSON/form field 'ingredients' — comma/semicolon-separated string, or a list
      * file field 'image' — fallback OCR via pytesseract when no text was given

    Returns:
      200 with per-ingredient safety analysis, similar-product
      recommendations and predicted after-use effects;
      400 when no ingredient could be parsed from the request;
      500 on unexpected errors (message included in the payload).
    """
    try:
        logging.info("Start analyzing ingredients")
        ingredients_input = []

        # OCR from image (optional)
        if 'ingredients' in request.files:
            logging.info("Extracting ingredients from uploaded image using OCR")
            text = extract_text_from_image(request.files['ingredients'])
            logging.info(f"OCR text result: {text}")
            if text.strip():
                extracted = extract_ingredients(text)
                logging.info(f"Extracted ingredients from OCR: {extracted}")
                ingredients_input.extend(extracted)

        # From JSON or form
        data = request.get_json(silent=True) or {}
        text_input = data.get('ingredients') or request.form.get('ingredients')
        logging.info(f"Text input from JSON/form: {text_input}")

        # Fallback OCR from form field 'image' (pytesseract imported lazily
        # so the dependency is only needed when this path is exercised).
        if not text_input and 'image' in request.files:
            from PIL import Image
            import pytesseract
            logging.info("Fallback OCR from 'image' field")
            image_file = request.files['image']
            image = Image.open(image_file.stream)
            text_input = pytesseract.image_to_string(image)
            logging.info(f"Fallback OCR text result: {text_input}")

        # Parse text input: try a simple comma/semicolon split first, fall
        # back to NER extraction when the split yields nothing.
        if isinstance(text_input, str):
            import re
            manual_split = [i.strip() for i in re.split(r',|;', text_input) if i.strip()]
            parsed = manual_split or extract_ingredients(text_input)
            logging.info(f"Parsed ingredients from string input: {parsed}")
            ingredients_input.extend(parsed)
        elif isinstance(text_input, list):
            cleaned_list = [i.strip().lower() for i in text_input if i.strip()]
            logging.info(f"Parsed ingredients from list input: {cleaned_list}")
            ingredients_input.extend(cleaned_list)

        if not ingredients_input:
            logging.warning("No ingredients recognized after processing input.")
            return jsonify({"error": "No ingredients recognized."}), 400

        # Clean & deduplicate
        ingredients_input = list(set(ingredients_input))
        logging.info(f"Unique ingredients before spell check: {ingredients_input}")
        corrected = [correct_spelling(ing, df_cosing) for ing in ingredients_input]
        logging.info(f"Corrected ingredients: {corrected}")

        # Predict individual ingredient effects (fan out across a thread pool)
        logging.info("Predicting individual ingredient effects")
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(lambda ing: predict_with_description(ing, df_cosing), corrected))
        logging.info(f"Prediction results: {results}")

        # Recommend products
        logging.info("Generating product recommendations")
        recommendations = recommend_similar_products(corrected, df_brand, product_embeddings)
        logging.info(f"Product recommendations: {recommendations}")

        # Predict combined after-use effects.
        # BUG FIX: predict_after_use_with_probs returns a (labels, probs)
        # tuple; the original stored the whole tuple under "skor" and ran
        # the model a second time via predict_after_use just to get the
        # labels. Unpacking once yields the same labels, the intended
        # per-label probabilities, and a single inference pass.
        logging.info("Predicting combined after-use effects")
        combined_ingredients_str = " ".join(corrected)
        after_use_predictions, after_use_probs = predict_after_use_with_probs(combined_ingredients_str)
        after_use_sentence = generate_afteruse_sentence_en(after_use_predictions)
        logging.info(f"After-use predictions: {after_use_predictions}")

        return jsonify({
            "Ingredient Analysis": results,
            "Product Recommendations": recommendations,
            "Predicted After Use Effects": {
                "labels": after_use_predictions,
                "description": after_use_sentence,
                "skor": after_use_probs
            }
        })

    except Exception as e:
        logging.exception(f"Error in analyze_ingredients: {e}")
        return jsonify({"error": str(e)}), 500
app/utils/__pycache__/helper.cpython-39.pyc ADDED
Binary file (1.04 kB). View file
 
app/utils/__pycache__/ner.cpython-39.pyc ADDED
Binary file (606 Bytes). View file
 
app/utils/__pycache__/ocr.cpython-39.pyc ADDED
Binary file (1.05 kB). View file
 
app/utils/__pycache__/predict_afteruse.cpython-39.pyc ADDED
Binary file (3.11 kB). View file
 
app/utils/__pycache__/prediction.cpython-39.pyc ADDED
Binary file (1.64 kB). View file
 
app/utils/__pycache__/recommendation.cpython-39.pyc ADDED
Binary file (922 Bytes). View file
 
app/utils/clean.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd


def _binary_risk_label(risk):
    # Only the exact phrase "low risk" (case-insensitive) counts as safe;
    # NaN becomes the string "nan" and therefore maps to 0.
    return 1 if str(risk).lower() == 'low risk' else 0


# Build the BERT training set: description text + binary safety label.
df = pd.read_csv("../../data/COSING_Cleaned_Normalized_v7(1).csv")
df['Label'] = df['Risk Level'].apply(_binary_risk_label)

# Rows without a description carry no training signal — drop them.
df = df.dropna(subset=['Description'])

df[['Description', 'Label']].to_csv("bert_training_data.csv", index=False)
app/utils/helper.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ from pathlib import Path
4
+
5
def correct_spelling(ingredient, df):
    """Fuzzy-correct an ingredient name against the COSING 'INCI name' column.

    Returns the closest known INCI name (lowercased) when the difflib
    similarity is at least 0.7; otherwise returns the input unchanged.
    """
    from difflib import get_close_matches
    known_names = df['INCI name'].dropna().str.lower().tolist()
    candidates = get_close_matches(ingredient.lower(), known_names, n=1, cutoff=0.7)
    if candidates:
        return candidates[0]
    return ingredient
10
+
11
def load_data():
    """Load the COSING reference data, the brand catalogue and the
    precomputed product embeddings from app/data (called once at startup).

    Returns a (df_cosing, df_brand, product_embeddings) tuple; df_brand
    gains a 'text' column concatenating product name and ingredient list.
    """
    app_dir = Path(__file__).resolve().parent.parent
    data_dir = app_dir / "data"

    df_cosing = pd.read_csv(data_dir / "COSING_Cleaned_Normalized_v7(1).csv")
    df_brand = pd.read_csv(data_dir / "brend_cleaned.csv")
    # NOTE: 'ingridients' spelling matches the CSV's actual column name.
    df_brand['text'] = df_brand['name'].astype(str) + " " + df_brand['ingridients'].astype(str)

    product_embeddings = torch.load(app_dir / "product_embeddings.pt", map_location='cpu')

    return df_cosing, df_brand, product_embeddings
app/utils/ner.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import pipeline

# Shared NER pipeline (BERT-base fine-tuned for NER); grouped_entities merges
# word pieces into whole-entity spans. Built once at import — model download/
# load is expensive.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", grouped_entities=True)
4
+
5
def extract_ingredients(text):
    """Pull candidate ingredient words out of free text with the NER model.

    Keeps only MISC-tagged, purely alphabetic tokens longer than two
    characters; the result is lowercased and deduplicated (order not
    guaranteed).
    """
    found = set()
    for entity in ner_pipeline(text):
        token = entity['word'].lower().strip()
        if entity['entity_group'] == 'MISC' and token.isalpha() and len(token) > 2:
            found.add(token)
    return list(found)
app/utils/ocr.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
from PIL import Image
from io import BytesIO
import re
import logging
import easyocr

# Shared EasyOCR reader (English only, CPU). Constructed once at import time
# because reader initialization loads detection/recognition models.
ocr_reader = easyocr.Reader(['en'], gpu=False)
9
+
10
def extract_text_from_image(image_file):
    """OCR an uploaded image and return cleaned ingredient text.

    Strips common list headers ("Ingredients", "Komposisi", ...) and every
    character that is not a letter, comma, period, hyphen or whitespace.
    Returns an empty string when OCR fails for any reason.
    """
    try:
        pil_image = Image.open(BytesIO(image_file.read())).convert("RGB")
        ocr_results = ocr_reader.readtext(np.array(pil_image))
        joined = " ".join(fragment[1] for fragment in ocr_results)
        # Drop header words in several languages before character filtering.
        joined = re.sub(r'\b(Ingredients|Komposisi|Composition|Bahan|Daftar Bahan)\b', '', joined, flags=re.IGNORECASE)
        return re.sub(r'[^A-Za-z,.\s-]', '', joined).strip()
    except Exception as e:
        # Best-effort: log and degrade to "no text" rather than failing the request.
        logging.error(f"OCR error: {e}")
        return ""
app/utils/predict_afteruse.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import os

# Hugging Face access token for the model repo (may be None for public access).
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the pre-trained multilabel model and tokenizer once at import time.
tokenizer = BertTokenizer.from_pretrained("Maulidaaa/bert-safe-multilabel", token=HF_TOKEN)
model = BertForSequenceClassification.from_pretrained("Maulidaaa/bert-safe-multilabel", token=HF_TOKEN)

# Inference only — disable dropout/batch-norm training behavior.
model.eval()

# After-use effect labels; order must match the label order used at training time.
afteruse_labels = [
    "acne fighting", "acne trigger", "anti aging", "brightening", "moisturizing",
    "redness reducing", "skin texture", "soothing", "unknown", "whitening"
]

# English phrase for each label, used to build the human-readable sentence.
afteruse_descriptions_en = {
    "acne fighting": "helps fight acne",
    "acne trigger": "may trigger acne",
    "anti aging": "reduces signs of aging",
    "brightening": "brightens the skin",
    "moisturizing": "moisturizes the skin",
    "redness reducing": "reduces redness",
    "skin texture": "improves skin texture",
    "soothing": "soothes the skin",
    "unknown": "has unknown effects",
    "whitening": "whitens the skin"
}
32
+
33
def predict_after_use(input_ingredients):
    """Multi-label prediction of after-use effects for a combined
    ingredient string.

    Returns the subset of `afteruse_labels` whose sigmoid score exceeds
    0.5; an empty input yields an empty list.
    """
    if not input_ingredients:
        return []

    # Tokenize and run a single forward pass without gradients.
    encoded = tokenizer(input_ingredients, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**encoded)
        logits = outputs.logits
        probs = torch.sigmoid(logits)

    probs = probs.squeeze().cpu().numpy()
    if probs.ndim == 0:
        probs = [probs]  # scalar output -> wrap so len()/indexing work

    print(f"[DEBUG] Model output shape: {logits.shape}")
    print(f"[DEBUG] Jumlah output model: {len(probs)} | Jumlah label: {len(afteruse_labels)}")

    # Guard against a mismatch between model outputs and the label table.
    usable = min(len(probs), len(afteruse_labels))
    return [afteruse_labels[idx] for idx in range(usable) if probs[idx] > 0.5]
62
+
63
def generate_afteruse_sentence_en(predicted_labels):
    """Render predicted after-use labels as a natural English sentence.

    Unknown labels fall back to the label text itself; an empty list
    yields a fixed "no effects detected" sentence.
    """
    if not predicted_labels:
        return "No effects were detected based on the provided ingredients."

    phrases = [afteruse_descriptions_en.get(label, label) for label in predicted_labels]

    if len(phrases) == 1:
        return f"This product {phrases[0]}."
    if len(phrases) == 2:
        return f"This product {phrases[0]} and {phrases[1]}."
    # Oxford-comma join for three or more effects.
    head = ', '.join(phrases[:-1])
    return f"This product {head}, and {phrases[-1]}."
75
+
76
+
77
def predict_after_use_with_probs(input_ingredients):
    """Like predict_after_use, but also returns per-label probabilities.

    Returns a tuple (predicted_labels, label_probs): the labels whose
    sigmoid score exceeds 0.5, and a dict mapping every usable label to
    its probability. Empty input returns ([], {}).
    """
    if not input_ingredients:
        # BUG FIX: was `return [], []` — the probability slot is a dict in
        # the non-empty path, so return an empty dict for type consistency
        # (callers/JSON serialization see {} instead of []).
        return [], {}

    inputs = tokenizer(input_ingredients, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)

    probs = probs.squeeze().cpu().numpy()
    if probs.ndim == 0:
        probs = [probs]  # scalar output -> wrap so len()/indexing work

    # Guard against a mismatch between model outputs and the label table.
    min_len = min(len(probs), len(afteruse_labels))
    predicted_labels = [
        afteruse_labels[i]
        for i in range(min_len)
        if probs[i] > 0.5
    ]
    label_probs = {
        afteruse_labels[i]: float(probs[i])
        for i in range(min_len)
    }

    return predicted_labels, label_probs
app/utils/prediction.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from collections import OrderedDict
from transformers import BertTokenizer, BertForSequenceClassification
import os

# Hugging Face access token for the model repo (may be None for public access).
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the pre-trained binary safety classifier and tokenizer once at import time.
tokenizer = BertTokenizer.from_pretrained("Maulidaaa/bert-safe-model", token=HF_TOKEN)
model = BertForSequenceClassification.from_pretrained("Maulidaaa/bert-safe-model", token=HF_TOKEN)
11
+
12
def predict(desc):
    """Classify an ingredient description as "Safe" or "Not Safe".

    Empty/missing descriptions are conservatively labelled "Not Safe"
    without running the model.
    """
    if not desc:
        return "Not Safe"
    encoded = tokenizer(desc, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    label_idx = torch.argmax(logits, dim=1).item()
    # Class 1 was trained as the "safe" class.
    if label_idx == 1:
        return "Safe"
    return "Not Safe"
21
+
22
def predict_with_description(ingredient, df):
    """Look up an ingredient in the COSING dataframe and classify its safety.

    Matching is case-insensitive against both 'INCI name' and 'IUPAC Name'.
    When no row matches, placeholder texts are used and the classifier runs
    on the placeholder description.

    Returns an OrderedDict with the ingredient's metadata and the
    "Safe"/"Not Safe" prediction.
    """
    df_match = df.copy()
    df_match['INCI name_lower'] = df_match['INCI name'].str.lower()
    df_match['IUPAC Name_lower'] = df_match['IUPAC Name'].str.lower()

    ingredient_lower = ingredient.lower()
    match_row = df_match[
        (df_match['INCI name_lower'] == ingredient_lower)
        | (df_match['IUPAC Name_lower'] == ingredient_lower)
    ]

    if not match_row.empty:
        row = match_row.iloc[0]
        inci_name = row['INCI name'].title()
        desc = row.get('Description', '')
        func = row.get('Function', '')
        restriction = row.get('Restriction')
        risk_lvl = row.get('Risk Level', '')
        risk_desc = row.get('Risk Description', '')
    else:
        inci_name = ingredient.title()
        desc = "Description not found"
        func = "Function not found"
        restriction = "Restriction not found"
        risk_lvl = "Unknown"
        risk_desc = "Risk info not available"

    result = predict(desc)

    # BUG FIX: the original list contained `{"code Restriction ", Restriction}`
    # — a set literal with no trailing comma, which Python parsed as calling
    # the set with the following tuple, raising
    # TypeError: 'set' object is not callable on every invocation.
    # Replaced with a proper ("Restriction", value) key pair.
    return OrderedDict([
        ("Ingredient Name", inci_name),
        ("Description", desc),
        ("Function", func),
        ("Risk Level", risk_lvl),
        ("Restriction", restriction),
        ("Risk Description", risk_desc),
        ("Prediction", result)
    ])
app/utils/recommendation.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer, util

# Sentence embedding model used to embed the query ingredient list.
# NOTE(review): assumed to be the same model that produced the precomputed
# product_embeddings tensor — confirm, or similarity scores are meaningless.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
4
+
5
def recommend_similar_products(input_ingredients, df_brand, product_embeddings, top_n=3):
    """Return up to `top_n` catalogue products most similar to the
    ingredient list, by cosine similarity of sentence embeddings.

    Each recommendation is a dict with brand/name/type/ingredients/description
    fields pulled from the matching df_brand row.
    """
    query_text = " ".join(input_ingredients)
    query_vector = embedding_model.encode(query_text, convert_to_tensor=True)
    similarity = util.cos_sim(query_vector, product_embeddings)[0]
    best = similarity.topk(k=min(top_n, len(similarity)))

    recommendations = []
    for index in best.indices:
        product = df_brand.iloc[int(index)]
        recommendations.append({
            "brand": product.get('brand', ''),
            "name": product.get('name', ''),
            "type": product.get('type', ''),
            # 'ingridients' spelling matches the source CSV column name.
            "ingredients": product.get('ingridients', ''),
            "description": product.get('afterUse', ''),
        })
    return recommendations