Maulidaaa committed on
Commit
030432c
·
verified ·
1 Parent(s): 54b9f12

Upload 21 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ app/data/brend_cleaned.csv filter=lfs diff=lfs merge=lfs -text
app/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from flask import Flask

from app.routes import analyze_blueprint


def create_app():
    """Application factory: build the Flask app and register the
    /analyze blueprint. Returns the configured Flask instance."""
    flask_app = Flask(__name__)
    flask_app.register_blueprint(analyze_blueprint)
    return flask_app
app/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (399 Bytes). View file
 
app/__pycache__/routes.cpython-39.pyc ADDED
Binary file (3.86 kB). View file
 
app/data/COSING_Cleaned_Normalized_v7(1).csv ADDED
The diff for this file is too large to render. See raw diff
 
app/data/bert_training_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
app/data/brend_cleaned.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80f4cc1b0e837c5126c92a36dfc013a4fdec1b2dc5d861ebdb668d2c3b102d9b
3
+ size 11649428
app/product_embeddings.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5819991edf37061272f0359a8686954c7d985eeea309af75806a47a65fa40c62
3
+ size 26899667
app/routes.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from flask import Blueprint, request, jsonify
from concurrent.futures import ThreadPoolExecutor
from app.utils.ocr import extract_text_from_image
from app.utils.ner import extract_ingredients
from app.utils.prediction import predict_with_description
from app.utils.recommendation import recommend_similar_products
from app.utils.helper import correct_spelling, load_data
from app.utils.predict_afteruse import predict_after_use, generate_afteruse_sentence_en, predict_after_use_with_probs

analyze_blueprint = Blueprint('analyze', __name__)
logging.basicConfig(level=logging.INFO)

# Load data + model only once at import time; shared read-only across requests.
df_cosing, df_brand, product_embeddings = load_data()

@analyze_blueprint.route("/analyze", methods=["POST"])
def analyze_ingredients():
    """POST /analyze — analyze a cosmetic product's ingredient list.

    Accepted inputs (any combination):
      * file field 'ingredients' — image, OCR'd with EasyOCR then NER-parsed
      * JSON/form field 'ingredients' — comma/semicolon-separated string, or a list
      * file field 'image' — fallback OCR via pytesseract when no text was given

    Returns:
      200 with per-ingredient safety analysis, similar-product
      recommendations and predicted after-use effects;
      400 when no ingredient could be parsed from the request;
      500 on unexpected errors (message included in the payload).
    """
    try:
        logging.info("Start analyzing ingredients")
        ingredients_input = []

        # OCR from image (optional)
        if 'ingredients' in request.files:
            logging.info("Extracting ingredients from uploaded image using OCR")
            text = extract_text_from_image(request.files['ingredients'])
            logging.info(f"OCR text result: {text}")
            if text.strip():
                extracted = extract_ingredients(text)
                logging.info(f"Extracted ingredients from OCR: {extracted}")
                ingredients_input.extend(extracted)

        # From JSON or form
        data = request.get_json(silent=True) or {}
        text_input = data.get('ingredients') or request.form.get('ingredients')
        logging.info(f"Text input from JSON/form: {text_input}")

        # Fallback OCR from form field 'image' (pytesseract imported lazily
        # so the dependency is only needed when this path is exercised).
        if not text_input and 'image' in request.files:
            from PIL import Image
            import pytesseract
            logging.info("Fallback OCR from 'image' field")
            image_file = request.files['image']
            image = Image.open(image_file.stream)
            text_input = pytesseract.image_to_string(image)
            logging.info(f"Fallback OCR text result: {text_input}")

        # Parse text input: try a simple comma/semicolon split first, fall
        # back to NER extraction when the split yields nothing.
        if isinstance(text_input, str):
            import re
            manual_split = [i.strip() for i in re.split(r',|;', text_input) if i.strip()]
            parsed = manual_split or extract_ingredients(text_input)
            logging.info(f"Parsed ingredients from string input: {parsed}")
            ingredients_input.extend(parsed)
        elif isinstance(text_input, list):
            cleaned_list = [i.strip().lower() for i in text_input if i.strip()]
            logging.info(f"Parsed ingredients from list input: {cleaned_list}")
            ingredients_input.extend(cleaned_list)

        if not ingredients_input:
            logging.warning("No ingredients recognized after processing input.")
            return jsonify({"error": "No ingredients recognized."}), 400

        # Clean & deduplicate
        ingredients_input = list(set(ingredients_input))
        logging.info(f"Unique ingredients before spell check: {ingredients_input}")
        corrected = [correct_spelling(ing, df_cosing) for ing in ingredients_input]
        logging.info(f"Corrected ingredients: {corrected}")

        # Predict individual ingredient effects (fan out across a thread pool)
        logging.info("Predicting individual ingredient effects")
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(lambda ing: predict_with_description(ing, df_cosing), corrected))
        logging.info(f"Prediction results: {results}")

        # Recommend products
        logging.info("Generating product recommendations")
        recommendations = recommend_similar_products(corrected, df_brand, product_embeddings)
        logging.info(f"Product recommendations: {recommendations}")

        # Predict combined after-use effects.
        # BUG FIX: predict_after_use_with_probs returns a (labels, probs)
        # tuple; the original stored the whole tuple under "skor" and ran
        # the model a second time via predict_after_use just to get the
        # labels. Unpacking once yields the same labels, the intended
        # per-label probabilities, and a single inference pass.
        logging.info("Predicting combined after-use effects")
        combined_ingredients_str = " ".join(corrected)
        after_use_predictions, after_use_probs = predict_after_use_with_probs(combined_ingredients_str)
        after_use_sentence = generate_afteruse_sentence_en(after_use_predictions)
        logging.info(f"After-use predictions: {after_use_predictions}")

        return jsonify({
            "Ingredient Analysis": results,
            "Product Recommendations": recommendations,
            "Predicted After Use Effects": {
                "labels": after_use_predictions,
                "description": after_use_sentence,
                "skor": after_use_probs
            }
        })

    except Exception as e:
        logging.exception(f"Error in analyze_ingredients: {e}")
        return jsonify({"error": str(e)}), 500
app/utils/__pycache__/helper.cpython-39.pyc ADDED
Binary file (1.04 kB). View file
 
app/utils/__pycache__/ner.cpython-39.pyc ADDED
Binary file (606 Bytes). View file
 
app/utils/__pycache__/ocr.cpython-39.pyc ADDED
Binary file (1.05 kB). View file
 
app/utils/__pycache__/predict_afteruse.cpython-39.pyc ADDED
Binary file (3.11 kB). View file
 
app/utils/__pycache__/prediction.cpython-39.pyc ADDED
Binary file (1.64 kB). View file
 
app/utils/__pycache__/recommendation.cpython-39.pyc ADDED
Binary file (922 Bytes). View file
 
app/utils/clean.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd


def _binary_risk_label(risk):
    # Only the exact phrase "low risk" (case-insensitive) counts as safe;
    # NaN becomes the string "nan" and therefore maps to 0.
    return 1 if str(risk).lower() == 'low risk' else 0


# Build the BERT training set: description text + binary safety label.
df = pd.read_csv("../../data/COSING_Cleaned_Normalized_v7(1).csv")
df['Label'] = df['Risk Level'].apply(_binary_risk_label)

# Rows without a description carry no training signal — drop them.
df = df.dropna(subset=['Description'])

df[['Description', 'Label']].to_csv("bert_training_data.csv", index=False)
app/utils/helper.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ from pathlib import Path
4
+
5
def correct_spelling(ingredient, df):
    """Fuzzy-correct an ingredient name against the COSING 'INCI name' column.

    Returns the closest known INCI name (lowercased) when the difflib
    similarity is at least 0.7; otherwise returns the input unchanged.
    """
    from difflib import get_close_matches
    known_names = df['INCI name'].dropna().str.lower().tolist()
    candidates = get_close_matches(ingredient.lower(), known_names, n=1, cutoff=0.7)
    if candidates:
        return candidates[0]
    return ingredient
10
+
11
def load_data():
    """Load the COSING reference data, the brand catalogue and the
    precomputed product embeddings from app/data (called once at startup).

    Returns a (df_cosing, df_brand, product_embeddings) tuple; df_brand
    gains a 'text' column concatenating product name and ingredient list.
    """
    app_dir = Path(__file__).resolve().parent.parent
    data_dir = app_dir / "data"

    df_cosing = pd.read_csv(data_dir / "COSING_Cleaned_Normalized_v7(1).csv")
    df_brand = pd.read_csv(data_dir / "brend_cleaned.csv")
    # NOTE: 'ingridients' spelling matches the CSV's actual column name.
    df_brand['text'] = df_brand['name'].astype(str) + " " + df_brand['ingridients'].astype(str)

    product_embeddings = torch.load(app_dir / "product_embeddings.pt", map_location='cpu')

    return df_cosing, df_brand, product_embeddings
app/utils/ner.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import pipeline

# Shared NER pipeline (BERT-base fine-tuned for NER); grouped_entities merges
# word pieces into whole-entity spans. Built once at import — model download/
# load is expensive.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", grouped_entities=True)
4
+
5
def extract_ingredients(text):
    """Pull candidate ingredient words out of free text with the NER model.

    Keeps only MISC-tagged, purely alphabetic tokens longer than two
    characters; the result is lowercased and deduplicated (order not
    guaranteed).
    """
    found = set()
    for entity in ner_pipeline(text):
        token = entity['word'].lower().strip()
        if entity['entity_group'] == 'MISC' and token.isalpha() and len(token) > 2:
            found.add(token)
    return list(found)
app/utils/ocr.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
from PIL import Image
from io import BytesIO
import re
import logging
import easyocr

# Shared EasyOCR reader (English only, CPU). Constructed once at import time
# because reader initialization loads detection/recognition models.
ocr_reader = easyocr.Reader(['en'], gpu=False)
9
+
10
def extract_text_from_image(image_file):
    """OCR an uploaded image and return cleaned ingredient text.

    Strips common list headers ("Ingredients", "Komposisi", ...) and every
    character that is not a letter, comma, period, hyphen or whitespace.
    Returns an empty string when OCR fails for any reason.
    """
    try:
        pil_image = Image.open(BytesIO(image_file.read())).convert("RGB")
        ocr_results = ocr_reader.readtext(np.array(pil_image))
        joined = " ".join(fragment[1] for fragment in ocr_results)
        # Drop header words in several languages before character filtering.
        joined = re.sub(r'\b(Ingredients|Komposisi|Composition|Bahan|Daftar Bahan)\b', '', joined, flags=re.IGNORECASE)
        return re.sub(r'[^A-Za-z,.\s-]', '', joined).strip()
    except Exception as e:
        # Best-effort: log and degrade to "no text" rather than failing the request.
        logging.error(f"OCR error: {e}")
        return ""
app/utils/predict_afteruse.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import os

# Hugging Face access token for the model repo (may be None for public access).
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the pre-trained multilabel model and tokenizer once at import time.
tokenizer = BertTokenizer.from_pretrained("Maulidaaa/bert-safe-multilabel", token=HF_TOKEN)
model = BertForSequenceClassification.from_pretrained("Maulidaaa/bert-safe-multilabel", token=HF_TOKEN)

# Inference only — disable dropout/batch-norm training behavior.
model.eval()

# After-use effect labels; order must match the label order used at training time.
afteruse_labels = [
    "acne fighting", "acne trigger", "anti aging", "brightening", "moisturizing",
    "redness reducing", "skin texture", "soothing", "unknown", "whitening"
]

# English phrase for each label, used to build the human-readable sentence.
afteruse_descriptions_en = {
    "acne fighting": "helps fight acne",
    "acne trigger": "may trigger acne",
    "anti aging": "reduces signs of aging",
    "brightening": "brightens the skin",
    "moisturizing": "moisturizes the skin",
    "redness reducing": "reduces redness",
    "skin texture": "improves skin texture",
    "soothing": "soothes the skin",
    "unknown": "has unknown effects",
    "whitening": "whitens the skin"
}
32
+
33
def predict_after_use(input_ingredients):
    """Multi-label prediction of after-use effects for a combined
    ingredient string.

    Returns the subset of `afteruse_labels` whose sigmoid score exceeds
    0.5; an empty input yields an empty list.
    """
    if not input_ingredients:
        return []

    # Tokenize and run a single forward pass without gradients.
    encoded = tokenizer(input_ingredients, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**encoded)
        logits = outputs.logits
        probs = torch.sigmoid(logits)

    probs = probs.squeeze().cpu().numpy()
    if probs.ndim == 0:
        probs = [probs]  # scalar output -> wrap so len()/indexing work

    print(f"[DEBUG] Model output shape: {logits.shape}")
    print(f"[DEBUG] Jumlah output model: {len(probs)} | Jumlah label: {len(afteruse_labels)}")

    # Guard against a mismatch between model outputs and the label table.
    usable = min(len(probs), len(afteruse_labels))
    return [afteruse_labels[idx] for idx in range(usable) if probs[idx] > 0.5]
62
+
63
def generate_afteruse_sentence_en(predicted_labels):
    """Render predicted after-use labels as a natural English sentence.

    Unknown labels fall back to the label text itself; an empty list
    yields a fixed "no effects detected" sentence.
    """
    if not predicted_labels:
        return "No effects were detected based on the provided ingredients."

    phrases = [afteruse_descriptions_en.get(label, label) for label in predicted_labels]

    if len(phrases) == 1:
        return f"This product {phrases[0]}."
    if len(phrases) == 2:
        return f"This product {phrases[0]} and {phrases[1]}."
    # Oxford-comma join for three or more effects.
    head = ', '.join(phrases[:-1])
    return f"This product {head}, and {phrases[-1]}."
75
+
76
+
77
def predict_after_use_with_probs(input_ingredients):
    """Like predict_after_use, but also returns per-label probabilities.

    Returns a tuple (predicted_labels, label_probs): the labels whose
    sigmoid score exceeds 0.5, and a dict mapping every usable label to
    its probability. Empty input returns ([], {}).
    """
    if not input_ingredients:
        # BUG FIX: was `return [], []` — the probability slot is a dict in
        # the non-empty path, so return an empty dict for type consistency
        # (callers/JSON serialization see {} instead of []).
        return [], {}

    inputs = tokenizer(input_ingredients, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)

    probs = probs.squeeze().cpu().numpy()
    if probs.ndim == 0:
        probs = [probs]  # scalar output -> wrap so len()/indexing work

    # Guard against a mismatch between model outputs and the label table.
    min_len = min(len(probs), len(afteruse_labels))
    predicted_labels = [
        afteruse_labels[i]
        for i in range(min_len)
        if probs[i] > 0.5
    ]
    label_probs = {
        afteruse_labels[i]: float(probs[i])
        for i in range(min_len)
    }

    return predicted_labels, label_probs
app/utils/prediction.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from collections import OrderedDict
from transformers import BertTokenizer, BertForSequenceClassification
import os

# Hugging Face access token for the model repo (may be None for public access).
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the pre-trained binary safety classifier and tokenizer once at import time.
tokenizer = BertTokenizer.from_pretrained("Maulidaaa/bert-safe-model", token=HF_TOKEN)
model = BertForSequenceClassification.from_pretrained("Maulidaaa/bert-safe-model", token=HF_TOKEN)
11
+
12
def predict(desc):
    """Classify an ingredient description as "Safe" or "Not Safe".

    Empty/missing descriptions are conservatively labelled "Not Safe"
    without running the model.
    """
    if not desc:
        return "Not Safe"
    encoded = tokenizer(desc, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    label_idx = torch.argmax(logits, dim=1).item()
    # Class 1 was trained as the "safe" class.
    if label_idx == 1:
        return "Safe"
    return "Not Safe"
21
+
22
def predict_with_description(ingredient, df):
    """Look up an ingredient in the COSING dataframe and classify its safety.

    Matching is case-insensitive against both 'INCI name' and 'IUPAC Name'.
    When no row matches, placeholder texts are used and the classifier runs
    on the placeholder description.

    Returns an OrderedDict with the ingredient's metadata and the
    "Safe"/"Not Safe" prediction.
    """
    df_match = df.copy()
    df_match['INCI name_lower'] = df_match['INCI name'].str.lower()
    df_match['IUPAC Name_lower'] = df_match['IUPAC Name'].str.lower()

    ingredient_lower = ingredient.lower()
    match_row = df_match[
        (df_match['INCI name_lower'] == ingredient_lower)
        | (df_match['IUPAC Name_lower'] == ingredient_lower)
    ]

    if not match_row.empty:
        row = match_row.iloc[0]
        inci_name = row['INCI name'].title()
        desc = row.get('Description', '')
        func = row.get('Function', '')
        restriction = row.get('Restriction')
        risk_lvl = row.get('Risk Level', '')
        risk_desc = row.get('Risk Description', '')
    else:
        inci_name = ingredient.title()
        desc = "Description not found"
        func = "Function not found"
        restriction = "Restriction not found"
        risk_lvl = "Unknown"
        risk_desc = "Risk info not available"

    result = predict(desc)

    # BUG FIX: the original list contained `{"code Restriction ", Restriction}`
    # — a set literal with no trailing comma, which Python parsed as calling
    # the set with the following tuple, raising
    # TypeError: 'set' object is not callable on every invocation.
    # Replaced with a proper ("Restriction", value) key pair.
    return OrderedDict([
        ("Ingredient Name", inci_name),
        ("Description", desc),
        ("Function", func),
        ("Risk Level", risk_lvl),
        ("Restriction", restriction),
        ("Risk Description", risk_desc),
        ("Prediction", result)
    ])
app/utils/recommendation.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer, util

# Sentence embedding model used to embed the query ingredient list.
# NOTE(review): assumed to be the same model that produced the precomputed
# product_embeddings tensor — confirm, or similarity scores are meaningless.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
4
+
5
def recommend_similar_products(input_ingredients, df_brand, product_embeddings, top_n=3):
    """Return up to `top_n` catalogue products most similar to the
    ingredient list, by cosine similarity of sentence embeddings.

    Each recommendation is a dict with brand/name/type/ingredients/description
    fields pulled from the matching df_brand row.
    """
    query_text = " ".join(input_ingredients)
    query_vector = embedding_model.encode(query_text, convert_to_tensor=True)
    similarity = util.cos_sim(query_vector, product_embeddings)[0]
    best = similarity.topk(k=min(top_n, len(similarity)))

    recommendations = []
    for index in best.indices:
        product = df_brand.iloc[int(index)]
        recommendations.append({
            "brand": product.get('brand', ''),
            "name": product.get('name', ''),
            "type": product.get('type', ''),
            # 'ingridients' spelling matches the source CSV column name.
            "ingredients": product.get('ingridients', ''),
            "description": product.get('afterUse', ''),
        })
    return recommendations