# skincare/app/utils/prediction.py
# Last change by Maulidaaa: "Update app/utils/prediction.py" (commit d9e4ce9, verified)
import torch
from collections import OrderedDict
from transformers import BertTokenizer, BertForSequenceClassification
import os
from deep_translator import GoogleTranslator # Import GoogleTranslator for translation
# Load glossary
def load_glossary(file_path):
    """Load a term glossary from a ``key=value`` text file.

    Each non-blank line is split on its first ``=``. The key is stripped and
    lower-cased; the value is stripped. Lines without ``=`` or with an empty
    key are skipped so that one malformed line does not discard the entries
    that follow it.

    Args:
        file_path: Path of the UTF-8 encoded glossary file.

    Returns:
        A dict mapping lower-cased keys to values. If the file cannot be
        opened or read, the error is printed and the entries collected so
        far (possibly none) are returned.
    """
    glossary = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Split on the first "=" only, so a value may itself contain "=".
                key, separator, value = line.partition('=')
                key = key.strip().lower()
                # Skip blank lines, lines without "=", and empty keys. An empty
                # key would later match between every character in str.replace.
                if not separator or not key:
                    continue
                glossary[key] = value.strip()
    except Exception as e:
        # Deliberately best-effort: this runs at import time, so an unreadable
        # glossary must not prevent the module from loading.
        print(f"Error loading glossary: {e}")
    return glossary
# Load the pre-trained model and tokenizer
# Access token for the model download, taken from the environment
# (None when HF_TOKEN is not set).
HF_TOKEN = os.environ.get("HF_TOKEN")

# The tokenizer and the classifier are both loaded from the same model id.
_MODEL_ID = "Maulidaaa/bert-safe-model"
tokenizer = BertTokenizer.from_pretrained(
    _MODEL_ID,
    use_auth_token=HF_TOKEN,
)
model = BertForSequenceClassification.from_pretrained(
    _MODEL_ID,
    use_auth_token=HF_TOKEN,
)

# Glossary used by translate_with_glossary; read once at import time from a
# path that is relative to the current working directory.
glossary = load_glossary('glossary.txt')
# Translate function using Google Translator and glossary
def translate_with_glossary(text, target_lang='id'):
    """Translate *text* and then apply the module-level glossary.

    The text is translated with ``GoogleTranslator`` (source language
    auto-detected). Every glossary key found in the translated text is then
    replaced by its glossary value, and the result is returned with only its
    first character upper-cased.

    Args:
        text: Text to translate. Non-string values (for example ``None`` or
            a NaN taken from a DataFrame cell) yield an empty string.
        target_lang: Target language code passed to the translator.

    Returns:
        The translated, glossary-adjusted, capitalized string. If translation
        fails or yields no string, the original text is returned capitalized
        and the glossary is not applied.
    """
    if not isinstance(text, str):
        # Nothing translatable; also keeps the fallback below from failing on
        # text.capitalize().
        return ""
    try:
        # Translate the text first
        translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
        if not isinstance(translated, str):
            return text.capitalize()
        # capitalize() lower-cases everything after the first character anyway,
        # so lower-casing up front only makes glossary matching case-insensitive
        # (glossary keys are stored lower-cased).
        translated = translated.lower()
        # Replace terms based on glossary
        for en_term, id_term in glossary.items():
            if not en_term:
                # An empty search string would insert the replacement between
                # every character.
                continue
            translated = translated.replace(en_term.lower(), id_term.lower())
        return translated.capitalize()
    except Exception as e:
        # Best-effort: translation needs an external service, so any failure
        # falls back to the untranslated text.
        print(f"Error during translation: {e}")
        return text.capitalize()
# Prediction function
def predict(desc):
    """Classify an ingredient description as "Safe" or "Not Safe".

    Args:
        desc: Description text. It is tokenized with truncation at 512 tokens
            and passed to the module-level classifier.

    Returns:
        "Safe" when the highest-scoring class index is 1, otherwise
        "Not Safe". A missing description (``None``, a non-string such as NaN,
        or an empty / whitespace-only string) returns "Not Safe" without
        running the model.
    """
    # Treat every form of "no usable text" alike; the tokenizer only accepts
    # strings, and blank text carries nothing to classify.
    if not isinstance(desc, str) or not desc.strip():
        return "Not Safe"
    inputs = tokenizer(desc, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only, so gradients are not needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=1).item()
    return "Safe" if pred == 1 else "Not Safe"
# Function to predict with ingredient description, translate terms, and return results
def _cell_or_default(row, column, default):
    """Return ``row[column]``, or *default* if the column is absent or the cell is None/NaN/NA."""
    # Local import: this module does not import pandas at the top level.
    import pandas as pd

    value = row.get(column, default)
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


def predict_with_description(ingredient, df, target_lang='id'):
    """Look up an ingredient, classify its description and translate the texts.

    The ingredient is matched case-insensitively (ignoring surrounding
    whitespace) against the 'INCI name' and 'IUPAC Name' columns of *df*; the
    first matching row is used. *df* is not modified.

    Args:
        ingredient: Ingredient name to look up.
        df: DataFrame with 'INCI name' and 'IUPAC Name' columns and,
            optionally, 'Description', 'Function', 'Restriction',
            'Risk Level' and 'Risk Description'.
        target_lang: Language code used for translating the Description,
            Function and Risk Description fields.

    Returns:
        An OrderedDict with the keys "Ingredient Name", "Description",
        "Function", "Restriction", "Risk Level", "Risk Description" and
        "Prediction". Absent or empty cells are replaced by placeholder
        texts. When no real description exists, the prediction follows
        ``predict``'s rule for a missing description instead of classifying
        the placeholder text.
    """
    query = "" if ingredient is None else str(ingredient).strip()
    needle = query.lower()

    # Compare against normalized views of the columns rather than copying the
    # whole frame and adding helper columns on every call.
    inci_lower = df['INCI name'].str.strip().str.lower()
    iupac_lower = df['IUPAC Name'].str.strip().str.lower()
    matches = df[(inci_lower == needle) | (iupac_lower == needle)]

    if not matches.empty:
        row = matches.iloc[0]
        # A row matched through its IUPAC name may have no INCI name.
        inci_name = str(_cell_or_default(row, 'INCI name', query)).capitalize()
        raw_desc = _cell_or_default(row, 'Description', None)
        desc = 'Description not available' if raw_desc is None else str(raw_desc)
        func = str(_cell_or_default(row, 'Function', 'Function not available')).capitalize()
        restriction = _cell_or_default(row, 'Restriction', 'None')
        risk_lvl = _cell_or_default(row, 'Risk Level', 'Unknown')
        risk_desc = str(_cell_or_default(row, 'Risk Description', 'Risk info not available'))
    else:
        inci_name = query.title()
        raw_desc = None
        desc = "Description not found"
        func = "Function not found"
        restriction = "Restriction not found"
        risk_lvl = "Unknown"
        risk_desc = "Risk info not available"

    # Classify only real description text; an empty string makes predict()
    # apply its own missing-description result.
    result = predict("" if raw_desc is None else str(raw_desc))

    # Translate only Description, Function, and Risk Description using glossary and Google Translator
    translated_desc = translate_with_glossary(desc, target_lang)
    translated_risk_desc = translate_with_glossary(risk_desc, target_lang)
    translated_function = translate_with_glossary(func, target_lang)

    return OrderedDict([
        ("Ingredient Name", inci_name),
        ("Description", translated_desc),
        ("Function", translated_function),
        ("Restriction", restriction),
        ("Risk Level", risk_lvl),
        ("Risk Description", translated_risk_desc),
        ("Prediction", result),
    ])