Update src/streamlit_app.py

src/streamlit_app.py  +35 -27
@@ -24,37 +24,40 @@ cache_dir = "/tmp/cache"
 os.makedirs(cache_dir, exist_ok=True)
 os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
 
-# 3. Image transform
+# 3. Image transform for ViT
 manual_transform = transforms.Compose([
     transforms.Resize(256),
     transforms.CenterCrop(224),
-    transforms.Lambda(lambda img: img.convert("RGB")),
+    transforms.Lambda(lambda img: img.convert("RGB")),  # ensure 3 channels
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
     transforms.ConvertImageDtype(torch.float32)
 ])
 
-# Sidebar
+# 4. Sidebar info
 st.sidebar.header("Models Used")
 st.sidebar.markdown("""
 - 🖼️ **Image Classifier**: `shingguy1/fine_tuned_vit`
 - 💬 **Text Generator**: `TinyLlama/TinyLlama-1.1B-Chat-v1.0`
 """)
 
-#
+# 5. Load models (cached)
 @st.cache_resource
 def load_models():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+    # ViT for classification
     model_vit = ViTForImageClassification.from_pretrained(
         "shingguy1/fine_tuned_vit",
         cache_dir=cache_dir,
         use_auth_token=hf_token
     ).to(device)
 
+    # TinyLlama for nutrition text
     tokenizer = AutoTokenizer.from_pretrained(
-        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        cache_dir=cache_dir
     )
     model_llm = AutoModelForCausalLM.from_pretrained(
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
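A note on the loading changes in this hunk: passing `cache_dir` to every `from_pretrained` call keeps all downloads under the writable `/tmp/cache` set up above. Recent `transformers` releases also deprecate `use_auth_token` in favor of `token`; a minimal sketch under that assumption (`hf_token` is defined earlier in the file, outside this diff):

```python
# Sketch only, not the committed code: same loading flow with the
# newer `token=` argument; `hf_token` is assumed to be defined above.
from transformers import AutoModelForCausalLM, AutoTokenizer, ViTForImageClassification

cache_dir = "/tmp/cache"
model_vit = ViTForImageClassification.from_pretrained(
    "shingguy1/fine_tuned_vit",
    cache_dir=cache_dir,
    token=hf_token,  # replaces the deprecated use_auth_token
)
tokenizer = AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0", cache_dir=cache_dir
)
model_llm = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0", cache_dir=cache_dir
)
```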
@@ -67,54 +70,59 @@ def load_models():
 
 model_vit, tokenizer, model_llm, device = load_models()
 
-#
+# 6. Image uploader
 uploaded_file = st.file_uploader("Upload a food image...", type=["jpg", "jpeg", "png"])
 
-if uploaded_file:
+if uploaded_file is not None:
     try:
+        # Load & display image
         image = Image.open(uploaded_file)
         st.image(image, caption="Uploaded Image", use_column_width=True)
 
-        #
-
+        # Classify with ViT
+        input_tensor = manual_transform(image).unsqueeze(0).to(device)
         with torch.no_grad():
-
-
-
-            st.success(f"🍴 Predicted Food: **{
+            outputs = model_vit(pixel_values=input_tensor)
+        pred_idx = outputs.logits.argmax(-1).item()
+        pred_label = model_vit.config.id2label[pred_idx]
+        st.success(f"🍴 Predicted Food: **{pred_label}**")
 
-        #
+        # Prepare LLM prompt
         prompt = (
-
-            "Include serving size, calories, protein, carbs, fat,
+            "Provide a concise nutritional overview for a tacos. "
+            "Include serving size, calories, protein, carbs, fat, "
+            "main ingredients, cooking method, and one substitution. "
             "Answer only the overview—do not repeat this instruction."
         )
         st.subheader("🧾 Nutrition Information")
         st.write(f"🤖 Prompt to LLM:\n\n{prompt}")
 
-        # Tokenize & move
+        # Tokenize & move to device
        inputs = tokenizer(prompt, return_tensors="pt")
         inputs = {k: v.to(model_llm.device) for k, v in inputs.items()}
+        input_len = inputs["input_ids"].shape[1]
 
-        # Generate
-        max_len = inputs["input_ids"].shape[-1] + 150
+        # Generate with constraints
         outputs = model_llm.generate(
             **inputs,
-            max_length=max_len,
-            temperature=0.7,
-            top_p=0.9,
+            max_length=input_len + 150,
             do_sample=True,
+            temperature=0.8,
+            top_p=0.9,
             no_repeat_ngram_size=2,
+            early_stopping=True,
             pad_token_id=tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id
         )
 
-        # Decode
-
-
-        text = text[len(prompt):].strip()
+        # Decode generated tokens only
+        gen_ids = outputs[0][input_len:]
+        caption = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
 
-
+        if caption:
+            st.info(caption)
+        else:
+            st.error("⚠️ The LLM failed to generate any text.")
 
     except Exception as e:
         st.error(f"Something went wrong: {e}")
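The new classification block runs the transformed image through the ViT head and maps the argmax logit to a label via the model config. A standalone sketch of that step, assuming `model_vit`, `manual_transform`, `device`, and a PIL `image` as defined in the diff, and that the fine-tuned checkpoint ships an `id2label` mapping:

```python
import torch

# Shape notes for the inference step added in this commit.
input_tensor = manual_transform(image).unsqueeze(0).to(device)  # [1, 3, 224, 224]
with torch.no_grad():
    logits = model_vit(pixel_values=input_tensor).logits        # [1, num_labels]
pred_idx = logits.argmax(-1).item()                             # best class index
pred_label = model_vit.config.id2label[pred_idx]                # index -> name
```

Note that the prompt in the next block hardcodes "a tacos" rather than interpolating `pred_label`; an f-string such as `f"Provide a concise nutritional overview for {pred_label}. "` would tie the two steps together.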
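Two details of the generation block are worth noting. TinyLlama is decoder-only, so `generate` returns the prompt tokens followed by the continuation, which is why the new code slices `outputs[0][input_len:]` before decoding; and `early_stopping=True` only affects beam search, so it is a no-op under `do_sample=True`. A minimal equivalent sketch using `max_new_tokens`, which expresses the same 150-token budget without computing `max_length` by hand:

```python
# Sketch only, not the committed code: `max_new_tokens` caps the
# continuation directly; the prompt is still stripped before decoding.
inputs = tokenizer(prompt, return_tensors="pt").to(model_llm.device)
input_len = inputs["input_ids"].shape[1]
outputs = model_llm.generate(
    **inputs,
    max_new_tokens=150,  # same budget as max_length=input_len + 150
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,
)
caption = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True).strip()
```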