Update src/streamlit_app.py

src/streamlit_app.py  CHANGED  (+26 -27)
@@ -16,13 +16,13 @@ from transformers import (
 )
 
 def main():
-    #
+    # Environment & cache
     hf_token = os.getenv("HF_TOKEN", None)
     cache_dir = "/tmp/cache"
     os.makedirs(cache_dir, exist_ok=True)
     os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
 
-    #
+    # Image transform for ViT
     manual_transform = transforms.Compose([
         transforms.Resize(256),
         transforms.CenterCrop(224),
@@ -33,26 +33,26 @@ def main():
         transforms.ConvertImageDtype(torch.float32)
     ])
 
-    #
+    # Sidebar info
     st.sidebar.header("Models Used")
     st.sidebar.markdown("""
     - 🖼️ **Image Classifier**: `shingguy1/fine_tuned_vit`
     - 💬 **Text Generator**: `tiiuae/falcon-7b-instruct`
     """)
 
-    #
+    # Load models (cached)
     @st.cache_resource
     def load_models():
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-        # ViT classifier
+        # ViT classifier → GPU/CPU
         model_vit = ViTForImageClassification.from_pretrained(
             "shingguy1/fine_tuned_vit",
             cache_dir=cache_dir,
             use_auth_token=hf_token
         ).to(device)
 
-        # Falcon-7B Instruct
+        # Falcon-7B Instruct → 8-bit quant on GPU
        tokenizer_llm = AutoTokenizer.from_pretrained(
             "tiiuae/falcon-7b-instruct",
             cache_dir=cache_dir,
@@ -61,33 +61,32 @@ def main():
         model_llm = AutoModelForCausalLM.from_pretrained(
             "tiiuae/falcon-7b-instruct",
             cache_dir=cache_dir,
-
+            load_in_8bit=True,
+            device_map="auto",
             torch_dtype=torch.float16,
-
+            use_auth_token=hf_token
         )
 
         return model_vit, tokenizer_llm, model_llm, device
 
     model_vit, tokenizer_llm, model_llm, device = load_models()
 
-    #
+    # Image uploader
     uploaded_file = st.file_uploader("Upload a food image...", type=["jpg", "jpeg", "png"])
-
     if uploaded_file is not None:
         try:
-            # Display image
             image = Image.open(uploaded_file)
             st.image(image, caption="Uploaded Image", use_column_width=True)
 
-            # Classify
-
+            # Classify
+            inputs_v = manual_transform(image).unsqueeze(0).to(device)
             with torch.no_grad():
-
-
-            pred_label = model_vit.config.id2label[
+                out = model_vit(pixel_values=inputs_v)
+                idx = out.logits.argmax(-1).item()
+            pred_label = model_vit.config.id2label[idx]
             st.success(f"🍴 Predicted Food: **{pred_label}**")
 
-            #
+            # Unified instruction prompt
             prompt = (
                 "### Instruction\n"
                 f"Provide a concise nutritional overview for a {pred_label}, including:\n"
@@ -104,12 +103,12 @@ def main():
 
             # Tokenize & generate
             inputs = tokenizer_llm(prompt, return_tensors="pt")
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-
+            inputs = {k: v.to(model_llm.device) for k, v in inputs.items()}
+            inp_len = inputs["input_ids"].shape[1]
 
-
+            out_ids = model_llm.generate(
                 **inputs,
-                max_length=
+                max_length=inp_len + 150,
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True,
@@ -117,14 +116,14 @@ def main():
                 early_stopping=True,
                 pad_token_id=tokenizer_llm.eos_token_id,
                 eos_token_id=tokenizer_llm.eos_token_id
-            )
+            )[0]
 
-            # Decode
-
-            if "### Response" in
-                caption =
+            # Decode & strip prompt
+            decoded = tokenizer_llm.decode(out_ids, skip_special_tokens=True).strip()
+            if "### Response" in decoded:
+                caption = decoded.split("### Response", 1)[1].strip()
             else:
-                caption =
+                caption = decoded[inp_len:].strip()
 
             if caption:
                 st.info(caption)
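
A note on the decode fallback added here: `inp_len` counts prompt tokens, while `decoded` is a character string, so `decoded[inp_len:]` slices by the wrong unit and can return a clipped or overly long caption. A minimal alternative sketch, reusing the `inputs`, `model_llm`, and `tokenizer_llm` names from the diff (not part of this commit), decodes only the newly generated token ids and bounds generation with `max_new_tokens`:

    # Sketch only (not in this commit): decode just the tokens generated after
    # the prompt instead of slicing the decoded string by a token count.
    inp_len = inputs["input_ids"].shape[1]           # prompt length in tokens
    out_ids = model_llm.generate(
        **inputs,
        max_new_tokens=150,                          # caps new tokens directly
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer_llm.eos_token_id,
    )[0]
    caption = tokenizer_llm.decode(out_ids[inp_len:], skip_special_tokens=True).strip()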
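
The new `load_in_8bit=True` / `device_map="auto"` arguments require the `bitsandbytes` package and a CUDA device in the Space. On recent transformers releases the same request is usually spelled with an explicit quantization config; the sketch below assumes that setup and is not part of this commit. Newer releases also accept `token=` in place of the deprecated `use_auth_token=`.

    # Sketch only: equivalent 8-bit load via BitsAndBytesConfig
    # (assumes bitsandbytes is installed and a CUDA GPU is available).
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model_llm = AutoModelForCausalLM.from_pretrained(
        "tiiuae/falcon-7b-instruct",
        cache_dir=cache_dir,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
        token=hf_token,
    )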