Update app.py
app.py CHANGED
@@ -1,181 +1,4 @@
-# import os
-# import torch
-# import gradio as gr
-# import spaces
-# from transformers import pipeline
-
-# BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
-# FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
-# OUTPUT_TOKENS = 256
-# original_pipeline = None
-# ft_pipe = None
-
-# FORCE_CPU = os.getenv("FORCE_CPU", "0") == "1"
-# DEVICE_TYPE = "cuda" if (torch.cuda.is_available() and not FORCE_CPU) else "cpu"
-# if DEVICE_TYPE == "cuda":
-#     torch.backends.cuda.matmul.allow_tf32 = True
-#     torch.backends.cudnn.allow_tf32 = True
-
-# def _get_dtype(device: str):
-#     if device == "cuda":
-#         if os.getenv("USE_FP16", "0") == "1":
-#             return torch.float16
-#         if os.getenv("USE_BF16", "0") == "1":
-#             is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
-#             if callable(is_bf16_supported) and is_bf16_supported():
-#                 return torch.bfloat16
-#         return torch.float32
-#     return torch.float32
-
-# def _make_pipe(model_id: str, device_type: str):
-#     dtype = _get_dtype(device_type)
-#     device_arg = 0 if device_type == "cuda" else -1
-#     pipe = pipeline(
-#         "image-text-to-text",
-#         model=model_id,
-#         device=device_arg,
-#         torch_dtype=dtype,
-#     )
-#     model = getattr(pipe, "model", None)
-#     generation_config = getattr(model, "generation_config", None)
-#     if generation_config is not None:
-#         generation_config.do_sample = False
-#         generation_config.max_new_tokens = OUTPUT_TOKENS
-#         try:
-#             generation_config.max_length = None
-#         except Exception:
-#             pass
-#     return pipe
-
-# ACTIVE_DEVICE_TYPE = DEVICE_TYPE
-
-# def _load_pipes(device_type: str):
-#     global original_pipeline, ft_pipe, ACTIVE_DEVICE_TYPE
-#     ACTIVE_DEVICE_TYPE = device_type
-#     print(f"[INFO] Using device_type={ACTIVE_DEVICE_TYPE}")
-#     original_pipeline = _make_pipe(BASE_MODEL_ID, ACTIVE_DEVICE_TYPE)
-#     ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID, ACTIVE_DEVICE_TYPE)
-
-# _load_pipes(DEVICE_TYPE)
-
-# def _extract_generated_text(pipe_output) -> str:
-#     try:
-#         item0 = pipe_output[0]
-#         if isinstance(item0, dict) and "generated_text" in item0:
-#             gt = item0["generated_text"]
-#         else:
-#             gt = pipe_output[0][0]["generated_text"]
-
-#         if isinstance(gt, str):
-#             return gt
-#         if isinstance(gt, list) and gt:
-#             last = gt[-1]
-#             if isinstance(last, dict) and "content" in last:
-#                 return last["content"]
-#         return str(gt)
-#     except Exception:
-#         return str(pipe_output)
-
-# def create_message(input_image):
-#     return [{'role': 'user',
-#              'content': [{'type': 'image',
-#                           'image': input_image},
-#                          {'type': 'text',
-#                           'text': "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n 'food_items': [], # list[str] - list of visible edible food item nouns\n 'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"}]}]
-
-# @spaces.GPU
-# def extract_foods_from_image(input_image):
-#     if input_image is None:
-#         return "Please upload an image", "Please upload an image"
-
-#     input_image = input_image.convert("RGB")
-#     input_image = input_image.resize(size=(512, 512))
-#     input_message = create_message(input_image=input_image)
-
-#     try:
-#         original_pipeline_output = original_pipeline(text=[input_message])
-#         outputs_pretrained = _extract_generated_text(original_pipeline_output)
-
-#         ft_pipe_output = ft_pipe(text=[input_message])
-#         outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
-#     except RuntimeError as e:
-#         msg = str(e)
-#         is_cuda_linear_failure = (
-#             "CUBLAS_STATUS_INVALID_VALUE" in msg
-#             or "cublasGemmEx" in msg
-#             or ("CUDA error" in msg and "CUBLAS" in msg)
-#         )
-#         if ACTIVE_DEVICE_TYPE == "cuda" and is_cuda_linear_failure:
-#             try:
-#                 print("[WARN] CUDA GEMM failed, falling back to CPU.")
-#                 _load_pipes("cpu")
-#                 if torch.cuda.is_available():
-#                     torch.cuda.empty_cache()
-#                 original_pipeline_output = original_pipeline(text=[input_message])
-#                 outputs_pretrained = _extract_generated_text(original_pipeline_output)
-#                 ft_pipe_output = ft_pipe(text=[input_message])
-#                 outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
-#             except Exception:
-#                 raise e
-#         else:
-#             raise
-
-#     return outputs_pretrained, outputs_fine_tuned
-
-# demo_title = "🥑➡️📝 FoodExtract-Vision with a fine-tuned SmolVLM2-500M"
-# demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct
-# * **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
-# * **Fine-tuned model:** https://huggingface.co/CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1
-
-# ## Overview
-
-# Extract food and drink items in a structured way from images.
-
-# The original model outputs fail to capture the desired structure. But the fine-tuned model sticks to the output structure quite well.
-
-# However, the fine-tuned model could definitely be improved with respects to its ability to extract the right food/drink items.
-
-# Both models use the input prompt:
-
-# ````
-# Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.
-
-# Only return valid JSON in the following form:
-
-# ```json
-# {
-# 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)
-# 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present
-# 'food_items': [], # list[str] - list of visible edible food item nouns
-# 'drink_items': [] # list[str] - list of visible edible drink item nouns
-# }
-# ```
-# ````
-
-# Except one model has been fine-tuned on the structured data whereas the other hasn't.
-
-# Notable next steps would be:
-# * **Remove the input prompt:** Just train the model to go straight from image -> text (no text prompt on input), this would save on inference tokens.
-# * **Fine-tune on more real-world data:** Right now the model is only trained on 1k food images (from Food101) and 500 not food (random internet images), training on real world data would likely significantly improve performance.
-# * **Fix the repetitive generation:** The model can sometimes get stuck in a repetitive generation pattern, e.g. "onions", "onions", "onions", etc. We could look into patterns to help reduce this.
-# """
-
-# demo = gr.Interface(
-#     fn=extract_foods_from_image,
-#     inputs=gr.Image(type="pil"),
-#     title=demo_title,
-#     description=demo_description,
-#     outputs=[gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
-#              gr.Textbox(lines=4, label="Fine-tuned Model")],
-#     examples=[["examples/food1.jpeg"],
-#               ["examples/food2.jpg"],
-#               ["examples/food3.jpg"],
-#               ["examples/food4.jpeg"]],
-
-# )
 
-# if __name__ == "__main__":
-#     demo.launch(share=False)
 
 import os
 import torch
@@ -558,7 +381,7 @@ with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft(), title="FoodExtract Vision
     <div class="footer-section">
         <p style="margin: 0;">Built with ❤️ by <strong>Jarvis Zhang</strong> |
         <a href="https://huggingface.co/CreatorJarvis" target="_blank" style="color: #4f46e5;">🤗 Hugging Face</a> |
-        <a href="https://github.com/
+        <a href="https://github.com/JarvisZhang24" target="_blank" style="color: #4f46e5;">💻 GitHub</a>
         </p>
         <p style="margin: 0.5rem 0 0 0; font-size: 0.8rem; color: #9ca3af;">Fine-tuning Demo • Vision Language Model • Structured Output Generation</p>
     </div>
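For reference, the block removed above wired two copies of the `transformers` `image-text-to-text` pipeline (base and fine-tuned) and compared their outputs. Below is a minimal sketch of the chat-style calling pattern that code used, assuming a recent `transformers` release that ships this pipeline task and a hypothetical local image path; it is illustrative, not part of this commit.

```python
from PIL import Image
from transformers import pipeline

# Base checkpoint named in the removed code; swap in the fine-tuned repo id
# (CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune) to compare outputs.
pipe = pipeline("image-text-to-text", model="HuggingFaceTB/SmolVLM2-500M-Video-Instruct")

# The removed code resized inputs to 512x512 before inference.
# "examples/food1.jpeg" is a hypothetical path, matching the demo's examples.
image = Image.open("examples/food1.jpeg").convert("RGB").resize((512, 512))

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "List the edible food and drink items visible in this image."},
    ],
}]

# Batched call, mirroring original_pipeline(text=[input_message]) above.
# Each result carries the reply under "generated_text"; depending on the
# transformers version it can be a plain string or a list of chat turns,
# which is why the removed _extract_generated_text helper handled both shapes.
out = pipe(text=[messages], max_new_tokens=256)
print(out[0])
```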