Update app.py
app.py CHANGED
@@ -197,6 +197,9 @@ with st.sidebar:
         model = load_xception_model()
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+        # Explicitly move model to device
+        model = model.to(device)
+
         if model is not None:
             st.session_state.xception_model = model
             st.session_state.device = device
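The new lines make the device placement explicit: picking a device is not enough if the weights never leave the CPU. A minimal sketch of the pattern (a generic `torch.nn.Module` stand-in; the names here are illustrative, not from app.py):

```python
import torch

# Pick the best available device once, then move both model and inputs to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = torch.nn.Linear(10, 2)     # stand-in for the Xception classifier
model = model.to(device)           # reassigning the result is the conventional idiom

x = torch.randn(1, 10).to(device)  # inputs must live on the same device as the weights
with torch.no_grad():
    out = model(x)                 # no device-mismatch RuntimeError
```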
@@ -253,14 +256,34 @@ with st.sidebar:
         if not st.session_state.llm_model_loaded:
             if st.button("📥 Load Vision LLM", type="primary"):
                 # Load LLM model
-
-
-
-
-
-
-
-
+                try:
+                    with st.spinner("Loading LLM vision model... This may take a few minutes. Please be patient..."):
+                        # Check for GPU
+                        has_gpu = check_gpu()
+
+                        # Load base model and tokenizer using Unsloth
+                        base_model_id = "unsloth/llama-3.2-11b-vision-instruct"
+                        model, tokenizer = FastVisionModel.from_pretrained(
+                            base_model_id,
+                            load_in_4bit=True,
+                        )
+
+                        # Load the adapter
+                        adapter_id = "saakshigupta/deepfake-explainer-new"
+                        model = PeftModel.from_pretrained(model, adapter_id)
+
+                        # Set to inference mode
+                        FastVisionModel.for_inference(model)
+
+                        if model is not None and tokenizer is not None:
+                            st.session_state.llm_model = model
+                            st.session_state.tokenizer = tokenizer
+                            st.session_state.llm_model_loaded = True
+                            st.success("✅ Vision LLM loaded!")
+                        else:
+                            st.error("❌ Failed to load Vision LLM.")
+                except Exception as e:
+                    st.error(f"Error loading LLM model: {str(e)}")
         else:
             st.success("✅ Vision LLM loaded")
 
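The loader that used to live in `load_llm_model()` is now inlined under the button click, with the result stashed in `st.session_state`. Condensed to its essentials outside Streamlit, the Unsloth + PEFT sequence is (same calls as in the diff; only the wrapper function name is illustrative):

```python
from unsloth import FastVisionModel
from peft import PeftModel

def load_vision_llm():
    # Base model quantized to 4-bit so the 11B vision model fits in limited VRAM
    model, tokenizer = FastVisionModel.from_pretrained(
        "unsloth/llama-3.2-11b-vision-instruct",
        load_in_4bit=True,
    )
    # Stack the fine-tuned LoRA adapter on top of the frozen base weights
    model = PeftModel.from_pretrained(model, "saakshigupta/deepfake-explainer-new")
    # Switch Unsloth into inference mode (enables its fast generation path)
    FastVisionModel.for_inference(model)
    return model, tokenizer
```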
@@ -519,33 +542,6 @@ def fix_cross_attention_mask(inputs):
         inputs['cross_attention_mask'] = new_mask
     return inputs
 
-# Load model function
-@st.cache_resource
-def load_llm_model():
-    with st.spinner("Loading LLM vision model... This may take a few minutes. Please be patient..."):
-        try:
-            # Check for GPU
-            has_gpu = check_gpu()
-
-            # Load base model and tokenizer using Unsloth
-            base_model_id = "unsloth/llama-3.2-11b-vision-instruct"
-            model, tokenizer = FastVisionModel.from_pretrained(
-                base_model_id,
-                load_in_4bit=True,
-            )
-
-            # Load the adapter
-            adapter_id = "saakshigupta/deepfake-explainer-new"
-            model = PeftModel.from_pretrained(model, adapter_id)
-
-            # Set to inference mode
-            FastVisionModel.for_inference(model)
-
-            return model, tokenizer
-        except Exception as e:
-            st.error(f"Error loading model: {str(e)}")
-            return None, None
-
 # Analyze image function
 def analyze_image_with_llm(image, gradcam_overlay, face_box, pred_label, confidence, question, model, tokenizer, temperature=0.7, max_tokens=500, custom_instruction=""):
     # Create a prompt that includes GradCAM information
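This hunk deletes the old `@st.cache_resource`-cached `load_llm_model()`, trading the cross-rerun resource cache for explicit per-session state. A sketch of the resulting guard pattern, reusing the hypothetical `load_vision_llm` helper from the sketch above (the session keys mirror those in the diff):

```python
import streamlit as st

# Initialize the flag once per session
if "llm_model_loaded" not in st.session_state:
    st.session_state.llm_model_loaded = False

if not st.session_state.llm_model_loaded:
    if st.button("📥 Load Vision LLM", type="primary"):
        model, tokenizer = load_vision_llm()  # helper sketched above (illustrative)
        st.session_state.llm_model = model
        st.session_state.tokenizer = tokenizer
        st.session_state.llm_model_loaded = True
else:
    st.success("✅ Vision LLM loaded")
```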
@@ -876,7 +872,8 @@ def main():
     device = st.session_state.device
     model = st.session_state.xception_model
 
-    # Ensure model is in eval mode
+    # Ensure model is in eval mode and on the correct device
+    model = model.to(device)
     model.eval()
 
     # Move tensor to device
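`model.eval()` only changes layer behavior (dropout becomes a no-op, batch-norm uses running statistics); it does not move weights, which is why the explicit `model.to(device)` is added alongside it. A self-contained demonstration of what eval mode actually buys:

```python
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(8, 8),
    torch.nn.Dropout(p=0.5),   # random in train mode, identity in eval mode
    torch.nn.Linear(8, 2),
)

model.eval()                    # deterministic layer behavior for inference
x = torch.randn(1, 8)
with torch.no_grad():           # skip autograd bookkeeping as well
    a = model(x)
    b = model(x)
assert torch.equal(a, b)        # same input, same output once dropout is off
```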
@@ -918,7 +915,7 @@
     st.subheader("GradCAM Visualization")
     try:
         cam, overlay, comparison, detected_face_box = process_image_with_xception_gradcam(
-            image, model, device, pred_class
+            image, model.to(device), device, pred_class
         )
 
         if comparison:
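Calling `.to(device)` inline at the call site is harmless even when the model was already moved earlier: for an `nn.Module` it mutates parameters in place and returns the same object, so a redundant call costs nothing. A quick check:

```python
import torch

dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
m = torch.nn.Linear(4, 4).to(dev)

# nn.Module.to() returns self, so repeated calls are cheap no-ops
assert m.to(dev) is m
```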
|