mknolan
/

internvl25-image-analyzer

Model card · Files and versions

xet

Community

mknolan committed on Mar 22, 2025

Commit

b517f60

verified ·

1 Parent(s): fd5ea34

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +170 -64

app.py CHANGED Viewed

@@ -127,6 +127,7 @@ def load_image(image_pil, max_num=12):
         # Stack tensors - this is where the error might occur if any element isn't a tensor
         try:
             pixel_values = torch.stack(pixel_values)
         except Exception as stack_error:
             print(f"Error during tensor stacking: {str(stack_error)}")
             # Try to recover - convert any non-tensor to tensor
@@ -142,7 +143,7 @@ def load_image(image_pil, max_num=12):
                             else:
                                 val = np.array(val)
                         # Then to tensor
-                        val = torch.from_numpy(val)
                         fixed_values.append(val)
                     except Exception as convert_err:
                         print(f"Failed to convert item {i}: {str(convert_err)}")
@@ -175,7 +176,17 @@ def load_image(image_pil, max_num=12):
             # Simplest approach: just convert the single image without splitting
             image_pil = image_pil.convert('RGB')
             transform = build_transform(IMAGE_SIZE)
-            tensor = transform(image_pil).unsqueeze(0)
             if torch.cuda.is_available():
                 tensor = tensor.cuda().to(torch.bfloat16)
@@ -186,7 +197,20 @@ def load_image(image_pil, max_num=12):
             return tensor
         except Exception as recovery_error:
             print(f"Recovery attempt also failed: {str(recovery_error)}")
-            return None
 # Function to split model across GPUs
 def split_model(model_name):
@@ -821,78 +845,99 @@ def analyze_folder_images(folder_path, prompt):
         file_name = os.path.basename(image_file)
         result += f"---\nImage: {file_name}\n"
-        # For PDF files, convert to images and analyze each page
         if file_name.lower().endswith('.pdf'):
             try:
                 print(f"Processing PDF file: {image_file}")
-                # Use a completely different approach for PDFs that avoids tensor issues
                 model, tokenizer = load_model()
                 if model is None or tokenizer is None:
                     result += "Error: Model failed to load for PDF analysis.\n"
                     continue
-                # Try conversion with pdf2image
                 try:
                     pdf_images = convert_from_path(image_file)
-                    print(f"Converted PDF to {len(pdf_images)} pages")
-                except Exception as pdf_err:
-                    print(f"PDF conversion error: {str(pdf_err)}")
-                    result += f"Failed to convert PDF: {str(pdf_err)}\n"
-                    continue
-                if not pdf_images or len(pdf_images) == 0:
-                    result += "PDF converted but no pages were extracted.\n"
-                    continue
-                for i, img in enumerate(pdf_images):
-                    try:
-                        print(f"Processing PDF page {i+1} of {len(pdf_images)}")
-                        # Convert to RGB and resize to standard size
-                        img = img.convert('RGB')
-                        # Use the chat() function directly which is more reliable
-                        question = f"<image>\n{prompt}"
-                        # Process image with proper error handling
                         try:
-                            # Manually preprocess image
-                            transform = build_transform(IMAGE_SIZE)
-                            # Resize to a standard size to avoid splitting issues
                             img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
-                            pixel_values = transform(img_resized).unsqueeze(0)
-                            # Move to appropriate device
                             if torch.cuda.is_available():
-                                pixel_values = pixel_values.cuda().to(torch.bfloat16)
-                            else:
-                                pixel_values = pixel_values.to(torch.float32)
-                            print(f"Processed image tensor shape: {pixel_values.shape}, type: {type(pixel_values)}")
-                            # Use direct generation
-                            input_tokens = tokenizer(prompt)
-                            output_ids = model.generate(
-                                input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
-                                pixel_values,
-                                max_new_tokens=512,
-                                temperature=0.1,
-                                do_sample=False
-                            )
-                            response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-                            print(f"Successfully generated response for page {i+1}")
-                        except Exception as model_err:
-                            print(f"Error in model generation: {str(model_err)}")
                             import traceback
                             print(traceback.format_exc())
-                            response = f"Model error: {str(model_err)}"
-                        result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
-                    except Exception as page_err:
-                        print(f"Page processing error: {str(page_err)}")
-                        import traceback
-                        print(traceback.format_exc())
-                        result += f"\n-- PDF Page {i+1} --\nError: {str(page_err)}\n"
             except Exception as e:
-                print(f"General PDF error: {str(e)}")
                 import traceback
                 print(traceback.format_exc())
                 result += f"Failed to process PDF: {str(e)}\n"
@@ -903,6 +948,9 @@ def analyze_folder_images(folder_path, prompt):
                 image_result = process_image_with_text(image, prompt)
                 result += f"\n{image_result}\n"
             except Exception as e:
                 result += f"Error processing image: {str(e)}\n"
     return result
@@ -911,6 +959,7 @@ def analyze_folder_images(folder_path, prompt):
 def process_image_with_text(image, prompt):
     """Process a single image with the InternVL model and a text prompt."""
     try:
         # Load model if not already loaded
         model, tokenizer = load_model()
         if model is None or tokenizer is None:
@@ -921,6 +970,9 @@ def process_image_with_text(image, prompt):
         if pixel_values is None:
             return "Error preparing image."
         # Process the prompt
         input_tokens = tokenizer(prompt)
@@ -952,17 +1004,39 @@ def process_image_with_text(image, prompt):
                         # Move to device
                         pv = pv.to("cuda" if torch.cuda.is_available() else "cpu")
-                        output_ids = model.generate(
-                            input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
-                            pv,
-                            max_new_tokens=512,
-                            temperature=0.1,
-                            do_sample=False
-                        )
-                        output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
                         results.append(output.strip())
                     except Exception as item_error:
                         print(f"Error processing item {i}: {str(item_error)}")
                         results.append(f"Error: {str(item_error)}")
                 return "\n".join(results)
@@ -972,12 +1046,16 @@ def process_image_with_text(image, prompt):
                     # Ensure pixel_values is a proper 4D tensor [batch, channels, height, width]
                     if len(pixel_values.shape) == 3:
                         pixel_values = pixel_values.unsqueeze(0)
                     # Move tensors to the same device
                     device = "cuda" if torch.cuda.is_available() else "cpu"
                     pixel_values = pixel_values.to(device)
                     input_ids = input_tokens["input_ids"].unsqueeze(0).to(device)
                     # Run the model
                     output_ids = model.generate(
                         input_ids,
@@ -1010,6 +1088,34 @@ def process_image_with_text(image, prompt):
                         return response
                     except Exception as chat_error:
                         print(f"Fallback also failed: {str(chat_error)}")
                         return f"Error processing image: Unable to generate analysis. {str(tensor_error)}"
     except Exception as e:
         print(f"Outer exception in process_image_with_text: {str(e)}")

         # Stack tensors - this is where the error might occur if any element isn't a tensor
         try:
             pixel_values = torch.stack(pixel_values)
+            print(f"Successfully stacked tensors into shape: {pixel_values.shape}")
         except Exception as stack_error:
             print(f"Error during tensor stacking: {str(stack_error)}")
             # Try to recover - convert any non-tensor to tensor
                             else:
                                 val = np.array(val)
                         # Then to tensor
+                        val = torch.from_numpy(val).float()  # Specify float type explicitly
                         fixed_values.append(val)
                     except Exception as convert_err:
                         print(f"Failed to convert item {i}: {str(convert_err)}")
             # Simplest approach: just convert the single image without splitting
             image_pil = image_pil.convert('RGB')
             transform = build_transform(IMAGE_SIZE)
+            tensor = transform(image_pil)
+            # Make sure it's a tensor before using unsqueeze
+            if not isinstance(tensor, torch.Tensor):
+                print(f"Warning: transform did not return a tensor, got {type(tensor)}")
+                if hasattr(tensor, 'numpy'):
+                    tensor = torch.from_numpy(tensor.numpy()).float()
+                else:
+                    tensor = torch.tensor(tensor, dtype=torch.float32)
+            tensor = tensor.unsqueeze(0)  # Now safe to use unsqueeze
             if torch.cuda.is_available():
                 tensor = tensor.cuda().to(torch.bfloat16)
             return tensor
         except Exception as recovery_error:
             print(f"Recovery attempt also failed: {str(recovery_error)}")
+            print(traceback.format_exc())
+            # Last resort - return a dummy tensor of the right shape
+            try:
+                print("Creating fallback dummy tensor...")
+                dummy_tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
+                                         dtype=torch.float32)
+                if torch.cuda.is_available():
+                    dummy_tensor = dummy_tensor.cuda().to(torch.bfloat16)
+                print("Returning dummy tensor as last resort")
+                return dummy_tensor
+            except:
+                print("Even dummy tensor creation failed. Cannot proceed.")
+                return None
 # Function to split model across GPUs
 def split_model(model_name):
         file_name = os.path.basename(image_file)
         result += f"---\nImage: {file_name}\n"
+        # For PDF files, handle differently
         if file_name.lower().endswith('.pdf'):
             try:
                 print(f"Processing PDF file: {image_file}")
+                # Load model here to ensure it's ready
                 model, tokenizer = load_model()
                 if model is None or tokenizer is None:
                     result += "Error: Model failed to load for PDF analysis.\n"
                     continue
+                # Try a completely different approach for PDFs to avoid tensor issues
                 try:
+                    # Convert PDF to images
                     pdf_images = convert_from_path(image_file)
+                    print(f"PDF converted to {len(pdf_images)} pages")
+                    if not pdf_images or len(pdf_images) == 0:
+                        result += "PDF converted but no pages were extracted.\n"
+                        continue
+                    # Process each page separately to avoid batch issues
+                    for i, img in enumerate(pdf_images):
                         try:
+                            print(f"Processing PDF page {i+1}/{len(pdf_images)}")
+                            # Manual preprocessing - don't use the typical image loading pipeline
+                            img = img.convert('RGB')
+                            # Resize and transform manually
                             img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
+                            transform = T.Compose([
+                                T.ToTensor(),
+                                T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+                            ])
+                            tensor = transform(img_resized).unsqueeze(0)
+                            # Move to device and set data type
+                            device = "cuda" if torch.cuda.is_available() else "cpu"
+                            tensor = tensor.to(device)
                             if torch.cuda.is_available():
+                                tensor = tensor.to(torch.bfloat16)
+                            print(f"Preprocessed tensor shape: {tensor.shape}, device: {tensor.device}")
+                            # Use direct text generation
+                            page_prompt = f"PDF Page {i+1}: {prompt}"
+                            input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
+                            # Generate with proper error handling
+                            try:
+                                # Try direct generation first
+                                outputs = model.generate(
+                                    input_tokens["input_ids"],
+                                    pixel_values=tensor,
+                                    max_new_tokens=512,
+                                    do_sample=False
+                                )
+                                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                            except Exception as gen_err:
+                                print(f"Error in direct generation: {str(gen_err)}")
+                                # Fall back to chat method
+                                try:
+                                    question = f"<image>\n{page_prompt}"
+                                    response, _ = model.chat(
+                                        tokenizer=tokenizer,
+                                        pixel_values=tensor,
+                                        question=question,
+                                        generation_config={"max_new_tokens": 512, "do_sample": False},
+                                        history=None,
+                                        return_history=True
+                                    )
+                                except Exception as chat_err:
+                                    print(f"Chat fallback failed: {str(chat_err)}")
+                                    response = f"Analysis failed due to model error: {str(chat_err)}"
+                            # Add to result
+                            result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
+                        except Exception as page_err:
+                            print(f"Error processing page {i+1}: {str(page_err)}")
                             import traceback
                             print(traceback.format_exc())
+                            result += f"\n-- PDF Page {i+1} --\nError: {str(page_err)}\n"
+                except Exception as pdf_err:
+                    print(f"PDF processing error: {str(pdf_err)}")
+                    import traceback
+                    print(traceback.format_exc())
+                    result += f"Failed to process PDF: {str(pdf_err)}\n"
             except Exception as e:
+                print(f"General exception in PDF processing: {str(e)}")
                 import traceback
                 print(traceback.format_exc())
                 result += f"Failed to process PDF: {str(e)}\n"
                 image_result = process_image_with_text(image, prompt)
                 result += f"\n{image_result}\n"
             except Exception as e:
+                print(f"Error processing image {image_file}: {str(e)}")
+                import traceback
+                print(traceback.format_exc())
                 result += f"Error processing image: {str(e)}\n"
     return result
 def process_image_with_text(image, prompt):
     """Process a single image with the InternVL model and a text prompt."""
     try:
+        print(f"process_image_with_text called with image type: {type(image)}")
         # Load model if not already loaded
         model, tokenizer = load_model()
         if model is None or tokenizer is None:
         if pixel_values is None:
             return "Error preparing image."
+        # Debug info
+        print(f"Image processed: tensor type {type(pixel_values)}, shape {pixel_values.shape if hasattr(pixel_values, 'shape') else 'unknown'}, dtype {pixel_values.dtype if hasattr(pixel_values, 'dtype') else 'unknown'}")
         # Process the prompt
         input_tokens = tokenizer(prompt)
                         # Move to device
                         pv = pv.to("cuda" if torch.cuda.is_available() else "cpu")
+                        # Use model.generate directly
+                        try:
+                            output_ids = model.generate(
+                                input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
+                                pv,
+                                max_new_tokens=512,
+                                temperature=0.1,
+                                do_sample=False
+                            )
+                            output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+                        except Exception as gen_error:
+                            print(f"Error in direct generation: {str(gen_error)}")
+                            # Fall back to chat method
+                            try:
+                                question = f"<image>\n{prompt}"
+                                response, _ = model.chat(
+                                    tokenizer=tokenizer,
+                                    pixel_values=pv,
+                                    question=question,
+                                    generation_config={"max_new_tokens": 512, "do_sample": False},
+                                    history=None,
+                                    return_history=True
+                                )
+                            except Exception as chat_err:
+                                print(f"Chat fallback failed: {str(chat_err)}")
+                                output = f"Error analyzing image: {str(chat_err)}"
                         results.append(output.strip())
                     except Exception as item_error:
                         print(f"Error processing item {i}: {str(item_error)}")
+                        import traceback
+                        print(traceback.format_exc())
                         results.append(f"Error: {str(item_error)}")
                 return "\n".join(results)
                     # Ensure pixel_values is a proper 4D tensor [batch, channels, height, width]
                     if len(pixel_values.shape) == 3:
                         pixel_values = pixel_values.unsqueeze(0)
+                        print(f"Added batch dimension, new shape: {pixel_values.shape}")
                     # Move tensors to the same device
                     device = "cuda" if torch.cuda.is_available() else "cpu"
                     pixel_values = pixel_values.to(device)
                     input_ids = input_tokens["input_ids"].unsqueeze(0).to(device)
+                    print(f"Running model with pixel_values shape: {pixel_values.shape}, device: {pixel_values.device}")
+                    print(f"Input IDs shape: {input_ids.shape}, device: {input_ids.device}")
                     # Run the model
                     output_ids = model.generate(
                         input_ids,
                         return response
                     except Exception as chat_error:
                         print(f"Fallback also failed: {str(chat_error)}")
+                        print(traceback.format_exc())
+                        # Try one more approach - use the raw model architecture directly
+                        try:
+                            print("Attempting direct model call as last resort")
+                            # Try to reshape tensors to make them compatible
+                            if hasattr(model, "forward"):
+                                # Get only necessary inputs
+                                inputs = {
+                                    "input_ids": input_ids,
+                                    "pixel_values": pixel_values,
+                                    "return_dict": True,
+                                }
+                                # Call model directly
+                                outputs = model(**inputs)
+                                # Try to get some meaningful output
+                                if hasattr(outputs, "logits") and outputs.logits is not None:
+                                    pred_ids = torch.argmax(outputs.logits, dim=-1)
+                                    response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
+                                    return response
+                                else:
+                                    return "Model output did not contain usable results"
+                            else:
+                                return "Model does not support direct calling"
+                        except Exception as direct_error:
+                            print(f"Direct model call failed: {str(direct_error)}")
+                            print(traceback.format_exc())
                         return f"Error processing image: Unable to generate analysis. {str(tensor_error)}"
     except Exception as e:
         print(f"Outer exception in process_image_with_text: {str(e)}")