mknolan
/

internvl25-image-analyzer

Model card Files Files and versions

xet

Community

mknolan committed on Mar 22, 2025

Commit

c39dc7d

verified ·

1 Parent(s): b517f60

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +450 -163

app.py CHANGED Viewed

@@ -17,6 +17,71 @@ import json
 import re
 from pdf2image import convert_from_path, convert_from_bytes
 import tempfile
 # Constants
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
@@ -36,10 +101,10 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
 # If HF_TOKEN exists in environment, use it for authentication
 hf_token = os.environ.get("HUGGINGFACE_TOKEN", None)
 if hf_token:
-    print("Logging in to Hugging Face Hub with token...")
     login(token=hf_token)
 else:
-    print("No Hugging Face token found in environment. Model may not load if it's private.")
 # Supported image file extensions
 SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.pdf']
@@ -402,25 +467,79 @@ def analyze_dual_images(model, tokenizer, image1, image2, prompt):
 def process_pdf(pdf_path=None, pdf_bytes=None):
     """Convert PDF file to a list of PIL images."""
     try:
-        print(f"Processing PDF: {pdf_path}")
         if pdf_path:
             # Convert PDF file pages to PIL images
-            print(f"Converting PDF from path: {pdf_path}")
-            images = convert_from_path(pdf_path)
         elif pdf_bytes:
             # Convert PDF bytes to PIL images
-            print("Converting PDF from bytes")
-            images = convert_from_bytes(pdf_bytes)
         else:
-            print("No PDF source provided")
             return None
-        print(f"PDF converted to {len(images)} images")
         return images
     except Exception as e:
-        print(f"Error processing PDF: {str(e)}")
-        import traceback
-        print(traceback.format_exc())
         return None
 # Function to analyze images with a prompt
@@ -848,67 +967,159 @@ def analyze_folder_images(folder_path, prompt):
         # For PDF files, handle differently
         if file_name.lower().endswith('.pdf'):
             try:
-                print(f"Processing PDF file: {image_file}")
                 # Load model here to ensure it's ready
                 model, tokenizer = load_model()
                 if model is None or tokenizer is None:
                     result += "Error: Model failed to load for PDF analysis.\n"
                     continue
                 # Try a completely different approach for PDFs to avoid tensor issues
                 try:
-                    # Convert PDF to images
-                    pdf_images = convert_from_path(image_file)
-                    print(f"PDF converted to {len(pdf_images)} pages")
                     if not pdf_images or len(pdf_images) == 0:
                         result += "PDF converted but no pages were extracted.\n"
                         continue
                     # Process each page separately to avoid batch issues
                     for i, img in enumerate(pdf_images):
                         try:
-                            print(f"Processing PDF page {i+1}/{len(pdf_images)}")
                             # Manual preprocessing - don't use the typical image loading pipeline
                             img = img.convert('RGB')
                             # Resize and transform manually
                             img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
                             transform = T.Compose([
                                 T.ToTensor(),
                                 T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
                             ])
-                            tensor = transform(img_resized).unsqueeze(0)
                             # Move to device and set data type
                             device = "cuda" if torch.cuda.is_available() else "cpu"
                             tensor = tensor.to(device)
                             if torch.cuda.is_available():
                                 tensor = tensor.to(torch.bfloat16)
-                            print(f"Preprocessed tensor shape: {tensor.shape}, device: {tensor.device}")
                             # Use direct text generation
                             page_prompt = f"PDF Page {i+1}: {prompt}"
                             input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
                             # Generate with proper error handling
                             try:
                                 # Try direct generation first
                                 outputs = model.generate(
                                     input_tokens["input_ids"],
                                     pixel_values=tensor,
                                     max_new_tokens=512,
                                     do_sample=False
                                 )
                                 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                             except Exception as gen_err:
-                                print(f"Error in direct generation: {str(gen_err)}")
                                 # Fall back to chat method
                                 try:
                                     question = f"<image>\n{page_prompt}"
                                     response, _ = model.chat(
                                         tokenizer=tokenizer,
                                         pixel_values=tensor,
@@ -919,7 +1130,47 @@ def analyze_folder_images(folder_path, prompt):
                                     )
                                 except Exception as chat_err:
                                     print(f"Chat fallback failed: {str(chat_err)}")
-                                    response = f"Analysis failed due to model error: {str(chat_err)}"
                             # Add to result
                             result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
@@ -959,168 +1210,204 @@ def analyze_folder_images(folder_path, prompt):
 def process_image_with_text(image, prompt):
     """Process a single image with the InternVL model and a text prompt."""
     try:
-        print(f"process_image_with_text called with image type: {type(image)}")
         # Load model if not already loaded
         model, tokenizer = load_model()
         if model is None or tokenizer is None:
             return "Error loading model. Please check the logs for details."
-        # Prepare image
-        pixel_values = load_image(image)
-        if pixel_values is None:
-            return "Error preparing image."
-        # Debug info
-        print(f"Image processed: tensor type {type(pixel_values)}, shape {pixel_values.shape if hasattr(pixel_values, 'shape') else 'unknown'}, dtype {pixel_values.dtype if hasattr(pixel_values, 'dtype') else 'unknown'}")
         # Process the prompt
-        input_tokens = tokenizer(prompt)
-        # Generate description
         with torch.inference_mode():
-            # Check if pixel_values is a list or tensor and handle accordingly
-            if isinstance(pixel_values, list):
-                # If it's a list, we need to process each element separately
-                print(f"WARNING: pixel_values is a list of length {len(pixel_values)} instead of a tensor")
-                results = []
-                for i, pv in enumerate(pixel_values):
-                    try:
-                        # Convert to tensor if it's not already
-                        if not isinstance(pv, torch.Tensor):
-                            print(f"Converting item {i} from {type(pv)} to tensor")
-                            # Convert to numpy first if needed
-                            if not isinstance(pv, np.ndarray):
-                                if hasattr(pv, 'numpy'):
-                                    pv = pv.numpy()
-                                else:
-                                    pv = np.array(pv)
-                            # Then convert to tensor
-                            pv = torch.from_numpy(pv).float()
-                        # Make sure it's the right shape
-                        if len(pv.shape) == 3:  # Add batch dimension if needed
-                            pv = pv.unsqueeze(0)
-                        # Move to device
-                        pv = pv.to("cuda" if torch.cuda.is_available() else "cpu")
-                        # Use model.generate directly
-                        try:
-                            output_ids = model.generate(
-                                input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
-                                pv,
-                                max_new_tokens=512,
-                                temperature=0.1,
-                                do_sample=False
-                            )
-                            output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-                        except Exception as gen_error:
-                            print(f"Error in direct generation: {str(gen_error)}")
-                            # Fall back to chat method
-                            try:
-                                question = f"<image>\n{prompt}"
-                                response, _ = model.chat(
-                                    tokenizer=tokenizer,
-                                    pixel_values=pv,
-                                    question=question,
-                                    generation_config={"max_new_tokens": 512, "do_sample": False},
-                                    history=None,
-                                    return_history=True
-                                )
-                            except Exception as chat_err:
-                                print(f"Chat fallback failed: {str(chat_err)}")
-                                output = f"Error analyzing image: {str(chat_err)}"
-                        results.append(output.strip())
-                    except Exception as item_error:
-                        print(f"Error processing item {i}: {str(item_error)}")
-                        import traceback
-                        print(traceback.format_exc())
-                        results.append(f"Error: {str(item_error)}")
-                return "\n".join(results)
-            else:
-                # Normal tensor processing
                 try:
-                    # Ensure pixel_values is a proper 4D tensor [batch, channels, height, width]
-                    if len(pixel_values.shape) == 3:
-                        pixel_values = pixel_values.unsqueeze(0)
-                        print(f"Added batch dimension, new shape: {pixel_values.shape}")
-                    # Move tensors to the same device
-                    device = "cuda" if torch.cuda.is_available() else "cpu"
-                    pixel_values = pixel_values.to(device)
-                    input_ids = input_tokens["input_ids"].unsqueeze(0).to(device)
-                    print(f"Running model with pixel_values shape: {pixel_values.shape}, device: {pixel_values.device}")
-                    print(f"Input IDs shape: {input_ids.shape}, device: {input_ids.device}")
-                    # Run the model
-                    output_ids = model.generate(
-                        input_ids,
-                        pixel_values,
-                        max_new_tokens=512,
-                        temperature=0.1,
-                        do_sample=False
                     )
-                    # Decode the output
-                    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-                    return output.strip()
-                except Exception as tensor_error:
-                    print(f"Error in tensor processing: {str(tensor_error)}")
-                    import traceback
-                    print(traceback.format_exc())
-                    # Try to use the model's chat method instead as a fallback
                     try:
-                        print("Falling back to model.chat() method")
-                        question = f"<image>\n{prompt}"
-                        response, _ = model.chat(
-                            tokenizer=tokenizer,
-                            pixel_values=pixel_values,
-                            question=question,
-                            generation_config={"max_new_tokens": 512, "do_sample": False},
-                            history=None,
-                            return_history=True
-                        )
-                        return response
-                    except Exception as chat_error:
-                        print(f"Fallback also failed: {str(chat_error)}")
-                        print(traceback.format_exc())
-                        # Try one more approach - use the raw model architecture directly
-                        try:
-                            print("Attempting direct model call as last resort")
-                            # Try to reshape tensors to make them compatible
-                            if hasattr(model, "forward"):
-                                # Get only necessary inputs
-                                inputs = {
-                                    "input_ids": input_ids,
-                                    "pixel_values": pixel_values,
-                                    "return_dict": True,
-                                }
-                                # Call model directly
-                                outputs = model(**inputs)
-                                # Try to get some meaningful output
-                                if hasattr(outputs, "logits") and outputs.logits is not None:
-                                    pred_ids = torch.argmax(outputs.logits, dim=-1)
-                                    response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
-                                    return response
-                                else:
-                                    return "Model output did not contain usable results"
-                            else:
-                                return "Model does not support direct calling"
-                        except Exception as direct_error:
-                            print(f"Direct model call failed: {str(direct_error)}")
-                            print(traceback.format_exc())
-                        return f"Error processing image: Unable to generate analysis. {str(tensor_error)}"
     except Exception as e:
-        print(f"Outer exception in process_image_with_text: {str(e)}")
-        import traceback
-        print(traceback.format_exc())
         return f"Error processing image: {str(e)}"
 # Main function

 import re
 from pdf2image import convert_from_path, convert_from_bytes
 import tempfile
+import logging
+import traceback
+# Set up logging
+# Every process start gets its own debug log under LOG_DIR: the file name
+# embeds the startup timestamp (YYYYMMDD_HHMMSS).
+# NOTE(review): `datetime` and `sys` are used below but their imports are not
+# visible in this hunk - confirm they are imported earlier in app.py.
+LOG_DIR = "logs"
+os.makedirs(LOG_DIR, exist_ok=True)
+log_file = os.path.join(LOG_DIR, f"app_debug_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
+# Configure logging
+# Two handlers receive every record: the per-run log file and stdout.
+# NOTE(review): basicConfig configures the root logger, so level=DEBUG
+# presumably also surfaces DEBUG output from third-party libraries - confirm
+# that this is intended.
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.FileHandler(log_file),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+# Create a logger
+# Named logger used by the rest of this module; its level is pinned to DEBUG
+# explicitly rather than inherited.
+logger = logging.getLogger("internvl_analyzer")
+logger.setLevel(logging.DEBUG)
+# Log startup information
+# Banner with interpreter, torch and CUDA details, framed by two "=" rules.
+logger.info("="*50)
+logger.info("InternVL2.5 Image Analyzer starting up")
+logger.info(f"Log file: {log_file}")
+logger.info(f"Python version: {sys.version}")
+logger.info(f"Torch version: {torch.__version__}")
+logger.info(f"CUDA available: {torch.cuda.is_available()}")
+# CUDA version and GPU name are only queried when CUDA is available; only
+# device index 0 is reported.
+if torch.cuda.is_available():
+    logger.info(f"CUDA version: {torch.version.cuda}")
+    logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
+logger.info("="*50)
+# Function to log tensor info for debugging
+def log_tensor_info(tensor, name="tensor"):
+    """Log a debug summary of a tensor, or of a list/tuple of items.
+
+    Args:
+        tensor: A ``torch.Tensor``, a list or tuple (only the first three
+            items are described), ``None``, or any other object (only its
+            type is logged).
+        name: Label used to identify the value in the log messages.
+
+    Returns:
+        None. All output goes to the module-level ``logger``. Any exception
+        raised while inspecting the value is logged and swallowed, so this
+        diagnostics helper never breaks its caller.
+    """
+    if tensor is None:
+        logger.warning(f"{name} is None")
+        return
+    try:
+        if isinstance(tensor, (list, tuple)):
+            # type(...).__name__ yields "list" for lists, so the message for
+            # lists is unchanged; tuples are now summarised the same way.
+            logger.debug(f"{name} is a {type(tensor).__name__} of length {len(tensor)}")
+            for i, item in enumerate(tensor[:3]):  # Log first 3 items
+                item_type = type(item)
+                item_shape = getattr(item, "shape", "unknown")
+                item_dtype = getattr(item, "dtype", "unknown")
+                logger.debug(f"  - Item {i}: type={item_type}, shape={item_shape}, dtype={item_dtype}")
+            if len(tensor) > 3:
+                logger.debug(f"  - ... and {len(tensor)-3} more items")
+        elif isinstance(tensor, torch.Tensor):
+            logger.debug(f"{name} is a tensor: shape={tensor.shape}, dtype={tensor.dtype}, device={tensor.device}")
+            # Log additional stats for numerical issues
+            if tensor.numel() > 0:
+                try:
+                    logger.debug(f"  - Stats: min={tensor.min().item():.4f}, max={tensor.max().item():.4f}, "
+                                f"mean={tensor.mean().item():.4f}, std={tensor.std().item():.4f}")
+                except Exception as stats_err:
+                    # Stats are best-effort (e.g. mean()/std() reject integer
+                    # dtypes), but say why they are missing instead of
+                    # silently dropping them. Unlike a bare except, this does
+                    # not intercept KeyboardInterrupt or SystemExit.
+                    logger.debug(f"  - Stats unavailable: {stats_err}")
+            logger.debug(f"  - Requires grad: {tensor.requires_grad}")
+        else:
+            logger.debug(f"{name} is type {type(tensor)}")
+    except Exception as e:
+        logger.error(f"Error logging tensor info for {name}: {str(e)}")
 # Constants
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 # If HF_TOKEN exists in environment, use it for authentication
 hf_token = os.environ.get("HUGGINGFACE_TOKEN", None)
 if hf_token:
+    logger.info("Logging in to Hugging Face Hub with token...")
     login(token=hf_token)
 else:
+    logger.info("No Hugging Face token found in environment. Model may not load if it's private.")
 # Supported image file extensions
 SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.pdf']
 def process_pdf(pdf_path=None, pdf_bytes=None):
+    """Convert a PDF into a list of PIL images of its pages.
+
+    ``pdf_path`` takes precedence when it is truthy; otherwise ``pdf_bytes``
+    is used. Each source is first converted with pdf2image's default
+    settings. If that raises, one retry is made at 150 DPI using the
+    pdftocairo backend instead of the default pdftoppm.
+
+    Args:
+        pdf_path: Filesystem path of the PDF to convert.
+        pdf_bytes: Raw bytes of the PDF to convert.
+
+    Returns:
+        The list of page images, or ``None`` when no source was given, the
+        conversion produced no pages, or both conversion attempts failed.
+        Failures are logged with their traceback and never raised.
+    """
     try:
+        logger.info(f"Processing PDF: {pdf_path}")
+        logger.debug(f"Current working directory: {os.getcwd()}")
         if pdf_path:
             # Convert PDF file pages to PIL images
+            logger.info(f"Converting PDF from path: {pdf_path}")
+            logger.debug(f"PDF path exists: {os.path.exists(pdf_path)}")
+            logger.debug(f"PDF path is file: {os.path.isfile(pdf_path)}")
+            logger.debug(f"PDF file size: {os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 'N/A'} bytes")
+            try:
+                images = convert_from_path(pdf_path)
+                logger.info(f"PDF successfully converted to {len(images)} images")
+            except Exception as pdf_err:
+                logger.error(f"Error in convert_from_path: {str(pdf_err)}")
+                logger.error(traceback.format_exc())
+                # Retry with settings that really differ from the defaults
+                # used above; use_pdftocairo=False and strict=False are the
+                # library defaults, so the backend must be switched explicitly.
+                logger.info("Attempting alternative PDF conversion")
+                try:
+                    images = convert_from_path(
+                        pdf_path,
+                        dpi=150,  # Lower DPI for better compatibility
+                        use_pdftocairo=True,  # Switch backend: pdftocairo instead of pdftoppm
+                        strict=False  # Be more lenient with errors
+                    )
+                    logger.info(f"Alternative conversion successful: {len(images)} images")
+                except Exception as alt_err:
+                    logger.error(f"Alternative conversion also failed: {str(alt_err)}")
+                    logger.error(traceback.format_exc())
+                    # Handled by the outer except, which returns None
+                    raise
         elif pdf_bytes:
             # Convert PDF bytes to PIL images
+            logger.info("Converting PDF from bytes")
+            logger.debug(f"PDF bytes size: {len(pdf_bytes)} bytes")
+            try:
+                images = convert_from_bytes(pdf_bytes)
+                logger.info(f"PDF bytes successfully converted to {len(images)} images")
+            except Exception as bytes_err:
+                logger.error(f"Error in convert_from_bytes: {str(bytes_err)}")
+                logger.error(traceback.format_exc())
+                # Same retry strategy as the path branch: lower DPI and the
+                # pdftocairo backend.
+                logger.info("Attempting alternative PDF bytes conversion")
+                try:
+                    images = convert_from_bytes(
+                        pdf_bytes,
+                        dpi=150,  # Lower DPI
+                        use_pdftocairo=True,  # Switch backend: pdftocairo instead of pdftoppm
+                        strict=False
+                    )
+                    logger.info(f"Alternative bytes conversion successful: {len(images)} images")
+                except Exception as alt_bytes_err:
+                    logger.error(f"Alternative bytes conversion also failed: {str(alt_bytes_err)}")
+                    logger.error(traceback.format_exc())
+                    # Handled by the outer except, which returns None
+                    raise
         else:
+            logger.error("No PDF source provided")
+            return None
+        # Validate and log the output
+        if not images:
+            logger.error("PDF conversion returned empty list")
             return None
+        # Log details about the first few converted images
+        for i, img in enumerate(images[:2]):  # Log first 2 pages
+            logger.debug(f"PDF Page {i+1}: size={img.size}, mode={img.mode}")
+        logger.info(f"PDF successfully processed, returning {len(images)} images")
         return images
     except Exception as e:
+        logger.error(f"Fatal error in process_pdf: {str(e)}")
+        logger.error(traceback.format_exc())
         return None
 # Function to analyze images with a prompt
         # For PDF files, handle differently
         if file_name.lower().endswith('.pdf'):
             try:
+                logger.info(f"Processing PDF file in folder analysis: {image_file}")
+                logger.debug(f"PDF absolute path: {os.path.abspath(image_file)}")
+                logger.debug(f"PDF exists: {os.path.exists(image_file)}")
+                logger.debug(f"PDF file size: {os.path.getsize(image_file) if os.path.exists(image_file) else 'N/A'}")
                 # Load model here to ensure it's ready
                 model, tokenizer = load_model()
                 if model is None or tokenizer is None:
+                    logger.error("Model failed to load for PDF analysis")
                     result += "Error: Model failed to load for PDF analysis.\n"
                     continue
                 # Try a completely different approach for PDFs to avoid tensor issues
                 try:
+                    # Convert PDF to images with detailed logging
+                    logger.info(f"Starting PDF to image conversion for {file_name}")
+                    with open(image_file, 'rb') as pdf_file:
+                        pdf_data = pdf_file.read()
+                        logger.debug(f"Read {len(pdf_data)} bytes from PDF file")
+                    # Try both methods
+                    try:
+                        logger.debug("Attempting convert_from_path...")
+                        pdf_images = convert_from_path(image_file)
+                        logger.info(f"convert_from_path successful: {len(pdf_images)} pages")
+                    except Exception as path_err:
+                        logger.error(f"convert_from_path failed: {str(path_err)}")
+                        logger.error(traceback.format_exc())
+                        # Fall back to bytes method
+                        logger.debug("Falling back to convert_from_bytes...")
+                        pdf_images = convert_from_bytes(pdf_data)
+                        logger.info(f"convert_from_bytes successful: {len(pdf_images)} pages")
+                    logger.info(f"PDF converted to {len(pdf_images)} pages")
                     if not pdf_images or len(pdf_images) == 0:
+                        logger.error("PDF converted but no pages were extracted")
                         result += "PDF converted but no pages were extracted.\n"
                         continue
                     # Process each page separately to avoid batch issues
                     for i, img in enumerate(pdf_images):
                         try:
+                            logger.info(f"Processing PDF page {i+1}/{len(pdf_images)}")
+                            logger.debug(f"Page {i+1} image: size={img.size}, mode={img.mode}")
                             # Manual preprocessing - don't use the typical image loading pipeline
+                            logger.debug("Converting image to RGB")
                             img = img.convert('RGB')
+                            # Log the image info for debugging
+                            logger.debug(f"After RGB conversion: size={img.size}, mode={img.mode}")
                             # Resize and transform manually
+                            logger.debug(f"Resizing image to {IMAGE_SIZE}x{IMAGE_SIZE}")
                             img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
+                            logger.debug(f"After resize: size={img_resized.size}, mode={img_resized.mode}")
+                            # Build transform and apply it
+                            logger.debug("Building and applying transform")
                             transform = T.Compose([
                                 T.ToTensor(),
                                 T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
                             ])
+                            # Transform to tensor with explicit error handling
+                            try:
+                                logger.debug("Converting image to tensor")
+                                tensor = transform(img_resized)
+                                logger.debug(f"Transform successful: tensor shape={tensor.shape}, dtype={tensor.dtype}")
+                                # Log tensor stats for debugging numerical issues
+                                if tensor.numel() > 0:
+                                    logger.debug(f"Tensor stats: min={tensor.min().item():.4f}, max={tensor.max().item():.4f}, "
+                                                f"mean={tensor.mean().item():.4f}, std={tensor.std().item():.4f}")
+                                # Add batch dimension with careful checking
+                                if isinstance(tensor, torch.Tensor):
+                                    logger.debug("Adding batch dimension")
+                                    tensor = tensor.unsqueeze(0)
+                                    logger.debug(f"After unsqueeze: shape={tensor.shape}")
+                                else:
+                                    logger.error(f"Expected tensor but got {type(tensor)}")
+                                    raise TypeError(f"Transform returned {type(tensor)} instead of tensor")
+                            except Exception as tensor_err:
+                                logger.error(f"Error in tensor creation: {str(tensor_err)}")
+                                logger.error(traceback.format_exc())
+                                raise
                             # Move to device and set data type
                             device = "cuda" if torch.cuda.is_available() else "cpu"
+                            logger.debug(f"Moving tensor to device: {device}")
                             tensor = tensor.to(device)
                             if torch.cuda.is_available():
+                                logger.debug("Converting tensor to bfloat16")
                                 tensor = tensor.to(torch.bfloat16)
+                            logger.info(f"Preprocessed tensor: shape={tensor.shape}, device={tensor.device}, dtype={tensor.dtype}")
                             # Use direct text generation
                             page_prompt = f"PDF Page {i+1}: {prompt}"
+                            logger.debug(f"Preparing tokenization for prompt: {page_prompt}")
                             input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
+                            logger.debug(f"Tokenization complete: shape={input_tokens['input_ids'].shape}")
                             # Generate with proper error handling
                             try:
                                 # Try direct generation first
+                                logger.info("Attempting direct generation")
                                 outputs = model.generate(
                                     input_tokens["input_ids"],
                                     pixel_values=tensor,
                                     max_new_tokens=512,
                                     do_sample=False
                                 )
+                                logger.info("Generation successful")
                                 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                                logger.debug(f"Response length: {len(response)} chars")
                             except Exception as gen_err:
+                                logger.error(f"Error in direct generation: {str(gen_err)}")
+                                logger.error(traceback.format_exc())
                                 # Fall back to chat method
                                 try:
+                                    print("Trying chat method fallback")
                                     question = f"<image>\n{page_prompt}"
+                                    # IMPORTANT: Ensure we're not passing a list here!
+                                    if isinstance(tensor, list):
+                                        print("WARNING: tensor is a list, converting...")
+                                        if len(tensor) > 0:
+                                            # Take the first item if it's a tensor
+                                            if isinstance(tensor[0], torch.Tensor):
+                                                tensor = tensor[0].unsqueeze(0)
+                                            else:
+                                                # Create a new tensor from scratch
+                                                print("Creating new tensor from scratch")
+                                                tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
+                                                                   dtype=torch.float32).to(device)
+                                                if torch.cuda.is_available():
+                                                    tensor = tensor.to(torch.bfloat16)
+                                        else:
+                                            # Create a dummy tensor
+                                            tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
+                                                               dtype=torch.float32).to(device)
+                                            if torch.cuda.is_available():
+                                                tensor = tensor.to(torch.bfloat16)
+                                    # Verify tensor shape and type before passing to model
+                                    print(f"Final tensor type: {type(tensor)}, shape: {tensor.shape if hasattr(tensor, 'shape') else 'unknown'}")
+                                    # Use the chat method with verified tensor
                                     response, _ = model.chat(
                                         tokenizer=tokenizer,
                                         pixel_values=tensor,
                                     )
                                 except Exception as chat_err:
                                     print(f"Chat fallback failed: {str(chat_err)}")
+                                    import traceback
+                                    print(traceback.format_exc())
+                                    # Last attempt - use direct model forward pass
+                                    try:
+                                        print("Attempting direct model forward pass")
+                                        # Create inputs manually
+                                        if hasattr(model, "forward"):
+                                            # Create tensors from scratch if needed
+                                            if not isinstance(tensor, torch.Tensor):
+                                                tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
+                                                                   dtype=torch.float32).to(device)
+                                                if torch.cuda.is_available():
+                                                    tensor = tensor.to(torch.bfloat16)
+                                            # Get input tokens in the right format
+                                            input_ids = input_tokens["input_ids"]
+                                            if len(input_ids.shape) == 1:
+                                                input_ids = input_ids.unsqueeze(0)
+                                            # Prepare inputs for direct call
+                                            inputs = {
+                                                "input_ids": input_ids,
+                                                "pixel_values": tensor,
+                                                "return_dict": True,
+                                            }
+                                            # Call model directly
+                                            outputs = model(**inputs)
+                                            # Try to get some output
+                                            if hasattr(outputs, "logits") and outputs.logits is not None:
+                                                pred_ids = torch.argmax(outputs.logits, dim=-1)
+                                                response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
+                                            else:
+                                                response = "Failed to generate analysis - model output didn't contain usable data"
+                                        else:
+                                            response = "Failed to generate analysis - model doesn't support direct calling"
+                                    except Exception as final_err:
+                                        print(f"All attempts failed: {str(final_err)}")
+                                        import traceback
+                                        print(traceback.format_exc())
+                                        response = f"Analysis failed due to model error: {str(final_err)}"
                             # Add to result
                             result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
 def _generate_directly(model, tokenizer, input_tokens, tensor, prompt):
     """Strategy 1: call ``model.generate`` on the prompt tokens and image tensor.

     ``prompt`` is unused here; it is accepted so that every strategy shares
     one call signature. Returns the stripped decoded text. Any exception
     propagates so the caller can move on to the next strategy.
     """
     logger.info("Attempting direct generation")
     input_ids = input_tokens["input_ids"]
     logger.debug(f"Checking input token tensor: shape={input_ids.shape}, device={input_ids.device}")
     logger.debug(f"Checking image tensor: shape={tensor.shape}, device={tensor.device}")
     # Pass both tensors by keyword: a positional (input_ids, tensor) call
     # depends on the parameter order of generate(), and InternVL's chat model
     # appears to declare pixel_values first -- TODO confirm against the class
     # returned by load_model(). temperature is omitted because it has no
     # effect when do_sample=False (greedy decoding).
     output_ids = model.generate(
         input_ids=input_ids,
         pixel_values=tensor,
         max_new_tokens=512,
         do_sample=False,
     )
     logger.info("Direct generation successful")
     logger.debug(f"Output IDs shape: {output_ids.shape}")
     output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
     logger.debug(f"Decoded output length: {len(output)} chars")
     return output.strip()


 def _generate_with_chat(model, tokenizer, input_tokens, tensor, prompt):
     """Strategy 2: ask the model through its ``chat`` helper.

     The prompt is prefixed with an ``<image>`` placeholder line before being
     sent as the question. ``input_tokens`` is unused (shared signature).
     Returns the stripped response text; exceptions propagate to the caller.
     """
     logger.info("Attempting chat method")
     question = f"<image>\n{prompt}"
     logger.debug(f"Chat question: {question}")
     # The caller has already verified that ``tensor`` is a torch.Tensor, so
     # no second type check is needed here.
     response, _ = model.chat(
         tokenizer=tokenizer,
         pixel_values=tensor,
         question=question,
         generation_config={"max_new_tokens": 512, "do_sample": False},
         history=None,
         return_history=True,
     )
     logger.info("Chat method successful")
     logger.debug(f"Chat response length: {len(response)} chars")
     return response.strip()


 def _generate_with_forward(model, tokenizer, input_tokens, tensor, prompt):
     """Strategy 3: run one forward pass and decode the argmax of the logits.

     NOTE: this is a last-resort fallback. A single forward pass yields one
     predicted token per input position, not an autoregressively generated
     answer, so the decoded text is best-effort only.

     ``prompt`` is unused (shared signature). Returns the stripped decoded
     text, or a "Failed to analyze image" message when the model has no
     ``forward`` attribute or its output carries no logits. Exceptions
     propagate to the caller.
     """
     logger.info("Attempting direct model forward call")
     if not hasattr(model, "forward"):
         logger.error("Model does not have forward method")
         return "Failed to analyze image - model doesn't support direct calling"
     logger.debug("Model has forward method")
     logger.debug("Preparing inputs for direct forward pass")
     inputs = {
         "input_ids": input_tokens["input_ids"],
         "pixel_values": tensor,
         "return_dict": True,
     }
     for name, value in inputs.items():
         if hasattr(value, 'shape'):
             logger.debug(f"Input '{name}' shape: {value.shape}")
     logger.debug("Calling model.forward")
     outputs = model(**inputs)
     logits = getattr(outputs, "logits", None)
     if logits is None:
         logger.error("Model output does not contain logits")
         return "Failed to analyze image - model output contains no usable data"
     logger.debug(f"Got logits with shape: {logits.shape}")
     pred_ids = torch.argmax(logits, dim=-1)
     logger.debug(f"Prediction IDs shape: {pred_ids.shape}")
     response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
     logger.debug(f"Decoded response length: {len(response)} chars")
     return response.strip()


 def process_image_with_text(image, prompt):
     """Process a single image with the InternVL model and a text prompt.

     The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE,
     normalized with the ImageNet mean/std and turned into a batched tensor
     (bfloat16 on CUDA). Three strategies are then tried in order until one
     returns: ``model.generate``, ``model.chat``, and a raw forward pass.

     Args:
         image: Object exposing PIL-style ``convert`` and ``resize`` methods
             (presumably a ``PIL.Image.Image`` -- confirm against callers).
         prompt: Text instruction; it is tokenized for the direct paths and
             embedded in the question for the chat path.

     Returns:
         str: The model's stripped response, or a human-readable error
         message. Exceptions derived from ``Exception`` are logged and
         converted into an error string instead of being raised.
     """
     try:
         logger.info(f"process_image_with_text called with image type: {type(image)}")
         # Debug info for image
         if hasattr(image, 'size'):
             logger.debug(f"Image dimensions: {image.size}")
         if hasattr(image, 'mode'):
             logger.debug(f"Image mode: {image.mode}")
         # Log memory usage
         use_cuda = torch.cuda.is_available()
         if use_cuda:
             logger.debug(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
             logger.debug(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
         # Load model if not already loaded
         logger.debug("Loading model")
         model, tokenizer = load_model()
         if model is None or tokenizer is None:
             logger.error("Model failed to load")
             return "Error loading model. Please check the logs for details."
         logger.debug("Model loaded successfully")
         device = "cuda" if use_cuda else "cpu"
         # Build the tensor here instead of going through load_image, which
         # might return a list rather than a single tensor.
         try:
             logger.debug("Converting image to RGB if needed")
             if not hasattr(image, 'convert'):
                 logger.error("Image does not have convert method")
                 return "Error: Unable to convert image to RGB"
             image = image.convert('RGB')
             logger.debug(f"After conversion: mode={image.mode}, size={image.size}")
             # Resize for consistent dimensions
             logger.debug(f"Resizing image to {IMAGE_SIZE}x{IMAGE_SIZE}")
             if not hasattr(image, 'resize'):
                 logger.error("Image does not have resize method")
                 return "Error: Unable to resize image"
             image_resized = image.resize((IMAGE_SIZE, IMAGE_SIZE))
             logger.debug(f"After resize: size={image_resized.size}")
             logger.debug("Creating transform")
             transform = T.Compose([
                 T.ToTensor(),
                 T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
             ])
             logger.debug("Converting image to tensor")
             tensor = transform(image_resized)
             if not isinstance(tensor, torch.Tensor):
                 logger.error(f"Transform did not return a tensor: {type(tensor)}")
                 raise TypeError(f"Expected torch.Tensor but got {type(tensor)}")
             logger.debug(f"Image transformed to tensor: shape={tensor.shape}, dtype={tensor.dtype}")
             if tensor.numel() > 0:
                 logger.debug(f"Tensor stats: min={tensor.min().item():.4f}, max={tensor.max().item():.4f}, "
                              f"mean={tensor.mean().item():.4f}, std={tensor.std().item():.4f}")
             # Ensure we have a 4D tensor [batch, channels, height, width]
             logger.debug("Adding batch dimension if needed")
             if len(tensor.shape) == 3:
                 tensor = tensor.unsqueeze(0)
                 logger.debug(f"Added batch dimension, new shape: {tensor.shape}")
             logger.debug(f"Moving tensor to device: {device}")
             tensor = tensor.to(device)
             if use_cuda:
                 logger.debug("Converting tensor to bfloat16")
                 tensor = tensor.to(torch.bfloat16)
                 logger.debug(f"Tensor converted to bfloat16, new dtype: {tensor.dtype}")
             logger.info(f"Final tensor prepared: shape={tensor.shape}, device={tensor.device}, dtype={tensor.dtype}")
         except Exception as tensor_err:
             # logger.exception records the traceback itself, so this block
             # does not depend on a module-level ``traceback`` import.
             logger.exception(f"Error in tensor creation: {tensor_err}")
             return f"Error preparing image for analysis: {tensor_err}"
         # Process the prompt
         logger.debug(f"Tokenizing prompt: {prompt}")
         input_tokens = tokenizer(prompt, return_tensors="pt").to(device)
         logger.debug(f"Input tokens shape: {input_tokens['input_ids'].shape}")
         # Try each strategy in order; the first one that returns wins. The
         # labels reproduce the log messages of the individual fallbacks.
         strategies = (
             ("Direct generation", _generate_directly),
             ("Chat method", _generate_with_chat),
             ("Forward method", _generate_with_forward),
         )
         with torch.inference_mode():
             for label, strategy in strategies:
                 try:
                     return strategy(model, tokenizer, input_tokens, tensor, prompt)
                 except Exception as strategy_err:
                     logger.exception(f"{label} failed: {strategy_err}")
         # All methods failed
         return "Error generating analysis: All methods failed to process the image"
     except Exception as e:
         logger.exception(f"Fatal error in process_image_with_text: {e}")
         return f"Error processing image: {e}"
 # Main function