mknolan committed on
Commit
c39dc7d
·
verified ·
1 Parent(s): b517f60

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +450 -163
app.py CHANGED
@@ -17,6 +17,71 @@ import json
17
  import re
18
  from pdf2image import convert_from_path, convert_from_bytes
19
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Constants
22
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
@@ -36,10 +101,10 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
36
  # If HUGGINGFACE_TOKEN exists in environment, use it for authentication
37
  hf_token = os.environ.get("HUGGINGFACE_TOKEN", None)
38
  if hf_token:
39
- print("Logging in to Hugging Face Hub with token...")
40
  login(token=hf_token)
41
  else:
42
- print("No Hugging Face token found in environment. Model may not load if it's private.")
43
 
44
  # Supported image file extensions
45
  SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.pdf']
@@ -402,25 +467,79 @@ def analyze_dual_images(model, tokenizer, image1, image2, prompt):
402
  def process_pdf(pdf_path=None, pdf_bytes=None):
403
  """Convert PDF file to a list of PIL images."""
404
  try:
405
- print(f"Processing PDF: {pdf_path}")
 
 
406
  if pdf_path:
407
  # Convert PDF file pages to PIL images
408
- print(f"Converting PDF from path: {pdf_path}")
409
- images = convert_from_path(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  elif pdf_bytes:
411
  # Convert PDF bytes to PIL images
412
- print("Converting PDF from bytes")
413
- images = convert_from_bytes(pdf_bytes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  else:
415
- print("No PDF source provided")
 
 
 
 
 
416
  return None
417
 
418
- print(f"PDF converted to {len(images)} images")
 
 
 
 
419
  return images
420
  except Exception as e:
421
- print(f"Error processing PDF: {str(e)}")
422
- import traceback
423
- print(traceback.format_exc())
424
  return None
425
 
426
  # Function to analyze images with a prompt
@@ -848,67 +967,159 @@ def analyze_folder_images(folder_path, prompt):
848
  # For PDF files, handle differently
849
  if file_name.lower().endswith('.pdf'):
850
  try:
851
- print(f"Processing PDF file: {image_file}")
 
 
 
 
852
  # Load model here to ensure it's ready
853
  model, tokenizer = load_model()
854
  if model is None or tokenizer is None:
 
855
  result += "Error: Model failed to load for PDF analysis.\n"
856
  continue
857
 
858
  # Try a completely different approach for PDFs to avoid tensor issues
859
  try:
860
- # Convert PDF to images
861
- pdf_images = convert_from_path(image_file)
862
- print(f"PDF converted to {len(pdf_images)} pages")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
 
864
  if not pdf_images or len(pdf_images) == 0:
 
865
  result += "PDF converted but no pages were extracted.\n"
866
  continue
867
 
868
  # Process each page separately to avoid batch issues
869
  for i, img in enumerate(pdf_images):
870
  try:
871
- print(f"Processing PDF page {i+1}/{len(pdf_images)}")
 
872
 
873
  # Manual preprocessing - don't use the typical image loading pipeline
 
874
  img = img.convert('RGB')
875
 
 
 
 
876
  # Resize and transform manually
 
877
  img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
 
 
 
 
878
  transform = T.Compose([
879
  T.ToTensor(),
880
  T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
881
  ])
882
- tensor = transform(img_resized).unsqueeze(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
883
 
884
  # Move to device and set data type
885
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
886
  tensor = tensor.to(device)
 
887
  if torch.cuda.is_available():
 
888
  tensor = tensor.to(torch.bfloat16)
889
 
890
- print(f"Preprocessed tensor shape: {tensor.shape}, device: {tensor.device}")
891
 
892
  # Use direct text generation
893
  page_prompt = f"PDF Page {i+1}: {prompt}"
 
894
  input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
 
895
 
896
  # Generate with proper error handling
897
  try:
898
  # Try direct generation first
 
899
  outputs = model.generate(
900
  input_tokens["input_ids"],
901
  pixel_values=tensor,
902
  max_new_tokens=512,
903
  do_sample=False
904
  )
 
905
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
906
  except Exception as gen_err:
907
- print(f"Error in direct generation: {str(gen_err)}")
 
908
 
909
  # Fall back to chat method
910
  try:
 
911
  question = f"<image>\n{page_prompt}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
912
  response, _ = model.chat(
913
  tokenizer=tokenizer,
914
  pixel_values=tensor,
@@ -919,7 +1130,47 @@ def analyze_folder_images(folder_path, prompt):
919
  )
920
  except Exception as chat_err:
921
  print(f"Chat fallback failed: {str(chat_err)}")
922
- response = f"Analysis failed due to model error: {str(chat_err)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
923
 
924
  # Add to result
925
  result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
@@ -959,168 +1210,204 @@ def analyze_folder_images(folder_path, prompt):
959
  def process_image_with_text(image, prompt):
960
  """Process a single image with the InternVL model and a text prompt."""
961
  try:
962
- print(f"process_image_with_text called with image type: {type(image)}")
 
 
 
 
 
 
 
 
 
 
 
 
963
  # Load model if not already loaded
 
964
  model, tokenizer = load_model()
965
  if model is None or tokenizer is None:
 
966
  return "Error loading model. Please check the logs for details."
967
 
968
- # Prepare image
969
- pixel_values = load_image(image)
970
- if pixel_values is None:
971
- return "Error preparing image."
 
 
 
 
 
 
 
 
 
972
 
973
- # Debug info
974
- print(f"Image processed: tensor type {type(pixel_values)}, shape {pixel_values.shape if hasattr(pixel_values, 'shape') else 'unknown'}, dtype {pixel_values.dtype if hasattr(pixel_values, 'dtype') else 'unknown'}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975
 
976
  # Process the prompt
977
- input_tokens = tokenizer(prompt)
 
 
978
 
979
- # Generate description
980
  with torch.inference_mode():
981
- # Check if pixel_values is a list or tensor and handle accordingly
982
- if isinstance(pixel_values, list):
983
- # If it's a list, we need to process each element separately
984
- print(f"WARNING: pixel_values is a list of length {len(pixel_values)} instead of a tensor")
985
- results = []
986
- for i, pv in enumerate(pixel_values):
987
- try:
988
- # Convert to tensor if it's not already
989
- if not isinstance(pv, torch.Tensor):
990
- print(f"Converting item {i} from {type(pv)} to tensor")
991
- # Convert to numpy first if needed
992
- if not isinstance(pv, np.ndarray):
993
- if hasattr(pv, 'numpy'):
994
- pv = pv.numpy()
995
- else:
996
- pv = np.array(pv)
997
- # Then convert to tensor
998
- pv = torch.from_numpy(pv).float()
999
-
1000
- # Make sure it's the right shape
1001
- if len(pv.shape) == 3: # Add batch dimension if needed
1002
- pv = pv.unsqueeze(0)
1003
-
1004
- # Move to device
1005
- pv = pv.to("cuda" if torch.cuda.is_available() else "cpu")
1006
-
1007
- # Use model.generate directly
1008
- try:
1009
- output_ids = model.generate(
1010
- input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
1011
- pv,
1012
- max_new_tokens=512,
1013
- temperature=0.1,
1014
- do_sample=False
1015
- )
1016
- output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
1017
- except Exception as gen_error:
1018
- print(f"Error in direct generation: {str(gen_error)}")
1019
-
1020
- # Fall back to chat method
1021
- try:
1022
- question = f"<image>\n{prompt}"
1023
- response, _ = model.chat(
1024
- tokenizer=tokenizer,
1025
- pixel_values=pv,
1026
- question=question,
1027
- generation_config={"max_new_tokens": 512, "do_sample": False},
1028
- history=None,
1029
- return_history=True
1030
- )
1031
- except Exception as chat_err:
1032
- print(f"Chat fallback failed: {str(chat_err)}")
1033
- output = f"Error analyzing image: {str(chat_err)}"
1034
-
1035
- results.append(output.strip())
1036
- except Exception as item_error:
1037
- print(f"Error processing item {i}: {str(item_error)}")
1038
- import traceback
1039
- print(traceback.format_exc())
1040
- results.append(f"Error: {str(item_error)}")
1041
-
1042
- return "\n".join(results)
1043
- else:
1044
- # Normal tensor processing
1045
  try:
1046
- # Ensure pixel_values is a proper 4D tensor [batch, channels, height, width]
1047
- if len(pixel_values.shape) == 3:
1048
- pixel_values = pixel_values.unsqueeze(0)
1049
- print(f"Added batch dimension, new shape: {pixel_values.shape}")
1050
-
1051
- # Move tensors to the same device
1052
- device = "cuda" if torch.cuda.is_available() else "cpu"
1053
- pixel_values = pixel_values.to(device)
1054
- input_ids = input_tokens["input_ids"].unsqueeze(0).to(device)
1055
 
1056
- print(f"Running model with pixel_values shape: {pixel_values.shape}, device: {pixel_values.device}")
1057
- print(f"Input IDs shape: {input_ids.shape}, device: {input_ids.device}")
 
 
1058
 
1059
- # Run the model
1060
- output_ids = model.generate(
1061
- input_ids,
1062
- pixel_values,
1063
- max_new_tokens=512,
1064
- temperature=0.1,
1065
- do_sample=False
1066
  )
1067
 
1068
- # Decode the output
1069
- output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
1070
- return output.strip()
1071
- except Exception as tensor_error:
1072
- print(f"Error in tensor processing: {str(tensor_error)}")
1073
- import traceback
1074
- print(traceback.format_exc())
1075
 
1076
- # Try to use the model's chat method instead as a fallback
1077
  try:
1078
- print("Falling back to model.chat() method")
1079
- question = f"<image>\n{prompt}"
1080
- response, _ = model.chat(
1081
- tokenizer=tokenizer,
1082
- pixel_values=pixel_values,
1083
- question=question,
1084
- generation_config={"max_new_tokens": 512, "do_sample": False},
1085
- history=None,
1086
- return_history=True
1087
- )
1088
- return response
1089
- except Exception as chat_error:
1090
- print(f"Fallback also failed: {str(chat_error)}")
1091
- print(traceback.format_exc())
1092
 
1093
- # Try one more approach - use the raw model architecture directly
1094
- try:
1095
- print("Attempting direct model call as last resort")
1096
- # Try to reshape tensors to make them compatible
1097
- if hasattr(model, "forward"):
1098
- # Get only necessary inputs
1099
- inputs = {
1100
- "input_ids": input_ids,
1101
- "pixel_values": pixel_values,
1102
- "return_dict": True,
1103
- }
1104
- # Call model directly
1105
- outputs = model(**inputs)
1106
- # Try to get some meaningful output
1107
- if hasattr(outputs, "logits") and outputs.logits is not None:
1108
- pred_ids = torch.argmax(outputs.logits, dim=-1)
1109
- response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
1110
- return response
1111
- else:
1112
- return "Model output did not contain usable results"
1113
- else:
1114
- return "Model does not support direct calling"
1115
- except Exception as direct_error:
1116
- print(f"Direct model call failed: {str(direct_error)}")
1117
- print(traceback.format_exc())
1118
 
1119
- return f"Error processing image: Unable to generate analysis. {str(tensor_error)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1120
  except Exception as e:
1121
- print(f"Outer exception in process_image_with_text: {str(e)}")
1122
- import traceback
1123
- print(traceback.format_exc())
1124
  return f"Error processing image: {str(e)}"
1125
 
1126
  # Main function
 
17
  import re
18
  from pdf2image import convert_from_path, convert_from_bytes
19
  import tempfile
20
+ import logging
21
+ import traceback
22
+
23
# Logging setup: each run writes to its own timestamped file under LOG_DIR
# and mirrors the same records to stdout.
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)
_startup_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = os.path.join(LOG_DIR, f"app_debug_{_startup_stamp}.log")

# Root configuration: DEBUG level, one file handler plus one stdout handler.
_log_handlers = [
    logging.FileHandler(log_file),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.DEBUG,
)

# Application logger used by the rest of the module.
logger = logging.getLogger("internvl_analyzer")
logger.setLevel(logging.DEBUG)

# Startup banner describing the runtime environment.
_banner = "="*50
_cuda_ready = torch.cuda.is_available()
logger.info(_banner)
logger.info("InternVL2.5 Image Analyzer starting up")
logger.info(f"Log file: {log_file}")
logger.info(f"Python version: {sys.version}")
logger.info(f"Torch version: {torch.__version__}")
logger.info(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    # GPU details are only queried when CUDA is actually usable.
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
logger.info(_banner)
53
+
54
+ # Function to log tensor info for debugging
55
def log_tensor_info(tensor, name="tensor"):
    """Log debugging details about a tensor, a list, or any other object.

    Args:
        tensor: The object to describe. ``None`` is reported as a warning,
            a ``list`` has its length and first three items described, a
            ``torch.Tensor`` has its shape, dtype, device, basic statistics
            and ``requires_grad`` flag logged; anything else only has its
            type logged.
        name: Label used in every log line to identify the object.

    Returns:
        None. This helper only writes to the module logger and never raises;
        unexpected failures are logged at ERROR level instead.
    """
    if tensor is None:
        logger.warning(f"{name} is None")
        return

    try:
        if isinstance(tensor, list):
            logger.debug(f"{name} is a list of length {len(tensor)}")
            # Only the first three items are described to keep the log short.
            for i, item in enumerate(tensor[:3]):
                item_type = type(item)
                item_shape = getattr(item, "shape", "unknown")
                item_dtype = getattr(item, "dtype", "unknown")
                logger.debug(f" - Item {i}: type={item_type}, shape={item_shape}, dtype={item_dtype}")
            if len(tensor) > 3:
                logger.debug(f" - ... and {len(tensor)-3} more items")
        elif isinstance(tensor, torch.Tensor):
            logger.debug(f"{name} is a tensor: shape={tensor.shape}, dtype={tensor.dtype}, device={tensor.device}")
            # Statistics help spot numerical problems (NaNs, saturated values).
            if tensor.numel() > 0:
                try:
                    values = tensor.detach()
                    # mean()/std() are undefined for integer and bool dtypes,
                    # so compute the statistics on a float copy in that case.
                    if not (values.is_floating_point() or values.is_complex()):
                        values = values.float()
                    logger.debug(f" - Stats: min={values.min().item():.4f}, max={values.max().item():.4f}, "
                                 f"mean={values.mean().item():.4f}, std={values.std().item():.4f}")
                except Exception as stats_err:
                    # Statistics are best-effort; record why they were skipped
                    # instead of swallowing the failure silently.
                    logger.debug(f" - Stats unavailable: {stats_err}")
            logger.debug(f" - Requires grad: {tensor.requires_grad}")
        else:
            logger.debug(f"{name} is type {type(tensor)}")
    except Exception as e:
        logger.error(f"Error logging tensor info for {name}: {str(e)}")
85
 
86
  # Constants
87
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
 
101
  # If HUGGINGFACE_TOKEN exists in environment, use it for authentication
102
  hf_token = os.environ.get("HUGGINGFACE_TOKEN", None)
103
  if hf_token:
104
+ logger.info("Logging in to Hugging Face Hub with token...")
105
  login(token=hf_token)
106
  else:
107
+ logger.info("No Hugging Face token found in environment. Model may not load if it's private.")
108
 
109
  # Supported image file extensions
110
  SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.pdf']
 
467
def _convert_pdf_with_fallback(convert, source, kind=""):
    """Run a pdf2image converter, retrying once with fallback settings.

    Args:
        convert: ``convert_from_path`` or ``convert_from_bytes``.
        source: The PDF path or the raw PDF bytes matching ``convert``.
        kind: Text inserted into log messages ("" for paths, "bytes " for
            in-memory data).

    Returns:
        The list of page images produced by ``convert``.

    Raises:
        Exception: Whatever the fallback conversion raised when both
            attempts fail.
    """
    try:
        images = convert(source)
        logger.info(f"PDF {kind}successfully converted to {len(images)} images")
        return images
    except Exception as first_err:
        logger.error(f"Error in {convert.__name__}: {str(first_err)}")
        logger.error(traceback.format_exc())

    logger.info(f"Attempting alternative PDF {kind}conversion")
    try:
        images = convert(
            source,
            dpi=150,  # Lower DPI for better compatibility
            # The first attempt used pdf2image's default renderer
            # (use_pdftocairo=False); switch to pdftocairo so the retry
            # really exercises a different backend.
            use_pdftocairo=True,
            strict=False,  # Be more lenient with errors
        )
    except Exception as alt_err:
        logger.error(f"Alternative {kind}conversion also failed: {str(alt_err)}")
        logger.error(traceback.format_exc())
        raise
    logger.info(f"Alternative {kind}conversion successful: {len(images)} images")
    return images


def process_pdf(pdf_path=None, pdf_bytes=None):
    """Convert a PDF file to a list of PIL images.

    Args:
        pdf_path: Path to a PDF file on disk. Takes precedence over
            ``pdf_bytes`` when both are given.
        pdf_bytes: Raw PDF content, used when ``pdf_path`` is empty.

    Returns:
        A list with one image per page, or ``None`` when no source was
        provided, the conversion produced no pages, or every conversion
        attempt failed (the failure is logged, not raised).
    """
    try:
        logger.info(f"Processing PDF: {pdf_path}")
        logger.debug(f"Current working directory: {os.getcwd()}")

        if pdf_path:
            # Convert PDF file pages to PIL images
            logger.info(f"Converting PDF from path: {pdf_path}")
            logger.debug(f"PDF path exists: {os.path.exists(pdf_path)}")
            logger.debug(f"PDF path is file: {os.path.isfile(pdf_path)}")
            logger.debug(f"PDF file size: {os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 'N/A'} bytes")
            images = _convert_pdf_with_fallback(convert_from_path, pdf_path)
        elif pdf_bytes:
            # Convert PDF bytes to PIL images
            logger.info("Converting PDF from bytes")
            logger.debug(f"PDF bytes size: {len(pdf_bytes)} bytes")
            images = _convert_pdf_with_fallback(convert_from_bytes, pdf_bytes, kind="bytes ")
        else:
            logger.error("No PDF source provided")
            return None

        # Validate and log the output
        if not images:
            logger.error("PDF conversion returned empty list")
            return None

        # Log details about the first two converted pages only
        for i, img in enumerate(images[:2]):
            logger.debug(f"PDF Page {i+1}: size={img.size}, mode={img.mode}")

        logger.info(f"PDF successfully processed, returning {len(images)} images")
        return images
    except Exception as e:
        # Callers expect None on failure rather than an exception.
        logger.error(f"Fatal error in process_pdf: {str(e)}")
        logger.error(traceback.format_exc())
        return None
544
 
545
  # Function to analyze images with a prompt
 
967
  # For PDF files, handle differently
968
  if file_name.lower().endswith('.pdf'):
969
  try:
970
+ logger.info(f"Processing PDF file in folder analysis: {image_file}")
971
+ logger.debug(f"PDF absolute path: {os.path.abspath(image_file)}")
972
+ logger.debug(f"PDF exists: {os.path.exists(image_file)}")
973
+ logger.debug(f"PDF file size: {os.path.getsize(image_file) if os.path.exists(image_file) else 'N/A'}")
974
+
975
  # Load model here to ensure it's ready
976
  model, tokenizer = load_model()
977
  if model is None or tokenizer is None:
978
+ logger.error("Model failed to load for PDF analysis")
979
  result += "Error: Model failed to load for PDF analysis.\n"
980
  continue
981
 
982
  # Try a completely different approach for PDFs to avoid tensor issues
983
  try:
984
+ # Convert PDF to images with detailed logging
985
+ logger.info(f"Starting PDF to image conversion for {file_name}")
986
+ with open(image_file, 'rb') as pdf_file:
987
+ pdf_data = pdf_file.read()
988
+ logger.debug(f"Read {len(pdf_data)} bytes from PDF file")
989
+
990
+ # Try both methods
991
+ try:
992
+ logger.debug("Attempting convert_from_path...")
993
+ pdf_images = convert_from_path(image_file)
994
+ logger.info(f"convert_from_path successful: {len(pdf_images)} pages")
995
+ except Exception as path_err:
996
+ logger.error(f"convert_from_path failed: {str(path_err)}")
997
+ logger.error(traceback.format_exc())
998
+
999
+ # Fall back to bytes method
1000
+ logger.debug("Falling back to convert_from_bytes...")
1001
+ pdf_images = convert_from_bytes(pdf_data)
1002
+ logger.info(f"convert_from_bytes successful: {len(pdf_images)} pages")
1003
+
1004
+ logger.info(f"PDF converted to {len(pdf_images)} pages")
1005
 
1006
  if not pdf_images or len(pdf_images) == 0:
1007
+ logger.error("PDF converted but no pages were extracted")
1008
  result += "PDF converted but no pages were extracted.\n"
1009
  continue
1010
 
1011
  # Process each page separately to avoid batch issues
1012
  for i, img in enumerate(pdf_images):
1013
  try:
1014
+ logger.info(f"Processing PDF page {i+1}/{len(pdf_images)}")
1015
+ logger.debug(f"Page {i+1} image: size={img.size}, mode={img.mode}")
1016
 
1017
  # Manual preprocessing - don't use the typical image loading pipeline
1018
+ logger.debug("Converting image to RGB")
1019
  img = img.convert('RGB')
1020
 
1021
+ # Log the image info for debugging
1022
+ logger.debug(f"After RGB conversion: size={img.size}, mode={img.mode}")
1023
+
1024
  # Resize and transform manually
1025
+ logger.debug(f"Resizing image to {IMAGE_SIZE}x{IMAGE_SIZE}")
1026
  img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
1027
+ logger.debug(f"After resize: size={img_resized.size}, mode={img_resized.mode}")
1028
+
1029
+ # Build transform and apply it
1030
+ logger.debug("Building and applying transform")
1031
  transform = T.Compose([
1032
  T.ToTensor(),
1033
  T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
1034
  ])
1035
+
1036
+ # Transform to tensor with explicit error handling
1037
+ try:
1038
+ logger.debug("Converting image to tensor")
1039
+ tensor = transform(img_resized)
1040
+ logger.debug(f"Transform successful: tensor shape={tensor.shape}, dtype={tensor.dtype}")
1041
+
1042
+ # Log tensor stats for debugging numerical issues
1043
+ if tensor.numel() > 0:
1044
+ logger.debug(f"Tensor stats: min={tensor.min().item():.4f}, max={tensor.max().item():.4f}, "
1045
+ f"mean={tensor.mean().item():.4f}, std={tensor.std().item():.4f}")
1046
+
1047
+ # Add batch dimension with careful checking
1048
+ if isinstance(tensor, torch.Tensor):
1049
+ logger.debug("Adding batch dimension")
1050
+ tensor = tensor.unsqueeze(0)
1051
+ logger.debug(f"After unsqueeze: shape={tensor.shape}")
1052
+ else:
1053
+ logger.error(f"Expected tensor but got {type(tensor)}")
1054
+ raise TypeError(f"Transform returned {type(tensor)} instead of tensor")
1055
+ except Exception as tensor_err:
1056
+ logger.error(f"Error in tensor creation: {str(tensor_err)}")
1057
+ logger.error(traceback.format_exc())
1058
+ raise
1059
 
1060
  # Move to device and set data type
1061
  device = "cuda" if torch.cuda.is_available() else "cpu"
1062
+ logger.debug(f"Moving tensor to device: {device}")
1063
  tensor = tensor.to(device)
1064
+
1065
  if torch.cuda.is_available():
1066
+ logger.debug("Converting tensor to bfloat16")
1067
  tensor = tensor.to(torch.bfloat16)
1068
 
1069
+ logger.info(f"Preprocessed tensor: shape={tensor.shape}, device={tensor.device}, dtype={tensor.dtype}")
1070
 
1071
  # Use direct text generation
1072
  page_prompt = f"PDF Page {i+1}: {prompt}"
1073
+ logger.debug(f"Preparing tokenization for prompt: {page_prompt}")
1074
  input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
1075
+ logger.debug(f"Tokenization complete: shape={input_tokens['input_ids'].shape}")
1076
 
1077
  # Generate with proper error handling
1078
  try:
1079
  # Try direct generation first
1080
+ logger.info("Attempting direct generation")
1081
  outputs = model.generate(
1082
  input_tokens["input_ids"],
1083
  pixel_values=tensor,
1084
  max_new_tokens=512,
1085
  do_sample=False
1086
  )
1087
+ logger.info("Generation successful")
1088
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
1089
+ logger.debug(f"Response length: {len(response)} chars")
1090
  except Exception as gen_err:
1091
+ logger.error(f"Error in direct generation: {str(gen_err)}")
1092
+ logger.error(traceback.format_exc())
1093
 
1094
  # Fall back to chat method
1095
  try:
1096
+ print("Trying chat method fallback")
1097
  question = f"<image>\n{page_prompt}"
1098
+ # IMPORTANT: Ensure we're not passing a list here!
1099
+ if isinstance(tensor, list):
1100
+ print("WARNING: tensor is a list, converting...")
1101
+ if len(tensor) > 0:
1102
+ # Take the first item if it's a tensor
1103
+ if isinstance(tensor[0], torch.Tensor):
1104
+ tensor = tensor[0].unsqueeze(0)
1105
+ else:
1106
+ # Create a new tensor from scratch
1107
+ print("Creating new tensor from scratch")
1108
+ tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
1109
+ dtype=torch.float32).to(device)
1110
+ if torch.cuda.is_available():
1111
+ tensor = tensor.to(torch.bfloat16)
1112
+ else:
1113
+ # Create a dummy tensor
1114
+ tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
1115
+ dtype=torch.float32).to(device)
1116
+ if torch.cuda.is_available():
1117
+ tensor = tensor.to(torch.bfloat16)
1118
+
1119
+ # Verify tensor shape and type before passing to model
1120
+ print(f"Final tensor type: {type(tensor)}, shape: {tensor.shape if hasattr(tensor, 'shape') else 'unknown'}")
1121
+
1122
+ # Use the chat method with verified tensor
1123
  response, _ = model.chat(
1124
  tokenizer=tokenizer,
1125
  pixel_values=tensor,
 
1130
  )
1131
  except Exception as chat_err:
1132
  print(f"Chat fallback failed: {str(chat_err)}")
1133
+ import traceback
1134
+ print(traceback.format_exc())
1135
+
1136
+ # Last attempt - use direct model forward pass
1137
+ try:
1138
+ print("Attempting direct model forward pass")
1139
+ # Create inputs manually
1140
+ if hasattr(model, "forward"):
1141
+ # Create tensors from scratch if needed
1142
+ if not isinstance(tensor, torch.Tensor):
1143
+ tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
1144
+ dtype=torch.float32).to(device)
1145
+ if torch.cuda.is_available():
1146
+ tensor = tensor.to(torch.bfloat16)
1147
+
1148
+ # Get input tokens in the right format
1149
+ input_ids = input_tokens["input_ids"]
1150
+ if len(input_ids.shape) == 1:
1151
+ input_ids = input_ids.unsqueeze(0)
1152
+
1153
+ # Prepare inputs for direct call
1154
+ inputs = {
1155
+ "input_ids": input_ids,
1156
+ "pixel_values": tensor,
1157
+ "return_dict": True,
1158
+ }
1159
+ # Call model directly
1160
+ outputs = model(**inputs)
1161
+ # Try to get some output
1162
+ if hasattr(outputs, "logits") and outputs.logits is not None:
1163
+ pred_ids = torch.argmax(outputs.logits, dim=-1)
1164
+ response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
1165
+ else:
1166
+ response = "Failed to generate analysis - model output didn't contain usable data"
1167
+ else:
1168
+ response = "Failed to generate analysis - model doesn't support direct calling"
1169
+ except Exception as final_err:
1170
+ print(f"All attempts failed: {str(final_err)}")
1171
+ import traceback
1172
+ print(traceback.format_exc())
1173
+ response = f"Analysis failed due to model error: {str(final_err)}"
1174
 
1175
  # Add to result
1176
  result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
 
1210
def _generate_direct(model, tokenizer, pixel_values, input_ids, prompt):
    """Approach 1: call ``model.generate`` with the prompt ids and image tensor."""
    logger.info("Attempting direct generation")

    # Double-check inputs
    logger.debug(f"Checking input token tensor: shape={input_ids.shape}, device={input_ids.device}")
    logger.debug(f"Checking image tensor: shape={pixel_values.shape}, device={pixel_values.device}")

    # Both tensors are passed by keyword so they cannot be bound to the wrong
    # parameter, matching the folder-analysis code path (pixel_values=...).
    # No temperature is given: it has no effect when do_sample=False.
    output_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=512,
        do_sample=False,
    )

    logger.info("Direct generation successful")
    logger.debug(f"Output IDs shape: {output_ids.shape}")

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    logger.debug(f"Decoded output length: {len(output)} chars")
    return output.strip()


def _generate_via_chat(model, tokenizer, pixel_values, input_ids, prompt):
    """Approach 2: use the model's ``chat`` method with an ``<image>`` prompt."""
    logger.info("Attempting chat method")
    question = f"<image>\n{prompt}"
    logger.debug(f"Chat question: {question}")

    # Double check tensor
    if not isinstance(pixel_values, torch.Tensor):
        logger.error(f"Chat method: expected torch.Tensor but got {type(pixel_values)}")
        raise TypeError(f"Expected torch.Tensor but got {type(pixel_values)}")

    response, _ = model.chat(
        tokenizer=tokenizer,
        pixel_values=pixel_values,
        question=question,
        generation_config={"max_new_tokens": 512, "do_sample": False},
        history=None,
        return_history=True,
    )

    logger.info("Chat method successful")
    logger.debug(f"Chat response length: {len(response)} chars")
    return response.strip()


def _generate_via_forward(model, tokenizer, pixel_values, input_ids, prompt):
    """Approach 3: run a single forward pass and decode the argmax of the logits."""
    logger.info("Attempting direct model forward call")

    if not hasattr(model, "forward"):
        logger.error("Model does not have forward method")
        return "Failed to analyze image - model doesn't support direct calling"
    logger.debug("Model has forward method")

    # Prepare inputs
    logger.debug("Preparing inputs for direct forward pass")
    inputs = {
        "input_ids": input_ids,
        "pixel_values": pixel_values,
        "return_dict": True,
    }

    # Log input shapes
    for k, v in inputs.items():
        if hasattr(v, 'shape'):
            logger.debug(f"Input '{k}' shape: {v.shape}")

    # Call model directly
    logger.debug("Calling model.forward")
    outputs = model(**inputs)

    # Try to extract output
    if not (hasattr(outputs, "logits") and outputs.logits is not None):
        logger.error("Model output does not contain logits")
        return "Failed to analyze image - model output contains no usable data"
    logger.debug(f"Got logits with shape: {outputs.logits.shape}")

    # NOTE: this decodes the per-position argmax of one forward pass; it is a
    # last-resort output, not autoregressively generated text.
    pred_ids = torch.argmax(outputs.logits, dim=-1)
    logger.debug(f"Prediction IDs shape: {pred_ids.shape}")

    response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
    logger.debug(f"Decoded response length: {len(response)} chars")
    return response.strip()


def process_image_with_text(image, prompt):
    """Process a single image with the InternVL model and a text prompt.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE,
    normalized with the ImageNet mean/std and turned into a batched tensor.
    Three strategies are then tried in order until one succeeds: direct
    ``model.generate``, the model's ``chat`` method, and a raw forward pass.

    Args:
        image: Image object exposing ``convert`` and ``resize`` (the code
            treats it like a PIL image).
        prompt: Text instruction sent to the model together with the image.

    Returns:
        The stripped model response, or a human-readable error string when
        the model cannot be loaded, the image cannot be prepared, or every
        strategy fails. This function does not raise.
    """
    try:
        logger.info(f"process_image_with_text called with image type: {type(image)}")

        # Debug info for image
        if hasattr(image, 'size'):
            logger.debug(f"Image dimensions: {image.size}")
        if hasattr(image, 'mode'):
            logger.debug(f"Image mode: {image.mode}")

        # Log memory usage
        if torch.cuda.is_available():
            logger.debug(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
            logger.debug(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")

        # Load model if not already loaded
        logger.debug("Loading model")
        model, tokenizer = load_model()
        if model is None or tokenizer is None:
            logger.error("Model failed to load")
            return "Error loading model. Please check the logs for details."

        logger.debug("Model loaded successfully")

        # Skip the standard load_image function which might return a list;
        # process the image directly to avoid list issues.
        try:
            # Convert to RGB if needed
            logger.debug("Converting image to RGB if needed")
            if not hasattr(image, 'convert'):
                logger.error("Image does not have convert method")
                return "Error: Unable to convert image to RGB"
            image = image.convert('RGB')
            logger.debug(f"After conversion: mode={image.mode}, size={image.size}")

            # Resize for consistent dimensions
            logger.debug(f"Resizing image to {IMAGE_SIZE}x{IMAGE_SIZE}")
            if not hasattr(image, 'resize'):
                logger.error("Image does not have resize method")
                return "Error: Unable to resize image"
            image_resized = image.resize((IMAGE_SIZE, IMAGE_SIZE))
            logger.debug(f"After resize: size={image_resized.size}")

            # Apply transforms directly
            logger.debug("Creating transform")
            transform = T.Compose([
                T.ToTensor(),
                T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
            ])

            # Convert to tensor safely
            logger.debug("Converting image to tensor")
            tensor = transform(image_resized)

            # Log detailed tensor info
            if not isinstance(tensor, torch.Tensor):
                logger.error(f"Transform did not return a tensor: {type(tensor)}")
                raise TypeError(f"Expected torch.Tensor but got {type(tensor)}")
            logger.debug(f"Image transformed to tensor: shape={tensor.shape}, dtype={tensor.dtype}")
            if tensor.numel() > 0:
                logger.debug(f"Tensor stats: min={tensor.min().item():.4f}, max={tensor.max().item():.4f}, "
                             f"mean={tensor.mean().item():.4f}, std={tensor.std().item():.4f}")

            # Ensure we have a 4D tensor [batch, channels, height, width]
            logger.debug("Adding batch dimension if needed")
            if len(tensor.shape) == 3:
                tensor = tensor.unsqueeze(0)  # Add batch dimension
                logger.debug(f"Added batch dimension, new shape: {tensor.shape}")

            # Move to appropriate device
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.debug(f"Moving tensor to device: {device}")
            tensor = tensor.to(device)

            if torch.cuda.is_available():
                logger.debug("Converting tensor to bfloat16")
                tensor = tensor.to(torch.bfloat16)
                logger.debug(f"Tensor converted to bfloat16, new dtype: {tensor.dtype}")

            logger.info(f"Final tensor prepared: shape={tensor.shape}, device={tensor.device}, dtype={tensor.dtype}")
        except Exception as tensor_err:
            logger.error(f"Error in tensor creation: {str(tensor_err)}")
            logger.error(traceback.format_exc())
            return f"Error preparing image for analysis: {str(tensor_err)}"

        # Process the prompt
        logger.debug(f"Tokenizing prompt: {prompt}")
        input_tokens = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = input_tokens["input_ids"]
        logger.debug(f"Input tokens shape: {input_ids.shape}")

        # Generate description - try each approach in order; the first one
        # that returns wins, and each failure is logged before moving on.
        strategies = (
            ("Direct generation", _generate_direct),
            ("Chat method", _generate_via_chat),
            ("Forward method", _generate_via_forward),
        )
        with torch.inference_mode():
            for label, strategy in strategies:
                try:
                    return strategy(model, tokenizer, tensor, input_ids, prompt)
                except Exception as strategy_err:
                    logger.error(f"{label} failed: {str(strategy_err)}")
                    logger.error(traceback.format_exc())

        # All methods failed
        return "Error generating analysis: All methods failed to process the image"
    except Exception as e:
        logger.error(f"Fatal error in process_image_with_text: {str(e)}")
        logger.error(traceback.format_exc())
        return f"Error processing image: {str(e)}"
1412
 
1413
  # Main function