mknolan committed on
Commit
b517f60
·
verified ·
1 Parent(s): fd5ea34

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +170 -64
app.py CHANGED
@@ -127,6 +127,7 @@ def load_image(image_pil, max_num=12):
127
  # Stack tensors - this is where the error might occur if any element isn't a tensor
128
  try:
129
  pixel_values = torch.stack(pixel_values)
 
130
  except Exception as stack_error:
131
  print(f"Error during tensor stacking: {str(stack_error)}")
132
  # Try to recover - convert any non-tensor to tensor
@@ -142,7 +143,7 @@ def load_image(image_pil, max_num=12):
142
  else:
143
  val = np.array(val)
144
  # Then to tensor
145
- val = torch.from_numpy(val)
146
  fixed_values.append(val)
147
  except Exception as convert_err:
148
  print(f"Failed to convert item {i}: {str(convert_err)}")
@@ -175,7 +176,17 @@ def load_image(image_pil, max_num=12):
175
  # Simplest approach: just convert the single image without splitting
176
  image_pil = image_pil.convert('RGB')
177
  transform = build_transform(IMAGE_SIZE)
178
- tensor = transform(image_pil).unsqueeze(0)
 
 
 
 
 
 
 
 
 
 
179
 
180
  if torch.cuda.is_available():
181
  tensor = tensor.cuda().to(torch.bfloat16)
@@ -186,7 +197,20 @@ def load_image(image_pil, max_num=12):
186
  return tensor
187
  except Exception as recovery_error:
188
  print(f"Recovery attempt also failed: {str(recovery_error)}")
189
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  # Function to split model across GPUs
192
  def split_model(model_name):
@@ -821,78 +845,99 @@ def analyze_folder_images(folder_path, prompt):
821
  file_name = os.path.basename(image_file)
822
  result += f"---\nImage: {file_name}\n"
823
 
824
- # For PDF files, convert to images and analyze each page
825
  if file_name.lower().endswith('.pdf'):
826
  try:
827
  print(f"Processing PDF file: {image_file}")
828
- # Use a completely different approach for PDFs that avoids tensor issues
829
  model, tokenizer = load_model()
830
  if model is None or tokenizer is None:
831
  result += "Error: Model failed to load for PDF analysis.\n"
832
  continue
833
 
834
- # Try conversion with pdf2image
835
  try:
 
836
  pdf_images = convert_from_path(image_file)
837
- print(f"Converted PDF to {len(pdf_images)} pages")
838
- except Exception as pdf_err:
839
- print(f"PDF conversion error: {str(pdf_err)}")
840
- result += f"Failed to convert PDF: {str(pdf_err)}\n"
841
- continue
842
-
843
- if not pdf_images or len(pdf_images) == 0:
844
- result += "PDF converted but no pages were extracted.\n"
845
- continue
846
-
847
- for i, img in enumerate(pdf_images):
848
- try:
849
- print(f"Processing PDF page {i+1} of {len(pdf_images)}")
850
- # Convert to RGB and resize to standard size
851
- img = img.convert('RGB')
852
- # Use the chat() function directly which is more reliable
853
- question = f"<image>\n{prompt}"
854
-
855
- # Process image with proper error handling
856
  try:
857
- # Manually preprocess image
858
- transform = build_transform(IMAGE_SIZE)
859
- # Resize to a standard size to avoid splitting issues
 
 
 
860
  img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
861
- pixel_values = transform(img_resized).unsqueeze(0)
 
 
 
 
862
 
863
- # Move to appropriate device
 
 
864
  if torch.cuda.is_available():
865
- pixel_values = pixel_values.cuda().to(torch.bfloat16)
866
- else:
867
- pixel_values = pixel_values.to(torch.float32)
868
 
869
- print(f"Processed image tensor shape: {pixel_values.shape}, type: {type(pixel_values)}")
870
 
871
- # Use direct generation
872
- input_tokens = tokenizer(prompt)
873
- output_ids = model.generate(
874
- input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
875
- pixel_values,
876
- max_new_tokens=512,
877
- temperature=0.1,
878
- do_sample=False
879
- )
880
- response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
881
- print(f"Successfully generated response for page {i+1}")
882
- except Exception as model_err:
883
- print(f"Error in model generation: {str(model_err)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  import traceback
885
  print(traceback.format_exc())
886
- response = f"Model error: {str(model_err)}"
887
-
888
- result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
889
- except Exception as page_err:
890
- print(f"Page processing error: {str(page_err)}")
891
- import traceback
892
- print(traceback.format_exc())
893
- result += f"\n-- PDF Page {i+1} --\nError: {str(page_err)}\n"
894
  except Exception as e:
895
- print(f"General PDF error: {str(e)}")
896
  import traceback
897
  print(traceback.format_exc())
898
  result += f"Failed to process PDF: {str(e)}\n"
@@ -903,6 +948,9 @@ def analyze_folder_images(folder_path, prompt):
903
  image_result = process_image_with_text(image, prompt)
904
  result += f"\n{image_result}\n"
905
  except Exception as e:
 
 
 
906
  result += f"Error processing image: {str(e)}\n"
907
 
908
  return result
@@ -911,6 +959,7 @@ def analyze_folder_images(folder_path, prompt):
911
  def process_image_with_text(image, prompt):
912
  """Process a single image with the InternVL model and a text prompt."""
913
  try:
 
914
  # Load model if not already loaded
915
  model, tokenizer = load_model()
916
  if model is None or tokenizer is None:
@@ -921,6 +970,9 @@ def process_image_with_text(image, prompt):
921
  if pixel_values is None:
922
  return "Error preparing image."
923
 
 
 
 
924
  # Process the prompt
925
  input_tokens = tokenizer(prompt)
926
 
@@ -952,17 +1004,39 @@ def process_image_with_text(image, prompt):
952
  # Move to device
953
  pv = pv.to("cuda" if torch.cuda.is_available() else "cpu")
954
 
955
- output_ids = model.generate(
956
- input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
957
- pv,
958
- max_new_tokens=512,
959
- temperature=0.1,
960
- do_sample=False
961
- )
962
- output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
  results.append(output.strip())
964
  except Exception as item_error:
965
  print(f"Error processing item {i}: {str(item_error)}")
 
 
966
  results.append(f"Error: {str(item_error)}")
967
 
968
  return "\n".join(results)
@@ -972,12 +1046,16 @@ def process_image_with_text(image, prompt):
972
  # Ensure pixel_values is a proper 4D tensor [batch, channels, height, width]
973
  if len(pixel_values.shape) == 3:
974
  pixel_values = pixel_values.unsqueeze(0)
 
975
 
976
  # Move tensors to the same device
977
  device = "cuda" if torch.cuda.is_available() else "cpu"
978
  pixel_values = pixel_values.to(device)
979
  input_ids = input_tokens["input_ids"].unsqueeze(0).to(device)
980
 
 
 
 
981
  # Run the model
982
  output_ids = model.generate(
983
  input_ids,
@@ -1010,6 +1088,34 @@ def process_image_with_text(image, prompt):
1010
  return response
1011
  except Exception as chat_error:
1012
  print(f"Fallback also failed: {str(chat_error)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1013
  return f"Error processing image: Unable to generate analysis. {str(tensor_error)}"
1014
  except Exception as e:
1015
  print(f"Outer exception in process_image_with_text: {str(e)}")
 
127
  # Stack tensors - this is where the error might occur if any element isn't a tensor
128
  try:
129
  pixel_values = torch.stack(pixel_values)
130
+ print(f"Successfully stacked tensors into shape: {pixel_values.shape}")
131
  except Exception as stack_error:
132
  print(f"Error during tensor stacking: {str(stack_error)}")
133
  # Try to recover - convert any non-tensor to tensor
 
143
  else:
144
  val = np.array(val)
145
  # Then to tensor
146
+ val = torch.from_numpy(val).float() # Specify float type explicitly
147
  fixed_values.append(val)
148
  except Exception as convert_err:
149
  print(f"Failed to convert item {i}: {str(convert_err)}")
 
176
  # Simplest approach: just convert the single image without splitting
177
  image_pil = image_pil.convert('RGB')
178
  transform = build_transform(IMAGE_SIZE)
179
+ tensor = transform(image_pil)
180
+
181
+ # Make sure it's a tensor before using unsqueeze
182
+ if not isinstance(tensor, torch.Tensor):
183
+ print(f"Warning: transform did not return a tensor, got {type(tensor)}")
184
+ if hasattr(tensor, 'numpy'):
185
+ tensor = torch.from_numpy(tensor.numpy()).float()
186
+ else:
187
+ tensor = torch.tensor(tensor, dtype=torch.float32)
188
+
189
+ tensor = tensor.unsqueeze(0) # Now safe to use unsqueeze
190
 
191
  if torch.cuda.is_available():
192
  tensor = tensor.cuda().to(torch.bfloat16)
 
197
  return tensor
198
  except Exception as recovery_error:
199
  print(f"Recovery attempt also failed: {str(recovery_error)}")
200
+ print(traceback.format_exc())
201
+
202
+ # Last resort - return a dummy tensor of the right shape
203
+ try:
204
+ print("Creating fallback dummy tensor...")
205
+ dummy_tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
206
+ dtype=torch.float32)
207
+ if torch.cuda.is_available():
208
+ dummy_tensor = dummy_tensor.cuda().to(torch.bfloat16)
209
+ print("Returning dummy tensor as last resort")
210
+ return dummy_tensor
211
+ except:
212
+ print("Even dummy tensor creation failed. Cannot proceed.")
213
+ return None
214
 
215
  # Function to split model across GPUs
216
  def split_model(model_name):
 
845
  file_name = os.path.basename(image_file)
846
  result += f"---\nImage: {file_name}\n"
847
 
848
+ # For PDF files, handle differently
849
  if file_name.lower().endswith('.pdf'):
850
  try:
851
  print(f"Processing PDF file: {image_file}")
852
+ # Load model here to ensure it's ready
853
  model, tokenizer = load_model()
854
  if model is None or tokenizer is None:
855
  result += "Error: Model failed to load for PDF analysis.\n"
856
  continue
857
 
858
+ # Try a completely different approach for PDFs to avoid tensor issues
859
  try:
860
+ # Convert PDF to images
861
  pdf_images = convert_from_path(image_file)
862
+ print(f"PDF converted to {len(pdf_images)} pages")
863
+
864
+ if not pdf_images or len(pdf_images) == 0:
865
+ result += "PDF converted but no pages were extracted.\n"
866
+ continue
867
+
868
+ # Process each page separately to avoid batch issues
869
+ for i, img in enumerate(pdf_images):
 
 
 
 
 
 
 
 
 
 
 
870
  try:
871
+ print(f"Processing PDF page {i+1}/{len(pdf_images)}")
872
+
873
+ # Manual preprocessing - don't use the typical image loading pipeline
874
+ img = img.convert('RGB')
875
+
876
+ # Resize and transform manually
877
  img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
878
+ transform = T.Compose([
879
+ T.ToTensor(),
880
+ T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
881
+ ])
882
+ tensor = transform(img_resized).unsqueeze(0)
883
 
884
+ # Move to device and set data type
885
+ device = "cuda" if torch.cuda.is_available() else "cpu"
886
+ tensor = tensor.to(device)
887
  if torch.cuda.is_available():
888
+ tensor = tensor.to(torch.bfloat16)
 
 
889
 
890
+ print(f"Preprocessed tensor shape: {tensor.shape}, device: {tensor.device}")
891
 
892
+ # Use direct text generation
893
+ page_prompt = f"PDF Page {i+1}: {prompt}"
894
+ input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
895
+
896
+ # Generate with proper error handling
897
+ try:
898
+ # Try direct generation first
899
+ outputs = model.generate(
900
+ input_tokens["input_ids"],
901
+ pixel_values=tensor,
902
+ max_new_tokens=512,
903
+ do_sample=False
904
+ )
905
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
906
+ except Exception as gen_err:
907
+ print(f"Error in direct generation: {str(gen_err)}")
908
+
909
+ # Fall back to chat method
910
+ try:
911
+ question = f"<image>\n{page_prompt}"
912
+ response, _ = model.chat(
913
+ tokenizer=tokenizer,
914
+ pixel_values=tensor,
915
+ question=question,
916
+ generation_config={"max_new_tokens": 512, "do_sample": False},
917
+ history=None,
918
+ return_history=True
919
+ )
920
+ except Exception as chat_err:
921
+ print(f"Chat fallback failed: {str(chat_err)}")
922
+ response = f"Analysis failed due to model error: {str(chat_err)}"
923
+
924
+ # Add to result
925
+ result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
926
+
927
+ except Exception as page_err:
928
+ print(f"Error processing page {i+1}: {str(page_err)}")
929
  import traceback
930
  print(traceback.format_exc())
931
+ result += f"\n-- PDF Page {i+1} --\nError: {str(page_err)}\n"
932
+
933
+ except Exception as pdf_err:
934
+ print(f"PDF processing error: {str(pdf_err)}")
935
+ import traceback
936
+ print(traceback.format_exc())
937
+ result += f"Failed to process PDF: {str(pdf_err)}\n"
938
+
939
  except Exception as e:
940
+ print(f"General exception in PDF processing: {str(e)}")
941
  import traceback
942
  print(traceback.format_exc())
943
  result += f"Failed to process PDF: {str(e)}\n"
 
948
  image_result = process_image_with_text(image, prompt)
949
  result += f"\n{image_result}\n"
950
  except Exception as e:
951
+ print(f"Error processing image {image_file}: {str(e)}")
952
+ import traceback
953
+ print(traceback.format_exc())
954
  result += f"Error processing image: {str(e)}\n"
955
 
956
  return result
 
959
  def process_image_with_text(image, prompt):
960
  """Process a single image with the InternVL model and a text prompt."""
961
  try:
962
+ print(f"process_image_with_text called with image type: {type(image)}")
963
  # Load model if not already loaded
964
  model, tokenizer = load_model()
965
  if model is None or tokenizer is None:
 
970
  if pixel_values is None:
971
  return "Error preparing image."
972
 
973
+ # Debug info
974
+ print(f"Image processed: tensor type {type(pixel_values)}, shape {pixel_values.shape if hasattr(pixel_values, 'shape') else 'unknown'}, dtype {pixel_values.dtype if hasattr(pixel_values, 'dtype') else 'unknown'}")
975
+
976
  # Process the prompt
977
  input_tokens = tokenizer(prompt)
978
 
 
1004
  # Move to device
1005
  pv = pv.to("cuda" if torch.cuda.is_available() else "cpu")
1006
 
1007
+ # Use model.generate directly
1008
+ try:
1009
+ output_ids = model.generate(
1010
+ input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
1011
+ pv,
1012
+ max_new_tokens=512,
1013
+ temperature=0.1,
1014
+ do_sample=False
1015
+ )
1016
+ output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
1017
+ except Exception as gen_error:
1018
+ print(f"Error in direct generation: {str(gen_error)}")
1019
+
1020
+ # Fall back to chat method
1021
+ try:
1022
+ question = f"<image>\n{prompt}"
1023
+ response, _ = model.chat(
1024
+ tokenizer=tokenizer,
1025
+ pixel_values=pv,
1026
+ question=question,
1027
+ generation_config={"max_new_tokens": 512, "do_sample": False},
1028
+ history=None,
1029
+ return_history=True
1030
+ )
1031
+ except Exception as chat_err:
1032
+ print(f"Chat fallback failed: {str(chat_err)}")
1033
+ output = f"Error analyzing image: {str(chat_err)}"
1034
+
1035
  results.append(output.strip())
1036
  except Exception as item_error:
1037
  print(f"Error processing item {i}: {str(item_error)}")
1038
+ import traceback
1039
+ print(traceback.format_exc())
1040
  results.append(f"Error: {str(item_error)}")
1041
 
1042
  return "\n".join(results)
 
1046
  # Ensure pixel_values is a proper 4D tensor [batch, channels, height, width]
1047
  if len(pixel_values.shape) == 3:
1048
  pixel_values = pixel_values.unsqueeze(0)
1049
+ print(f"Added batch dimension, new shape: {pixel_values.shape}")
1050
 
1051
  # Move tensors to the same device
1052
  device = "cuda" if torch.cuda.is_available() else "cpu"
1053
  pixel_values = pixel_values.to(device)
1054
  input_ids = input_tokens["input_ids"].unsqueeze(0).to(device)
1055
 
1056
+ print(f"Running model with pixel_values shape: {pixel_values.shape}, device: {pixel_values.device}")
1057
+ print(f"Input IDs shape: {input_ids.shape}, device: {input_ids.device}")
1058
+
1059
  # Run the model
1060
  output_ids = model.generate(
1061
  input_ids,
 
1088
  return response
1089
  except Exception as chat_error:
1090
  print(f"Fallback also failed: {str(chat_error)}")
1091
+ print(traceback.format_exc())
1092
+
1093
+ # Try one more approach - use the raw model architecture directly
1094
+ try:
1095
+ print("Attempting direct model call as last resort")
1096
+ # Try to reshape tensors to make them compatible
1097
+ if hasattr(model, "forward"):
1098
+ # Get only necessary inputs
1099
+ inputs = {
1100
+ "input_ids": input_ids,
1101
+ "pixel_values": pixel_values,
1102
+ "return_dict": True,
1103
+ }
1104
+ # Call model directly
1105
+ outputs = model(**inputs)
1106
+ # Try to get some meaningful output
1107
+ if hasattr(outputs, "logits") and outputs.logits is not None:
1108
+ pred_ids = torch.argmax(outputs.logits, dim=-1)
1109
+ response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
1110
+ return response
1111
+ else:
1112
+ return "Model output did not contain usable results"
1113
+ else:
1114
+ return "Model does not support direct calling"
1115
+ except Exception as direct_error:
1116
+ print(f"Direct model call failed: {str(direct_error)}")
1117
+ print(traceback.format_exc())
1118
+
1119
  return f"Error processing image: Unable to generate analysis. {str(tensor_error)}"
1120
  except Exception as e:
1121
  print(f"Outer exception in process_image_with_text: {str(e)}")