Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -17,6 +17,71 @@ import json
|
|
| 17 |
import re
|
| 18 |
from pdf2image import convert_from_path, convert_from_bytes
|
| 19 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Constants
|
| 22 |
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
|
@@ -36,10 +101,10 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
|
|
| 36 |
# If HUGGINGFACE_TOKEN exists in the environment, use it for authentication
|
| 37 |
hf_token = os.environ.get("HUGGINGFACE_TOKEN", None)
|
| 38 |
if hf_token:
|
| 39 |
-
|
| 40 |
login(token=hf_token)
|
| 41 |
else:
|
| 42 |
-
|
| 43 |
|
| 44 |
# Supported image file extensions
|
| 45 |
SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.pdf']
|
|
@@ -402,25 +467,79 @@ def analyze_dual_images(model, tokenizer, image1, image2, prompt):
|
|
| 402 |
def process_pdf(pdf_path=None, pdf_bytes=None):
|
| 403 |
"""Convert PDF file to a list of PIL images."""
|
| 404 |
try:
|
| 405 |
-
|
|
|
|
|
|
|
| 406 |
if pdf_path:
|
| 407 |
# Convert PDF file pages to PIL images
|
| 408 |
-
|
| 409 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
elif pdf_bytes:
|
| 411 |
# Convert PDF bytes to PIL images
|
| 412 |
-
|
| 413 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
else:
|
| 415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
return None
|
| 417 |
|
| 418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
return images
|
| 420 |
except Exception as e:
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
print(traceback.format_exc())
|
| 424 |
return None
|
| 425 |
|
| 426 |
# Function to analyze images with a prompt
|
|
@@ -848,67 +967,159 @@ def analyze_folder_images(folder_path, prompt):
|
|
| 848 |
# For PDF files, handle differently
|
| 849 |
if file_name.lower().endswith('.pdf'):
|
| 850 |
try:
|
| 851 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
# Load model here to ensure it's ready
|
| 853 |
model, tokenizer = load_model()
|
| 854 |
if model is None or tokenizer is None:
|
|
|
|
| 855 |
result += "Error: Model failed to load for PDF analysis.\n"
|
| 856 |
continue
|
| 857 |
|
| 858 |
# Try a completely different approach for PDFs to avoid tensor issues
|
| 859 |
try:
|
| 860 |
-
# Convert PDF to images
|
| 861 |
-
|
| 862 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 863 |
|
| 864 |
if not pdf_images or len(pdf_images) == 0:
|
|
|
|
| 865 |
result += "PDF converted but no pages were extracted.\n"
|
| 866 |
continue
|
| 867 |
|
| 868 |
# Process each page separately to avoid batch issues
|
| 869 |
for i, img in enumerate(pdf_images):
|
| 870 |
try:
|
| 871 |
-
|
|
|
|
| 872 |
|
| 873 |
# Manual preprocessing - don't use the typical image loading pipeline
|
|
|
|
| 874 |
img = img.convert('RGB')
|
| 875 |
|
|
|
|
|
|
|
|
|
|
| 876 |
# Resize and transform manually
|
|
|
|
| 877 |
img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 878 |
transform = T.Compose([
|
| 879 |
T.ToTensor(),
|
| 880 |
T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 881 |
])
|
| 882 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 883 |
|
| 884 |
# Move to device and set data type
|
| 885 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 886 |
tensor = tensor.to(device)
|
|
|
|
| 887 |
if torch.cuda.is_available():
|
|
|
|
| 888 |
tensor = tensor.to(torch.bfloat16)
|
| 889 |
|
| 890 |
-
|
| 891 |
|
| 892 |
# Use direct text generation
|
| 893 |
page_prompt = f"PDF Page {i+1}: {prompt}"
|
|
|
|
| 894 |
input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
|
|
|
|
| 895 |
|
| 896 |
# Generate with proper error handling
|
| 897 |
try:
|
| 898 |
# Try direct generation first
|
|
|
|
| 899 |
outputs = model.generate(
|
| 900 |
input_tokens["input_ids"],
|
| 901 |
pixel_values=tensor,
|
| 902 |
max_new_tokens=512,
|
| 903 |
do_sample=False
|
| 904 |
)
|
|
|
|
| 905 |
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
| 906 |
except Exception as gen_err:
|
| 907 |
-
|
|
|
|
| 908 |
|
| 909 |
# Fall back to chat method
|
| 910 |
try:
|
|
|
|
| 911 |
question = f"<image>\n{page_prompt}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 912 |
response, _ = model.chat(
|
| 913 |
tokenizer=tokenizer,
|
| 914 |
pixel_values=tensor,
|
|
@@ -919,7 +1130,47 @@ def analyze_folder_images(folder_path, prompt):
|
|
| 919 |
)
|
| 920 |
except Exception as chat_err:
|
| 921 |
print(f"Chat fallback failed: {str(chat_err)}")
|
| 922 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 923 |
|
| 924 |
# Add to result
|
| 925 |
result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
|
|
@@ -959,168 +1210,204 @@ def analyze_folder_images(folder_path, prompt):
|
|
| 959 |
def process_image_with_text(image, prompt):
|
| 960 |
"""Process a single image with the InternVL model and a text prompt."""
|
| 961 |
try:
|
| 962 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 963 |
# Load model if not already loaded
|
|
|
|
| 964 |
model, tokenizer = load_model()
|
| 965 |
if model is None or tokenizer is None:
|
|
|
|
| 966 |
return "Error loading model. Please check the logs for details."
|
| 967 |
|
| 968 |
-
|
| 969 |
-
|
| 970 |
-
|
| 971 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
|
| 973 |
-
|
| 974 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 975 |
|
| 976 |
# Process the prompt
|
| 977 |
-
|
|
|
|
|
|
|
| 978 |
|
| 979 |
-
# Generate description
|
| 980 |
with torch.inference_mode():
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
output_ids = model.generate(
|
| 1010 |
-
input_tokens["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu"),
|
| 1011 |
-
pv,
|
| 1012 |
-
max_new_tokens=512,
|
| 1013 |
-
temperature=0.1,
|
| 1014 |
-
do_sample=False
|
| 1015 |
-
)
|
| 1016 |
-
output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 1017 |
-
except Exception as gen_error:
|
| 1018 |
-
print(f"Error in direct generation: {str(gen_error)}")
|
| 1019 |
-
|
| 1020 |
-
# Fall back to chat method
|
| 1021 |
-
try:
|
| 1022 |
-
question = f"<image>\n{prompt}"
|
| 1023 |
-
response, _ = model.chat(
|
| 1024 |
-
tokenizer=tokenizer,
|
| 1025 |
-
pixel_values=pv,
|
| 1026 |
-
question=question,
|
| 1027 |
-
generation_config={"max_new_tokens": 512, "do_sample": False},
|
| 1028 |
-
history=None,
|
| 1029 |
-
return_history=True
|
| 1030 |
-
)
|
| 1031 |
-
except Exception as chat_err:
|
| 1032 |
-
print(f"Chat fallback failed: {str(chat_err)}")
|
| 1033 |
-
output = f"Error analyzing image: {str(chat_err)}"
|
| 1034 |
-
|
| 1035 |
-
results.append(output.strip())
|
| 1036 |
-
except Exception as item_error:
|
| 1037 |
-
print(f"Error processing item {i}: {str(item_error)}")
|
| 1038 |
-
import traceback
|
| 1039 |
-
print(traceback.format_exc())
|
| 1040 |
-
results.append(f"Error: {str(item_error)}")
|
| 1041 |
-
|
| 1042 |
-
return "\n".join(results)
|
| 1043 |
-
else:
|
| 1044 |
-
# Normal tensor processing
|
| 1045 |
try:
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
print(f"Added batch dimension, new shape: {pixel_values.shape}")
|
| 1050 |
-
|
| 1051 |
-
# Move tensors to the same device
|
| 1052 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 1053 |
-
pixel_values = pixel_values.to(device)
|
| 1054 |
-
input_ids = input_tokens["input_ids"].unsqueeze(0).to(device)
|
| 1055 |
|
| 1056 |
-
|
| 1057 |
-
|
|
|
|
|
|
|
| 1058 |
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
max_new_tokens
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
)
|
| 1067 |
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
| 1075 |
|
| 1076 |
-
#
|
| 1077 |
try:
|
| 1078 |
-
|
| 1079 |
-
question = f"<image>\n{prompt}"
|
| 1080 |
-
response, _ = model.chat(
|
| 1081 |
-
tokenizer=tokenizer,
|
| 1082 |
-
pixel_values=pixel_values,
|
| 1083 |
-
question=question,
|
| 1084 |
-
generation_config={"max_new_tokens": 512, "do_sample": False},
|
| 1085 |
-
history=None,
|
| 1086 |
-
return_history=True
|
| 1087 |
-
)
|
| 1088 |
-
return response
|
| 1089 |
-
except Exception as chat_error:
|
| 1090 |
-
print(f"Fallback also failed: {str(chat_error)}")
|
| 1091 |
-
print(traceback.format_exc())
|
| 1092 |
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
#
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
-
|
| 1108 |
-
pred_ids = torch.argmax(outputs.logits, dim=-1)
|
| 1109 |
-
response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
|
| 1110 |
-
return response
|
| 1111 |
-
else:
|
| 1112 |
-
return "Model output did not contain usable results"
|
| 1113 |
-
else:
|
| 1114 |
-
return "Model does not support direct calling"
|
| 1115 |
-
except Exception as direct_error:
|
| 1116 |
-
print(f"Direct model call failed: {str(direct_error)}")
|
| 1117 |
-
print(traceback.format_exc())
|
| 1118 |
|
| 1119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1120 |
except Exception as e:
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
print(traceback.format_exc())
|
| 1124 |
return f"Error processing image: {str(e)}"
|
| 1125 |
|
| 1126 |
# Main function
|
|
|
|
| 17 |
import re
|
| 18 |
from pdf2image import convert_from_path, convert_from_bytes
|
| 19 |
import tempfile
|
| 20 |
+
import logging
|
| 21 |
+
import traceback
|
| 22 |
+
|
| 23 |
+
# Logging bootstrap: every run writes to its own timestamped file in LOG_DIR.
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)
log_file = os.path.join(
    LOG_DIR,
    f"app_debug_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
)

# Root configuration: DEBUG and above, written to the run's log file and
# echoed to stdout.
logging.basicConfig(
    handlers=[logging.FileHandler(log_file), logging.StreamHandler(sys.stdout)],
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.DEBUG,
)

# Named application logger used by the functions in this module.
logger = logging.getLogger("internvl_analyzer")
logger.setLevel(logging.DEBUG)

# Startup banner: record the interpreter, torch and CUDA environment once.
logger.info("=" * 50)
logger.info("InternVL2.5 Image Analyzer starting up")
logger.info(f"Log file: {log_file}")
logger.info(f"Python version: {sys.version}")
logger.info(f"Torch version: {torch.__version__}")
logger.info(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # GPU details are only queried when CUDA is actually present.
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
logger.info("=" * 50)
|
| 53 |
+
|
| 54 |
+
# Function to log tensor info for debugging
|
| 55 |
+
def log_tensor_info(tensor, name="tensor"):
    """Log type, shape, dtype and basic statistics of *tensor* for debugging.

    Args:
        tensor: Object to describe. ``None``, a ``list`` and a
            ``torch.Tensor`` each get dedicated output; any other object is
            reported by its type only.
        name: Label used as the prefix of the log lines.

    Returns:
        None. Output goes to the module-level ``logger``. Failures while
        inspecting the object are caught and logged at ERROR level.
    """
    if tensor is None:
        logger.warning(f"{name} is None")
        return

    try:
        if isinstance(tensor, list):
            logger.debug(f"{name} is a list of length {len(tensor)}")
            for i, item in enumerate(tensor[:3]):  # Log first 3 items
                item_type = type(item)
                # Items need not be tensors, so fall back to "unknown".
                item_shape = getattr(item, "shape", "unknown")
                item_dtype = getattr(item, "dtype", "unknown")
                logger.debug(f" - Item {i}: type={item_type}, shape={item_shape}, dtype={item_dtype}")
            if len(tensor) > 3:
                logger.debug(f" - ... and {len(tensor)-3} more items")
        elif isinstance(tensor, torch.Tensor):
            logger.debug(f"{name} is a tensor: shape={tensor.shape}, dtype={tensor.dtype}, device={tensor.device}")
            # Log additional stats for numerical issues
            if tensor.numel() > 0:
                try:
                    logger.debug(f" - Stats: min={tensor.min().item():.4f}, max={tensor.max().item():.4f}, "
                                 f"mean={tensor.mean().item():.4f}, std={tensor.std().item():.4f}")
                except Exception as stats_err:
                    # Statistics are best-effort (mean()/std() reject some
                    # dtypes), but say why they were skipped instead of
                    # swallowing the error silently. `Exception` rather than
                    # a bare `except` lets KeyboardInterrupt/SystemExit
                    # propagate.
                    logger.debug(f" - Stats unavailable: {stats_err}")
            logger.debug(f" - Requires grad: {tensor.requires_grad}")
        else:
            logger.debug(f"{name} is type {type(tensor)}")
    except Exception as e:
        # Diagnostics must never break the caller's processing.
        logger.error(f"Error logging tensor info for {name}: {str(e)}")
|
| 85 |
|
| 86 |
# Constants
|
| 87 |
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
|
|
|
| 101 |
# If HUGGINGFACE_TOKEN exists in the environment, use it for authentication
|
| 102 |
hf_token = os.environ.get("HUGGINGFACE_TOKEN", None)
|
| 103 |
if hf_token:
|
| 104 |
+
logger.info("Logging in to Hugging Face Hub with token...")
|
| 105 |
login(token=hf_token)
|
| 106 |
else:
|
| 107 |
+
logger.info("No Hugging Face token found in environment. Model may not load if it's private.")
|
| 108 |
|
| 109 |
# Supported image file extensions
|
| 110 |
SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.pdf']
|
|
|
|
| 467 |
def process_pdf(pdf_path=None, pdf_bytes=None):
    """Convert a PDF into a list of PIL images.

    Args:
        pdf_path: Filesystem path of the PDF. Takes precedence over
            ``pdf_bytes`` when both are supplied.
        pdf_bytes: Raw PDF content; used only when ``pdf_path`` is falsy.

    Returns:
        The list of converted page images, or ``None`` when no source was
        provided, the conversion yielded no images, or both the primary and
        the alternative conversion attempt failed.
    """
    try:
        logger.info(f"Processing PDF: {pdf_path}")
        logger.debug(f"Current working directory: {os.getcwd()}")

        if pdf_path:
            # Convert PDF file pages to PIL images
            logger.info(f"Converting PDF from path: {pdf_path}")
            logger.debug(f"PDF path exists: {os.path.exists(pdf_path)}")
            logger.debug(f"PDF path is file: {os.path.isfile(pdf_path)}")
            logger.debug(f"PDF file size: {os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 'N/A'} bytes")

            try:
                images = convert_from_path(pdf_path)
                logger.info(f"PDF successfully converted to {len(images)} images")
            except Exception as pdf_err:
                logger.error(f"Error in convert_from_path: {str(pdf_err)}")
                logger.error(traceback.format_exc())
                # Try with different parameters
                logger.info("Attempting alternative PDF conversion")
                try:
                    images = convert_from_path(
                        pdf_path,
                        dpi=150,  # Lower DPI for better compatibility
                        # use_pdftocairo=False is pdf2image's default, so the
                        # retry must opt in explicitly to actually exercise
                        # a different backend than the first attempt.
                        use_pdftocairo=True,
                        strict=False,  # Be more lenient with errors
                    )
                    logger.info(f"Alternative conversion successful: {len(images)} images")
                except Exception as alt_err:
                    logger.error(f"Alternative conversion also failed: {str(alt_err)}")
                    logger.error(traceback.format_exc())
                    # Re-raised into the outer handler, which returns None.
                    raise
        elif pdf_bytes:
            # Convert PDF bytes to PIL images
            logger.info("Converting PDF from bytes")
            logger.debug(f"PDF bytes size: {len(pdf_bytes)} bytes")

            try:
                images = convert_from_bytes(pdf_bytes)
                logger.info(f"PDF bytes successfully converted to {len(images)} images")
            except Exception as bytes_err:
                logger.error(f"Error in convert_from_bytes: {str(bytes_err)}")
                logger.error(traceback.format_exc())
                # Try with different parameters
                logger.info("Attempting alternative PDF bytes conversion")
                try:
                    images = convert_from_bytes(
                        pdf_bytes,
                        dpi=150,  # Lower DPI
                        # Same reasoning as the path branch: switch renderer
                        # instead of repeating the default one.
                        use_pdftocairo=True,
                        strict=False,
                    )
                    logger.info(f"Alternative bytes conversion successful: {len(images)} images")
                except Exception as alt_bytes_err:
                    logger.error(f"Alternative bytes conversion also failed: {str(alt_bytes_err)}")
                    logger.error(traceback.format_exc())
                    # Re-raised into the outer handler, which returns None.
                    raise
        else:
            logger.error("No PDF source provided")
            return None

        # Validate and log the output
        if not images:
            logger.error("PDF conversion returned empty list")
            return None

        # Log details about the first few converted images
        for i, img in enumerate(images[:2]):  # Log first 2 pages
            logger.debug(f"PDF Page {i+1}: size={img.size}, mode={img.mode}")

        logger.info(f"PDF successfully processed, returning {len(images)} images")
        return images
    except Exception as e:
        # Callers receive None on any failure; details are in the log.
        logger.error(f"Fatal error in process_pdf: {str(e)}")
        logger.error(traceback.format_exc())
        return None
|
| 544 |
|
| 545 |
# Function to analyze images with a prompt
|
|
|
|
| 967 |
# For PDF files, handle differently
|
| 968 |
if file_name.lower().endswith('.pdf'):
|
| 969 |
try:
|
| 970 |
+
logger.info(f"Processing PDF file in folder analysis: {image_file}")
|
| 971 |
+
logger.debug(f"PDF absolute path: {os.path.abspath(image_file)}")
|
| 972 |
+
logger.debug(f"PDF exists: {os.path.exists(image_file)}")
|
| 973 |
+
logger.debug(f"PDF file size: {os.path.getsize(image_file) if os.path.exists(image_file) else 'N/A'}")
|
| 974 |
+
|
| 975 |
# Load model here to ensure it's ready
|
| 976 |
model, tokenizer = load_model()
|
| 977 |
if model is None or tokenizer is None:
|
| 978 |
+
logger.error("Model failed to load for PDF analysis")
|
| 979 |
result += "Error: Model failed to load for PDF analysis.\n"
|
| 980 |
continue
|
| 981 |
|
| 982 |
# Try a completely different approach for PDFs to avoid tensor issues
|
| 983 |
try:
|
| 984 |
+
# Convert PDF to images with detailed logging
|
| 985 |
+
logger.info(f"Starting PDF to image conversion for {file_name}")
|
| 986 |
+
with open(image_file, 'rb') as pdf_file:
|
| 987 |
+
pdf_data = pdf_file.read()
|
| 988 |
+
logger.debug(f"Read {len(pdf_data)} bytes from PDF file")
|
| 989 |
+
|
| 990 |
+
# Try both methods
|
| 991 |
+
try:
|
| 992 |
+
logger.debug("Attempting convert_from_path...")
|
| 993 |
+
pdf_images = convert_from_path(image_file)
|
| 994 |
+
logger.info(f"convert_from_path successful: {len(pdf_images)} pages")
|
| 995 |
+
except Exception as path_err:
|
| 996 |
+
logger.error(f"convert_from_path failed: {str(path_err)}")
|
| 997 |
+
logger.error(traceback.format_exc())
|
| 998 |
+
|
| 999 |
+
# Fall back to bytes method
|
| 1000 |
+
logger.debug("Falling back to convert_from_bytes...")
|
| 1001 |
+
pdf_images = convert_from_bytes(pdf_data)
|
| 1002 |
+
logger.info(f"convert_from_bytes successful: {len(pdf_images)} pages")
|
| 1003 |
+
|
| 1004 |
+
logger.info(f"PDF converted to {len(pdf_images)} pages")
|
| 1005 |
|
| 1006 |
if not pdf_images or len(pdf_images) == 0:
|
| 1007 |
+
logger.error("PDF converted but no pages were extracted")
|
| 1008 |
result += "PDF converted but no pages were extracted.\n"
|
| 1009 |
continue
|
| 1010 |
|
| 1011 |
# Process each page separately to avoid batch issues
|
| 1012 |
for i, img in enumerate(pdf_images):
|
| 1013 |
try:
|
| 1014 |
+
logger.info(f"Processing PDF page {i+1}/{len(pdf_images)}")
|
| 1015 |
+
logger.debug(f"Page {i+1} image: size={img.size}, mode={img.mode}")
|
| 1016 |
|
| 1017 |
# Manual preprocessing - don't use the typical image loading pipeline
|
| 1018 |
+
logger.debug("Converting image to RGB")
|
| 1019 |
img = img.convert('RGB')
|
| 1020 |
|
| 1021 |
+
# Log the image info for debugging
|
| 1022 |
+
logger.debug(f"After RGB conversion: size={img.size}, mode={img.mode}")
|
| 1023 |
+
|
| 1024 |
# Resize and transform manually
|
| 1025 |
+
logger.debug(f"Resizing image to {IMAGE_SIZE}x{IMAGE_SIZE}")
|
| 1026 |
img_resized = img.resize((IMAGE_SIZE, IMAGE_SIZE))
|
| 1027 |
+
logger.debug(f"After resize: size={img_resized.size}, mode={img_resized.mode}")
|
| 1028 |
+
|
| 1029 |
+
# Build transform and apply it
|
| 1030 |
+
logger.debug("Building and applying transform")
|
| 1031 |
transform = T.Compose([
|
| 1032 |
T.ToTensor(),
|
| 1033 |
T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 1034 |
])
|
| 1035 |
+
|
| 1036 |
+
# Transform to tensor with explicit error handling
|
| 1037 |
+
try:
|
| 1038 |
+
logger.debug("Converting image to tensor")
|
| 1039 |
+
tensor = transform(img_resized)
|
| 1040 |
+
logger.debug(f"Transform successful: tensor shape={tensor.shape}, dtype={tensor.dtype}")
|
| 1041 |
+
|
| 1042 |
+
# Log tensor stats for debugging numerical issues
|
| 1043 |
+
if tensor.numel() > 0:
|
| 1044 |
+
logger.debug(f"Tensor stats: min={tensor.min().item():.4f}, max={tensor.max().item():.4f}, "
|
| 1045 |
+
f"mean={tensor.mean().item():.4f}, std={tensor.std().item():.4f}")
|
| 1046 |
+
|
| 1047 |
+
# Add batch dimension with careful checking
|
| 1048 |
+
if isinstance(tensor, torch.Tensor):
|
| 1049 |
+
logger.debug("Adding batch dimension")
|
| 1050 |
+
tensor = tensor.unsqueeze(0)
|
| 1051 |
+
logger.debug(f"After unsqueeze: shape={tensor.shape}")
|
| 1052 |
+
else:
|
| 1053 |
+
logger.error(f"Expected tensor but got {type(tensor)}")
|
| 1054 |
+
raise TypeError(f"Transform returned {type(tensor)} instead of tensor")
|
| 1055 |
+
except Exception as tensor_err:
|
| 1056 |
+
logger.error(f"Error in tensor creation: {str(tensor_err)}")
|
| 1057 |
+
logger.error(traceback.format_exc())
|
| 1058 |
+
raise
|
| 1059 |
|
| 1060 |
# Move to device and set data type
|
| 1061 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 1062 |
+
logger.debug(f"Moving tensor to device: {device}")
|
| 1063 |
tensor = tensor.to(device)
|
| 1064 |
+
|
| 1065 |
if torch.cuda.is_available():
|
| 1066 |
+
logger.debug("Converting tensor to bfloat16")
|
| 1067 |
tensor = tensor.to(torch.bfloat16)
|
| 1068 |
|
| 1069 |
+
logger.info(f"Preprocessed tensor: shape={tensor.shape}, device={tensor.device}, dtype={tensor.dtype}")
|
| 1070 |
|
| 1071 |
# Use direct text generation
|
| 1072 |
page_prompt = f"PDF Page {i+1}: {prompt}"
|
| 1073 |
+
logger.debug(f"Preparing tokenization for prompt: {page_prompt}")
|
| 1074 |
input_tokens = tokenizer(page_prompt, return_tensors="pt").to(device)
|
| 1075 |
+
logger.debug(f"Tokenization complete: shape={input_tokens['input_ids'].shape}")
|
| 1076 |
|
| 1077 |
# Generate with proper error handling
|
| 1078 |
try:
|
| 1079 |
# Try direct generation first
|
| 1080 |
+
logger.info("Attempting direct generation")
|
| 1081 |
outputs = model.generate(
|
| 1082 |
input_tokens["input_ids"],
|
| 1083 |
pixel_values=tensor,
|
| 1084 |
max_new_tokens=512,
|
| 1085 |
do_sample=False
|
| 1086 |
)
|
| 1087 |
+
logger.info("Generation successful")
|
| 1088 |
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 1089 |
+
logger.debug(f"Response length: {len(response)} chars")
|
| 1090 |
except Exception as gen_err:
|
| 1091 |
+
logger.error(f"Error in direct generation: {str(gen_err)}")
|
| 1092 |
+
logger.error(traceback.format_exc())
|
| 1093 |
|
| 1094 |
# Fall back to chat method
|
| 1095 |
try:
|
| 1096 |
+
print("Trying chat method fallback")
|
| 1097 |
question = f"<image>\n{page_prompt}"
|
| 1098 |
+
# IMPORTANT: Ensure we're not passing a list here!
|
| 1099 |
+
if isinstance(tensor, list):
|
| 1100 |
+
print("WARNING: tensor is a list, converting...")
|
| 1101 |
+
if len(tensor) > 0:
|
| 1102 |
+
# Take the first item if it's a tensor
|
| 1103 |
+
if isinstance(tensor[0], torch.Tensor):
|
| 1104 |
+
tensor = tensor[0].unsqueeze(0)
|
| 1105 |
+
else:
|
| 1106 |
+
# Create a new tensor from scratch
|
| 1107 |
+
print("Creating new tensor from scratch")
|
| 1108 |
+
tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
|
| 1109 |
+
dtype=torch.float32).to(device)
|
| 1110 |
+
if torch.cuda.is_available():
|
| 1111 |
+
tensor = tensor.to(torch.bfloat16)
|
| 1112 |
+
else:
|
| 1113 |
+
# Create a dummy tensor
|
| 1114 |
+
tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
|
| 1115 |
+
dtype=torch.float32).to(device)
|
| 1116 |
+
if torch.cuda.is_available():
|
| 1117 |
+
tensor = tensor.to(torch.bfloat16)
|
| 1118 |
+
|
| 1119 |
+
# Verify tensor shape and type before passing to model
|
| 1120 |
+
print(f"Final tensor type: {type(tensor)}, shape: {tensor.shape if hasattr(tensor, 'shape') else 'unknown'}")
|
| 1121 |
+
|
| 1122 |
+
# Use the chat method with verified tensor
|
| 1123 |
response, _ = model.chat(
|
| 1124 |
tokenizer=tokenizer,
|
| 1125 |
pixel_values=tensor,
|
|
|
|
| 1130 |
)
|
| 1131 |
except Exception as chat_err:
|
| 1132 |
print(f"Chat fallback failed: {str(chat_err)}")
|
| 1133 |
+
import traceback
|
| 1134 |
+
print(traceback.format_exc())
|
| 1135 |
+
|
| 1136 |
+
# Last attempt - use direct model forward pass
|
| 1137 |
+
try:
|
| 1138 |
+
print("Attempting direct model forward pass")
|
| 1139 |
+
# Create inputs manually
|
| 1140 |
+
if hasattr(model, "forward"):
|
| 1141 |
+
# Create tensors from scratch if needed
|
| 1142 |
+
if not isinstance(tensor, torch.Tensor):
|
| 1143 |
+
tensor = torch.zeros((1, 3, IMAGE_SIZE, IMAGE_SIZE),
|
| 1144 |
+
dtype=torch.float32).to(device)
|
| 1145 |
+
if torch.cuda.is_available():
|
| 1146 |
+
tensor = tensor.to(torch.bfloat16)
|
| 1147 |
+
|
| 1148 |
+
# Get input tokens in the right format
|
| 1149 |
+
input_ids = input_tokens["input_ids"]
|
| 1150 |
+
if len(input_ids.shape) == 1:
|
| 1151 |
+
input_ids = input_ids.unsqueeze(0)
|
| 1152 |
+
|
| 1153 |
+
# Prepare inputs for direct call
|
| 1154 |
+
inputs = {
|
| 1155 |
+
"input_ids": input_ids,
|
| 1156 |
+
"pixel_values": tensor,
|
| 1157 |
+
"return_dict": True,
|
| 1158 |
+
}
|
| 1159 |
+
# Call model directly
|
| 1160 |
+
outputs = model(**inputs)
|
| 1161 |
+
# Try to get some output
|
| 1162 |
+
if hasattr(outputs, "logits") and outputs.logits is not None:
|
| 1163 |
+
pred_ids = torch.argmax(outputs.logits, dim=-1)
|
| 1164 |
+
response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
|
| 1165 |
+
else:
|
| 1166 |
+
response = "Failed to generate analysis - model output didn't contain usable data"
|
| 1167 |
+
else:
|
| 1168 |
+
response = "Failed to generate analysis - model doesn't support direct calling"
|
| 1169 |
+
except Exception as final_err:
|
| 1170 |
+
print(f"All attempts failed: {str(final_err)}")
|
| 1171 |
+
import traceback
|
| 1172 |
+
print(traceback.format_exc())
|
| 1173 |
+
response = f"Analysis failed due to model error: {str(final_err)}"
|
| 1174 |
|
| 1175 |
# Add to result
|
| 1176 |
result += f"\n-- PDF Page {i+1} --\n{response.strip()}\n"
|
|
|
|
| 1210 |
def process_image_with_text(image, prompt):
    """Process a single image with the InternVL model and a text prompt.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE,
    normalized with the ImageNet mean/std and turned into a batched tensor.
    Three strategies are then tried in order, returning on the first success:

    1. ``model.generate`` with the tokenized prompt and the image tensor.
    2. ``model.chat`` with an ``<image>``-prefixed question.
    3. A single forward pass whose arg-max token ids are decoded.

    Args:
        image: Image object exposing ``convert`` and ``resize``
            (presumably a PIL image; confirm against callers).
        prompt: Text prompt sent to the model together with the image.

    Returns:
        The stripped model response on success, otherwise a human-readable
        error message. Failures are logged and reported through the return
        value instead of being raised.
    """
    try:
        logger.info("process_image_with_text called with image type: %s", type(image))

        # Debug info for image
        if hasattr(image, 'size'):
            logger.debug("Image dimensions: %s", image.size)
        if hasattr(image, 'mode'):
            logger.debug("Image mode: %s", image.mode)

        use_cuda = torch.cuda.is_available()

        # Log memory usage
        if use_cuda:
            logger.debug("GPU memory allocated: %.2f GB", torch.cuda.memory_allocated() / 1e9)
            logger.debug("GPU memory reserved: %.2f GB", torch.cuda.memory_reserved() / 1e9)

        # Load model if not already loaded
        logger.debug("Loading model")
        model, tokenizer = load_model()
        if model is None or tokenizer is None:
            logger.error("Model failed to load")
            return "Error loading model. Please check the logs for details."
        logger.debug("Model loaded successfully")

        # Build the tensor here rather than via the shared load_image helper,
        # which might return a list instead of a single tensor.
        try:
            logger.debug("Converting image to RGB if needed")
            if not hasattr(image, 'convert'):
                logger.error("Image does not have convert method")
                return "Error: Unable to convert image to RGB"
            image = image.convert('RGB')
            logger.debug("After conversion: mode=%s, size=%s", image.mode, image.size)

            # Resize for consistent dimensions
            logger.debug("Resizing image to %sx%s", IMAGE_SIZE, IMAGE_SIZE)
            if not hasattr(image, 'resize'):
                logger.error("Image does not have resize method")
                return "Error: Unable to resize image"
            image_resized = image.resize((IMAGE_SIZE, IMAGE_SIZE))
            logger.debug("After resize: size=%s", image_resized.size)

            logger.debug("Creating transform")
            transform = T.Compose([
                T.ToTensor(),
                T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
            ])

            logger.debug("Converting image to tensor")
            tensor = transform(image_resized)
            if not isinstance(tensor, torch.Tensor):
                logger.error("Transform did not return a tensor: %s", type(tensor))
                raise TypeError(f"Expected torch.Tensor but got {type(tensor)}")

            logger.debug("Image transformed to tensor: shape=%s, dtype=%s", tensor.shape, tensor.dtype)
            if tensor.numel() > 0:
                logger.debug(
                    "Tensor stats: min=%.4f, max=%.4f, mean=%.4f, std=%.4f",
                    tensor.min().item(),
                    tensor.max().item(),
                    tensor.mean().item(),
                    tensor.std().item(),
                )

            # Ensure we have a 4D tensor [batch, channels, height, width]
            logger.debug("Adding batch dimension if needed")
            if tensor.dim() == 3:
                tensor = tensor.unsqueeze(0)
                logger.debug("Added batch dimension, new shape: %s", tensor.shape)

            device = "cuda" if use_cuda else "cpu"
            logger.debug("Moving tensor to device: %s", device)
            tensor = tensor.to(device)

            if use_cuda:
                logger.debug("Converting tensor to bfloat16")
                tensor = tensor.to(torch.bfloat16)
                logger.debug("Tensor converted to bfloat16, new dtype: %s", tensor.dtype)

            logger.info(
                "Final tensor prepared: shape=%s, device=%s, dtype=%s",
                tensor.shape, tensor.device, tensor.dtype,
            )
        except Exception as tensor_err:
            # logger.exception appends the active traceback, so this handler
            # does not depend on the traceback module being imported.
            logger.exception("Error in tensor creation: %s", tensor_err)
            return f"Error preparing image for analysis: {tensor_err}"

        # Process the prompt
        logger.debug("Tokenizing prompt: %s", prompt)
        input_tokens = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = input_tokens["input_ids"]
        logger.debug("Input tokens shape: %s", input_ids.shape)

        # Generate description - each attempt returns on success and falls
        # through to the next one on failure.
        with torch.inference_mode():
            # Approach 1: direct generation
            try:
                logger.info("Attempting direct generation")
                logger.debug(
                    "Checking input token tensor: shape=%s, device=%s",
                    input_ids.shape, input_ids.device,
                )
                logger.debug("Checking image tensor: shape=%s, device=%s", tensor.shape, tensor.device)

                # Keyword arguments keep this independent of positional order.
                # NOTE(review): InternVL's generate() appears to take
                # pixel_values before input_ids - confirm against the loaded
                # checkpoint's modeling code.
                # Greedy decoding (do_sample=False) ignores temperature, so it
                # is not passed.
                output_ids = model.generate(
                    pixel_values=tensor,
                    input_ids=input_ids,
                    max_new_tokens=512,
                    do_sample=False,
                )

                logger.info("Direct generation successful")
                logger.debug("Output IDs shape: %s", output_ids.shape)

                output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
                logger.debug("Decoded output length: %s chars", len(output))
                return output.strip()
            except Exception as gen_error:
                logger.exception("Direct generation failed: %s", gen_error)

            # Approach 2: the model's chat method
            try:
                logger.info("Attempting chat method")
                question = f"<image>\n{prompt}"
                logger.debug("Chat question: %s", question)

                response, _ = model.chat(
                    tokenizer=tokenizer,
                    pixel_values=tensor,
                    question=question,
                    generation_config={"max_new_tokens": 512, "do_sample": False},
                    history=None,
                    return_history=True,
                )

                logger.info("Chat method successful")
                logger.debug("Chat response length: %s chars", len(response))
                return response.strip()
            except Exception as chat_error:
                logger.exception("Chat method failed: %s", chat_error)

            # Approach 3: direct model forward pass
            try:
                logger.info("Attempting direct model forward call")

                if not hasattr(model, "forward"):
                    logger.error("Model does not have forward method")
                    return "Failed to analyze image - model doesn't support direct calling"
                logger.debug("Model has forward method")

                logger.debug("Preparing inputs for direct forward pass")
                inputs = {
                    "input_ids": input_ids,
                    "pixel_values": tensor,
                    "return_dict": True,
                }
                for key, value in inputs.items():
                    if hasattr(value, 'shape'):
                        logger.debug("Input '%s' shape: %s", key, value.shape)

                logger.debug("Calling model.forward")
                outputs = model(**inputs)

                if getattr(outputs, "logits", None) is None:
                    logger.error("Model output does not contain logits")
                    return "Failed to analyze image - model output contains no usable data"
                logger.debug("Got logits with shape: %s", outputs.logits.shape)

                # NOTE(review): arg-max over one forward pass yields a
                # next-token guess per prompt position, not an autoregressively
                # generated answer; kept only as a last-resort fallback.
                pred_ids = torch.argmax(outputs.logits, dim=-1)
                logger.debug("Prediction IDs shape: %s", pred_ids.shape)

                response = tokenizer.decode(pred_ids[0], skip_special_tokens=True)
                logger.debug("Decoded response length: %s chars", len(response))
                return response.strip()
            except Exception as forward_error:
                logger.exception("Forward method failed: %s", forward_error)

            # All methods failed
            return "Error generating analysis: All methods failed to process the image"
    except Exception as e:
        logger.exception("Fatal error in process_image_with_text: %s", e)
        return f"Error processing image: {e}"
|
| 1412 |
|
| 1413 |
# Main function
|