Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -133,28 +133,72 @@ class FloorPlanProcessor:
|
|
| 133 |
pdf.page_count = len(pdf_document)
|
| 134 |
|
| 135 |
images = []
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
page = pdf_document[page_num]
|
|
|
|
|
|
|
|
|
|
| 138 |
image_list = page.get_images(full=True)
|
|
|
|
| 139 |
|
| 140 |
-
if
|
| 141 |
-
|
| 142 |
-
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 143 |
-
images.append(img)
|
| 144 |
-
else:
|
| 145 |
for img_info in image_list:
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
images.append(img)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
-
pdf.images = images
|
| 153 |
-
logger.info(f"
|
| 154 |
return True
|
| 155 |
|
| 156 |
except Exception as e:
|
| 157 |
-
logger.error(f"PDF error: {str(e)}")
|
| 158 |
pdf.error = str(e)
|
| 159 |
return False
|
| 160 |
|
|
|
|
| 133 |
pdf.page_count = len(pdf_document)
|
| 134 |
|
| 135 |
images = []
|
| 136 |
+
logger.info(f"PDF has {pdf.page_count} pages")
|
| 137 |
+
|
| 138 |
+
# Process only first 5 pages (optimization)
|
| 139 |
+
pages_to_process = min(5, pdf.page_count)
|
| 140 |
+
|
| 141 |
+
for page_num in range(pages_to_process):
|
| 142 |
page = pdf_document[page_num]
|
| 143 |
+
logger.info(f"Processing page {page_num + 1}/{pages_to_process}")
|
| 144 |
+
|
| 145 |
+
# Try to extract embedded images first (high quality)
|
| 146 |
image_list = page.get_images(full=True)
|
| 147 |
+
extracted_from_page = False
|
| 148 |
|
| 149 |
+
if image_list:
|
| 150 |
+
logger.info(f" Found {len(image_list)} embedded images")
|
|
|
|
|
|
|
|
|
|
| 151 |
for img_info in image_list:
|
| 152 |
+
try:
|
| 153 |
+
xref = img_info[0]
|
| 154 |
+
base_image = pdf_document.extract_image(xref)
|
| 155 |
+
if base_image and "image" in base_image:
|
| 156 |
+
img = Image.open(BytesIO(base_image["image"]))
|
| 157 |
+
if img.width > 200 and img.height > 200:
|
| 158 |
+
logger.info(f" ✓ Extracted embedded image: {img.size}")
|
| 159 |
+
images.append(img)
|
| 160 |
+
extracted_from_page = True
|
| 161 |
+
except Exception as e:
|
| 162 |
+
logger.warning(f" Could not extract embedded image: {e}")
|
| 163 |
+
continue
|
| 164 |
+
|
| 165 |
+
# If no good embedded images, render page at high quality
|
| 166 |
+
if not extracted_from_page:
|
| 167 |
+
try:
|
| 168 |
+
# Render at 300 DPI (PyMuPDF ignores the matrix argument when dpi is given)
|
| 169 |
+
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False, dpi=300)
|
| 170 |
+
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 171 |
+
|
| 172 |
+
if img.width > 0 and img.height > 0:
|
| 173 |
+
logger.info(f" ✓ Rendered page as image: {img.size}")
|
| 174 |
images.append(img)
|
| 175 |
+
except Exception as e:
|
| 176 |
+
logger.warning(f" Could not render page {page_num + 1}: {e}")
|
| 177 |
+
continue
|
| 178 |
+
|
| 179 |
+
# Stop at the first page that yields usable embedded images
|
| 180 |
+
if extracted_from_page and len(images) > 0:
|
| 181 |
+
logger.info("✓ Got good floor plan from embedded images, stopping search")
|
| 182 |
+
break
|
| 183 |
+
|
| 184 |
+
if not images:
|
| 185 |
+
logger.warning("No images extracted, trying fallback rendering")
|
| 186 |
+
# Fallback: render first page at 3x zoom
|
| 187 |
+
try:
|
| 188 |
+
page = pdf_document[0]
|
| 189 |
+
pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), alpha=False)
|
| 190 |
+
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 191 |
+
images.append(img)
|
| 192 |
+
logger.info(f"✓ Fallback: Rendered first page at 3x zoom: {img.size}")
|
| 193 |
+
except Exception as e:
|
| 194 |
+
logger.error(f"Fallback rendering failed: {e}")
|
| 195 |
|
| 196 |
+
pdf.images = images[:3] # Keep max 3 images
|
| 197 |
+
logger.info(f"✓ Successfully extracted {len(pdf.images)} images from PDF")
|
| 198 |
return True
|
| 199 |
|
| 200 |
except Exception as e:
|
| 201 |
+
logger.error(f"PDF error: {str(e)}", exc_info=True)
|
| 202 |
pdf.error = str(e)
|
| 203 |
return False
|
| 204 |
|