Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -201,27 +201,86 @@ class ForgeryDetector:
|
|
| 201 |
# Handle file path input (from gr.Image with type="filepath")
|
| 202 |
if isinstance(image, str):
|
| 203 |
if image.lower().endswith(('.doc', '.docx')):
|
| 204 |
-
# Handle Word documents -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
try:
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
image =
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
| 223 |
except Exception as e:
|
| 224 |
-
raise ValueError(f"Could not process Word document: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
elif image.lower().endswith('.pdf'):
|
| 227 |
# Handle PDF files
|
|
|
|
| 201 |
# Handle file path input (from gr.Image with type="filepath")
|
| 202 |
if isinstance(image, str):
|
| 203 |
if image.lower().endswith(('.doc', '.docx')):
|
| 204 |
+
# Handle Word documents - multiple fallback strategies
|
| 205 |
+
import tempfile
|
| 206 |
+
import os
|
| 207 |
+
import subprocess
|
| 208 |
+
|
| 209 |
+
temp_pdf = None
|
| 210 |
try:
|
| 211 |
+
# Strategy 1: Try docx2pdf (Windows with MS Word)
|
| 212 |
+
try:
|
| 213 |
+
from docx2pdf import convert
|
| 214 |
+
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
|
| 215 |
+
temp_pdf.close()
|
| 216 |
+
convert(image, temp_pdf.name)
|
| 217 |
+
pdf_path = temp_pdf.name
|
| 218 |
+
except Exception as e1:
|
| 219 |
+
# Strategy 2: Try LibreOffice (Linux/Mac)
|
| 220 |
+
try:
|
| 221 |
+
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
|
| 222 |
+
temp_pdf.close()
|
| 223 |
+
subprocess.run([
|
| 224 |
+
'libreoffice', '--headless', '--convert-to', 'pdf',
|
| 225 |
+
'--outdir', os.path.dirname(temp_pdf.name),
|
| 226 |
+
image
|
| 227 |
+
], check=True, capture_output=True)
|
| 228 |
+
|
| 229 |
+
# LibreOffice creates file with original name + .pdf
|
| 230 |
+
base_name = os.path.splitext(os.path.basename(image))[0]
|
| 231 |
+
generated_pdf = os.path.join(os.path.dirname(temp_pdf.name), f"{base_name}.pdf")
|
| 232 |
+
|
| 233 |
+
if os.path.exists(generated_pdf):
|
| 234 |
+
os.rename(generated_pdf, temp_pdf.name)
|
| 235 |
+
pdf_path = temp_pdf.name
|
| 236 |
+
else:
|
| 237 |
+
raise Exception("LibreOffice conversion failed")
|
| 238 |
+
except Exception as e2:
|
| 239 |
+
# Strategy 3: Extract text and create simple image
|
| 240 |
+
from docx import Document
|
| 241 |
+
doc = Document(image)
|
| 242 |
+
|
| 243 |
+
# Extract text
|
| 244 |
+
text_lines = []
|
| 245 |
+
for para in doc.paragraphs[:40]: # First 40 paragraphs
|
| 246 |
+
if para.text.strip():
|
| 247 |
+
text_lines.append(para.text[:100]) # Max 100 chars per line
|
| 248 |
+
|
| 249 |
+
# Create image with text
|
| 250 |
+
img_height = 1400
|
| 251 |
+
img_width = 1000
|
| 252 |
+
image = np.ones((img_height, img_width, 3), dtype=np.uint8) * 255
|
| 253 |
+
|
| 254 |
+
y_offset = 60
|
| 255 |
+
for line in text_lines[:35]:
|
| 256 |
+
cv2.putText(image, line, (40, y_offset),
|
| 257 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 0, 0), 1, cv2.LINE_AA)
|
| 258 |
+
y_offset += 35
|
| 259 |
+
|
| 260 |
+
# Skip to end - image is ready
|
| 261 |
+
pdf_path = None
|
| 262 |
|
| 263 |
+
# If we got a PDF, convert it to image
|
| 264 |
+
if pdf_path and os.path.exists(pdf_path):
|
| 265 |
+
import fitz
|
| 266 |
+
pdf_document = fitz.open(pdf_path)
|
| 267 |
+
page = pdf_document[0]
|
| 268 |
+
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 269 |
+
image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
|
| 270 |
+
if pix.n == 4:
|
| 271 |
+
image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
|
| 272 |
+
pdf_document.close()
|
| 273 |
+
os.unlink(pdf_path)
|
| 274 |
+
|
| 275 |
except Exception as e:
|
| 276 |
+
raise ValueError(f"Could not process Word document. Please convert to PDF or image first. Error: {str(e)}")
|
| 277 |
+
finally:
|
| 278 |
+
# Clean up temp file if it exists
|
| 279 |
+
if temp_pdf and os.path.exists(temp_pdf.name):
|
| 280 |
+
try:
|
| 281 |
+
os.unlink(temp_pdf.name)
|
| 282 |
+
except:
|
| 283 |
+
pass
|
| 284 |
|
| 285 |
elif image.lower().endswith('.pdf'):
|
| 286 |
# Handle PDF files
|