JKrishnanandhaa committed on
Commit
f15f397
·
verified ·
1 Parent(s): 4090a34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -18
app.py CHANGED
@@ -201,27 +201,86 @@ class ForgeryDetector:
201
  # Handle file path input (from gr.Image with type="filepath")
202
  if isinstance(image, str):
203
  if image.lower().endswith(('.doc', '.docx')):
204
- # Handle Word documents - convert to PDF then to image
 
 
 
 
 
205
  try:
206
- from docx2pdf import convert
207
- import tempfile
208
- import os
209
-
210
- temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
211
- temp_pdf.close()
212
- convert(image, temp_pdf.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
- import fitz
215
- pdf_document = fitz.open(temp_pdf.name)
216
- page = pdf_document[0]
217
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
218
- image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
219
- if pix.n == 4:
220
- image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
221
- pdf_document.close()
222
- os.unlink(temp_pdf.name)
 
 
 
223
  except Exception as e:
224
- raise ValueError(f"Could not process Word document: {str(e)}")
 
 
 
 
 
 
 
225
 
226
  elif image.lower().endswith('.pdf'):
227
  # Handle PDF files
 
201
  # Handle file path input (from gr.Image with type="filepath")
202
  if isinstance(image, str):
203
  if image.lower().endswith(('.doc', '.docx')):
204
+ # Handle Word documents - multiple fallback strategies
205
+ import tempfile
206
+ import os
207
+ import subprocess
208
+
209
+ temp_pdf = None
210
  try:
211
+ # Strategy 1: Try docx2pdf (Windows with MS Word)
212
+ try:
213
+ from docx2pdf import convert
214
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
215
+ temp_pdf.close()
216
+ convert(image, temp_pdf.name)
217
+ pdf_path = temp_pdf.name
218
+ except Exception as e1:
219
+ # Strategy 2: Try LibreOffice (Linux/Mac)
220
+ try:
221
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
222
+ temp_pdf.close()
223
+ subprocess.run([
224
+ 'libreoffice', '--headless', '--convert-to', 'pdf',
225
+ '--outdir', os.path.dirname(temp_pdf.name),
226
+ image
227
+ ], check=True, capture_output=True)
228
+
229
+ # LibreOffice creates file with original name + .pdf
230
+ base_name = os.path.splitext(os.path.basename(image))[0]
231
+ generated_pdf = os.path.join(os.path.dirname(temp_pdf.name), f"{base_name}.pdf")
232
+
233
+ if os.path.exists(generated_pdf):
234
+ os.rename(generated_pdf, temp_pdf.name)
235
+ pdf_path = temp_pdf.name
236
+ else:
237
+ raise Exception("LibreOffice conversion failed")
238
+ except Exception as e2:
239
+ # Strategy 3: Extract text and create simple image
240
+ from docx import Document
241
+ doc = Document(image)
242
+
243
+ # Extract text
244
+ text_lines = []
245
+ for para in doc.paragraphs[:40]: # First 40 paragraphs
246
+ if para.text.strip():
247
+ text_lines.append(para.text[:100]) # Max 100 chars per line
248
+
249
+ # Create image with text
250
+ img_height = 1400
251
+ img_width = 1000
252
+ image = np.ones((img_height, img_width, 3), dtype=np.uint8) * 255
253
+
254
+ y_offset = 60
255
+ for line in text_lines[:35]:
256
+ cv2.putText(image, line, (40, y_offset),
257
+ cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 0, 0), 1, cv2.LINE_AA)
258
+ y_offset += 35
259
+
260
+ # Skip to end - image is ready
261
+ pdf_path = None
262
 
263
+ # If we got a PDF, convert it to image
264
+ if pdf_path and os.path.exists(pdf_path):
265
+ import fitz
266
+ pdf_document = fitz.open(pdf_path)
267
+ page = pdf_document[0]
268
+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
269
+ image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
270
+ if pix.n == 4:
271
+ image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
272
+ pdf_document.close()
273
+ os.unlink(pdf_path)
274
+
275
  except Exception as e:
276
+ raise ValueError(f"Could not process Word document. Please convert to PDF or image first. Error: {str(e)}")
277
+ finally:
278
+ # Clean up temp file if it exists
279
+ if temp_pdf and os.path.exists(temp_pdf.name):
280
+ try:
281
+ os.unlink(temp_pdf.name)
282
+ except:
283
+ pass
284
 
285
  elif image.lower().endswith('.pdf'):
286
  # Handle PDF files