Spaces:

DocForg
/

Document_Forgery_Detection

Sleeping

App Files Community

JKrishnanandhaa committed on Jan 20

Commit

f15f397

verified ·

1 Parent(s): 4090a34

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -18

app.py CHANGED Viewed

@@ -201,27 +201,86 @@ class ForgeryDetector:
         # Handle file path input (from gr.Image with type="filepath")
         if isinstance(image, str):
             if image.lower().endswith(('.doc', '.docx')):
-                # Handle Word documents - convert to PDF then to image
                 try:
-                    from docx2pdf import convert
-                    import tempfile
-                    import os
-                    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
-                    temp_pdf.close()
-                    convert(image, temp_pdf.name)
-                    import fitz
-                    pdf_document = fitz.open(temp_pdf.name)
-                    page = pdf_document[0]
-                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
-                    image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
-                    if pix.n == 4:
-                        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
-                    pdf_document.close()
-                    os.unlink(temp_pdf.name)
                 except Exception as e:
-                    raise ValueError(f"Could not process Word document: {str(e)}")
             elif image.lower().endswith('.pdf'):
                 # Handle PDF files

         # Handle file path input (from gr.Image with type="filepath")
         if isinstance(image, str):
             if image.lower().endswith(('.doc', '.docx')):
+                # Handle Word documents - multiple fallback strategies
+                import tempfile
+                import os
+                import subprocess
+                temp_pdf = None
                 try:
+                    # Strategy 1: Try docx2pdf (Windows with MS Word)
+                    try:
+                        from docx2pdf import convert
+                        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
+                        temp_pdf.close()
+                        convert(image, temp_pdf.name)
+                        pdf_path = temp_pdf.name
+                    except Exception as e1:
+                        # Strategy 2: Try LibreOffice (Linux/Mac)
+                        try:
+                            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
+                            temp_pdf.close()
+                            subprocess.run([
+                                'libreoffice', '--headless', '--convert-to', 'pdf',
+                                '--outdir', os.path.dirname(temp_pdf.name),
+                                image
+                            ], check=True, capture_output=True)
+                            # LibreOffice creates file with original name + .pdf
+                            base_name = os.path.splitext(os.path.basename(image))[0]
+                            generated_pdf = os.path.join(os.path.dirname(temp_pdf.name), f"{base_name}.pdf")
+                            if os.path.exists(generated_pdf):
+                                os.rename(generated_pdf, temp_pdf.name)
+                                pdf_path = temp_pdf.name
+                            else:
+                                raise Exception("LibreOffice conversion failed")
+                        except Exception as e2:
+                            # Strategy 3: Extract text and create simple image
+                            from docx import Document
+                            doc = Document(image)
+                            # Extract text
+                            text_lines = []
+                            for para in doc.paragraphs[:40]:  # First 40 paragraphs
+                                if para.text.strip():
+                                    text_lines.append(para.text[:100])  # Max 100 chars per line
+                            # Create image with text
+                            img_height = 1400
+                            img_width = 1000
+                            image = np.ones((img_height, img_width, 3), dtype=np.uint8) * 255
+                            y_offset = 60
+                            for line in text_lines[:35]:
+                                cv2.putText(image, line, (40, y_offset),
+                                          cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 0, 0), 1, cv2.LINE_AA)
+                                y_offset += 35
+                            # Skip to end - image is ready
+                            pdf_path = None
+                    # If we got a PDF, convert it to image
+                    if pdf_path and os.path.exists(pdf_path):
+                        import fitz
+                        pdf_document = fitz.open(pdf_path)
+                        page = pdf_document[0]
+                        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
+                        image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+                        if pix.n == 4:
+                            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+                        pdf_document.close()
+                        os.unlink(pdf_path)
                 except Exception as e:
+                    raise ValueError(f"Could not process Word document. Please convert to PDF or image first. Error: {str(e)}")
+                finally:
+                    # Clean up temp file if it exists
+                    if temp_pdf and os.path.exists(temp_pdf.name):
+                        try:
+                            os.unlink(temp_pdf.name)
+                        except:
+                            pass
             elif image.lower().endswith('.pdf'):
                 # Handle PDF files