Spaces:

SuriRaja
/

usecase2

Sleeping

App Files Files Community

SuriRaja committed on Nov 3, 2024

Commit

123be7b

verified ·

1 Parent(s): b5db601

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -27

app.py CHANGED Viewed

@@ -3,44 +3,56 @@ from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
 import torch
 from difflib import unified_diff
-def extract_text_from_pdf(file_path):
     """Extract text from a PDF using pdfplumber."""
-    text = ""
-    with pdfplumber.open(file_path) as pdf:
-        for page in pdf.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text += page_text + "\n"
-    return text
 def compare_texts(source_text, target_text):
     """Compare two texts and highlight differences with source as truth."""
-    diff = unified_diff(
-        source_text.splitlines(),
-        target_text.splitlines(),
-        lineterm='',
-        fromfile='Source PDF',
-        tofile='Target PDF'
-    )
-    return '\n'.join(diff)
 def process_pdfs(pdf1, pdf2):
-    # Extract text from the uploaded PDFs
-    text1 = extract_text_from_pdf(pdf1)
-    text2 = extract_text_from_pdf(pdf2)
-    if not text1 or not text2:
-        return "One or both PDFs have no extractable text. Please check the files."
-    # Compare texts and find differences
-    differences = compare_texts(text1, text2)
-    return f"Differences found between the PDFs:\n\n{differences}"
 if __name__ == "__main__":
-    # Paths to your PDF files for testing
-    pdf1_path = "path/to/your/source.pdf"  # Replace with actual file path
-    pdf2_path = "path/to/your/target.pdf"  # Replace with actual file path
     # Process and print differences
     result = process_pdfs(pdf1_path, pdf2_path)

 import torch
 from difflib import unified_diff
+def extract_text_from_pdf(file):
     """Extract text from a PDF using pdfplumber."""
+    try:
+        text = ""
+        with pdfplumber.open(file) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+        return text
+    except Exception as e:
+        return f"Error extracting text from PDF: {e}"
 def compare_texts(source_text, target_text):
     """Compare two texts and highlight differences with source as truth."""
+    try:
+        diff = unified_diff(
+            source_text.splitlines(),
+            target_text.splitlines(),
+            lineterm='',
+            fromfile='Source PDF',
+            tofile='Target PDF'
+        )
+        return '\n'.join(diff)
+    except Exception as e:
+        return f"Error comparing texts: {e}"
 def process_pdfs(pdf1, pdf2):
+    try:
+        # Extract text from the uploaded PDFs
+        text1 = extract_text_from_pdf(pdf1)
+        text2 = extract_text_from_pdf(pdf2)
+        if "Error" in text1 or "Error" in text2:
+            return f"Extraction issues detected: {text1 if 'Error' in text1 else ''} {text2 if 'Error' in text2 else ''}"
+        if not text1 or not text2:
+            return "One or both PDFs have no extractable text. Please check the files."
+        # Compare texts and find differences
+        differences = compare_texts(text1, text2)
+        return f"Differences found between the PDFs:\n\n{differences}"
+    except Exception as e:
+        return f"Error processing PDFs: {e}"
 if __name__ == "__main__":
+    # Replace this block with code to upload and pass files if running in a web app environment
+    pdf1_path = "path/to/source.pdf"  # Placeholder path
+    pdf2_path = "path/to/target.pdf"  # Placeholder path
     # Process and print differences
     result = process_pdfs(pdf1_path, pdf2_path)