SuriRaja committed on
Commit
8e65970
·
verified ·
1 Parent(s): ea52284

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
3
+ import torch
4
+ from difflib import unified_diff
5
+
6
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        file_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        The text of each page that produced text, each followed by a
        newline, concatenated in page order. An empty string when no page
        produced any text.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages whose extract_text() result is falsy (None or "") are skipped.
    # Joining once avoids rebuilding the string on every page.
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
15
+
16
def compare_texts(source_text, target_text):
    """Return a unified diff of two texts, with the source as the reference.

    Both texts are split into lines and diffed. The diff header labels the
    inputs 'Source PDF' and 'Target PDF'. The diff lines are joined with
    newlines into a single string, which is empty when the two texts have
    identical lines.
    """
    source_lines = source_text.splitlines()
    target_lines = target_text.splitlines()
    diff_lines = unified_diff(
        source_lines,
        target_lines,
        fromfile='Source PDF',
        tofile='Target PDF',
        lineterm='',
    )
    return '\n'.join(diff_lines)
26
+
27
def process_pdfs(pdf1, pdf2):
    """Extract text from two PDFs and report how the target differs from the source.

    Args:
        pdf1: Source PDF (the reference), passed to ``extract_text_from_pdf``.
        pdf2: Target PDF, passed to ``extract_text_from_pdf``.

    Returns:
        A human-readable message: a notice when either PDF yields no text,
        a notice when the extracted texts are identical, or otherwise a
        header followed by the unified diff from ``compare_texts``.
    """
    # Extract text from the uploaded PDFs
    text1 = extract_text_from_pdf(pdf1)
    text2 = extract_text_from_pdf(pdf2)

    if not text1 or not text2:
        return "One or both PDFs have no extractable text. Please check the files."

    # Compare texts and find differences
    differences = compare_texts(text1, text2)

    # An empty diff means the extracted texts match line for line; say so
    # rather than emitting a "Differences found" header with nothing under it.
    if not differences:
        return "No differences found between the PDFs."

    return f"Differences found between the PDFs:\n\n{differences}"
39
+
40
if __name__ == "__main__":
    import sys

    # Usage: python app.py SOURCE.pdf TARGET.pdf
    # When both paths are not given on the command line, fall back to the
    # placeholder paths below (edit them for local testing).
    if len(sys.argv) >= 3:
        pdf1_path, pdf2_path = sys.argv[1], sys.argv[2]
    else:
        pdf1_path = "path/to/your/source.pdf"  # Replace with actual file path
        pdf2_path = "path/to/your/target.pdf"  # Replace with actual file path

    # Process and print differences
    result = process_pdfs(pdf1_path, pdf2_path)
    print(result)