SuriRaja committed on
Commit
123be7b
·
verified ·
1 Parent(s): b5db601

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -27
app.py CHANGED
@@ -3,44 +3,56 @@ from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
3
  import torch
4
  from difflib import unified_diff
5
 
6
- def extract_text_from_pdf(file_path):
7
  """Extract text from a PDF using pdfplumber."""
8
- text = ""
9
- with pdfplumber.open(file_path) as pdf:
10
- for page in pdf.pages:
11
- page_text = page.extract_text()
12
- if page_text:
13
- text += page_text + "\n"
14
- return text
 
 
 
15
 
16
  def compare_texts(source_text, target_text):
17
  """Compare two texts and highlight differences with source as truth."""
18
- diff = unified_diff(
19
- source_text.splitlines(),
20
- target_text.splitlines(),
21
- lineterm='',
22
- fromfile='Source PDF',
23
- tofile='Target PDF'
24
- )
25
- return '\n'.join(diff)
 
 
 
26
 
27
  def process_pdfs(pdf1, pdf2):
28
- # Extract text from the uploaded PDFs
29
- text1 = extract_text_from_pdf(pdf1)
30
- text2 = extract_text_from_pdf(pdf2)
 
31
 
32
- if not text1 or not text2:
33
- return "One or both PDFs have no extractable text. Please check the files."
34
 
35
- # Compare texts and find differences
36
- differences = compare_texts(text1, text2)
37
 
38
- return f"Differences found between the PDFs:\n\n{differences}"
 
 
 
 
 
39
 
40
  if __name__ == "__main__":
41
- # Paths to your PDF files for testing
42
- pdf1_path = "path/to/your/source.pdf" # Replace with actual file path
43
- pdf2_path = "path/to/your/target.pdf" # Replace with actual file path
44
 
45
  # Process and print differences
46
  result = process_pdfs(pdf1_path, pdf2_path)
 
3
  import torch
4
  from difflib import unified_diff
5
 
6
def extract_text_from_pdf(file):
    """Extract text from a PDF using pdfplumber.

    Args:
        file: The PDF to read, handed straight to ``pdfplumber.open``.
            The visible caller passes a path string; presumably an
            uploaded file object works as well -- confirm against the
            pdfplumber documentation.

    Returns:
        The text of every page that produced text, each page followed by
        a newline (an empty string when no page has extractable text).
        On any failure the function does not raise; it returns a message
        starting with ``"Error extracting text from PDF: "`` instead.
    """
    try:
        with pdfplumber.open(file) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Pages without extractable text (None or "") are skipped.
            # join() drains the generator while the PDF is still open and
            # avoids repeated string concatenation in a loop.
            return "".join(
                f"{page_text}\n" for page_text in page_texts if page_text
            )
    except Exception as e:
        # Deliberately broad: callers of this function expect a readable
        # message rather than an exception.
        return f"Error extracting text from PDF: {e}"
18
 
19
def compare_texts(source_text, target_text):
    """Compare two texts and highlight differences with source as truth.

    The texts are split into lines and compared with
    ``difflib.unified_diff``; the source is labelled ``Source PDF`` and
    the target ``Target PDF`` in the diff header.

    Args:
        source_text: Reference text (the "from" side of the diff).
        target_text: Text checked against the reference (the "to" side).

    Returns:
        The unified diff as one newline-joined string, or an empty string
        when both texts have identical lines. If either argument is not
        text, a message starting with ``"Error comparing texts: "`` is
        returned instead of raising.
    """
    try:
        diff_lines = unified_diff(
            source_text.splitlines(),
            target_text.splitlines(),
            lineterm='',
            fromfile='Source PDF',
            tofile='Target PDF',
        )
        # unified_diff is lazy, so the join must stay inside the try:
        # type errors only surface once the generator is consumed.
        return '\n'.join(diff_lines)
    except (AttributeError, TypeError) as e:
        # AttributeError: the argument has no splitlines() (e.g. None).
        # TypeError: difflib rejects lines that are not str.
        return f"Error comparing texts: {e}"
32
 
33
def process_pdfs(pdf1, pdf2):
    """Extract the text of two PDFs and report how the second differs.

    Args:
        pdf1: Source PDF, passed unchanged to ``extract_text_from_pdf``.
        pdf2: Target PDF, passed unchanged to ``extract_text_from_pdf``.

    Returns:
        A human-readable message: the extraction failure(s), a notice
        that a PDF has no extractable text, a notice that the texts are
        identical, or the unified diff of the two texts. Unexpected
        errors are reported as ``"Error processing PDFs: ..."`` rather
        than raised.
    """
    # Exact prefix that extract_text_from_pdf puts on its failure
    # messages. Matching the prefix, not the bare word "Error", keeps a
    # PDF whose own text mentions "Error" from being treated as a failure.
    extraction_error_prefix = "Error extracting text from PDF:"
    try:
        # Extract text from the uploaded PDFs
        text1 = extract_text_from_pdf(pdf1)
        text2 = extract_text_from_pdf(pdf2)

        failures = [
            text for text in (text1, text2)
            if text.startswith(extraction_error_prefix)
        ]
        if failures:
            return f"Extraction issues detected: {' '.join(failures)}"

        if not text1 or not text2:
            return "One or both PDFs have no extractable text. Please check the files."

        # Compare texts and find differences
        differences = compare_texts(text1, text2)

        # An empty diff means the extracted texts are identical.
        if not differences:
            return "No differences found between the PDFs."

        return f"Differences found between the PDFs:\n\n{differences}"
    except Exception as e:
        # Last-resort boundary: the result is shown to the user as text.
        return f"Error processing PDFs: {e}"
51
 
52
  if __name__ == "__main__":
53
+ # Replace this block with code to upload and pass files if running in a web app environment
54
+ pdf1_path = "path/to/source.pdf" # Placeholder path
55
+ pdf2_path = "path/to/target.pdf" # Placeholder path
56
 
57
  # Process and print differences
58
  result = process_pdfs(pdf1_path, pdf2_path)