rianders committed on
Commit
e6ea8c6
·
1 Parent(s): 0d61aa0

Include example PDFs with Git LFS and add helper scripts

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -68,7 +68,7 @@ htmlcov/
68
  # =========================
69
  uploads/
70
  data/
71
- *.pdf
72
  *.png
73
  *.jpg
74
  *.jpeg
 
68
  # =========================
69
  uploads/
70
  data/
71
+ # *.pdf
72
  *.png
73
  *.jpg
74
  *.jpeg
18.1 Notes.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed3d2790d00abbd7d89fbb53b745de70b46a667613b2ad52db498be48ea4d1f1
3
+ size 3392377
debug_rotation.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pymupdf as fitz
3
+ from PIL import Image, ImageDraw
4
+
5
def create_rotated_pdf():
    """Create a one-page PDF containing a line of text, rotated by 90 degrees.

    The document is written to ``rotated_test.pdf`` in the current working
    directory and is closed before returning.

    Returns:
        The path of the written file, ``"rotated_test.pdf"``.
    """
    doc = fitz.open()
    try:
        page = doc.new_page()

        # Text is anchored at (100, 100) and inserted before the rotation
        # is applied to the page.
        page.insert_text((100, 100), "Hello World", fontsize=20)

        # Rotate page 90 degrees
        page.set_rotation(90)

        doc.save("rotated_test.pdf")
    finally:
        # Release the document even if building or saving fails.
        doc.close()
    return "rotated_test.pdf"
17
+
18
def analyze_pdf(path):
    """Print rotation-related diagnostics for the first page of a PDF.

    Reports the page rotation, the page rectangle, the rendered pixmap size,
    the bounding box of the first text block, and, when the page exposes a
    ``rotation_matrix`` attribute, that bounding box multiplied by the matrix.

    Args:
        path: Path of the PDF file to open.
    """
    doc = fitz.open(path)
    try:
        page = doc[0]

        print(f"Page Rotation: {page.rotation}")
        # NOTE(review): the label says "unrotated"; confirm whether
        # page.rect already reflects the page rotation.
        print(f"Page Rect (unrotated): {page.rect}")

        # Pixmap
        pix = page.get_pixmap()
        print(f"Pixmap WxH: {pix.width}x{pix.height}")

        # Text Dict
        raw = page.get_text("dict")
        blocks = raw["blocks"]
        if not blocks:
            # Nothing to measure; avoid an IndexError on pages without blocks.
            print("No text blocks found on page")
            return
        bbox = fitz.Rect(blocks[0]["bbox"])
        print(f"Text Bbox (raw): {bbox}")

        # Only attempt the transform when the attribute is available.
        if hasattr(page, "rotation_matrix"):
            mat = page.rotation_matrix
            print(f"Page.rotation_matrix: {mat}")
            t_bbox = bbox * mat
            print(f"Transformed Bbox: {t_bbox}")
        else:
            print("No page.rotation_matrix")
    finally:
        # Close the document on every exit path, including errors.
        doc.close()
46
+
47
# Script entry point: build the rotated sample PDF, then print its diagnostics.
if __name__ == "__main__":
    analyze_pdf(create_rotated_pdf())
generate_test_pdf.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pymupdf
3
+ import os
4
+
5
def create_test_pdf(filename="test_document.pdf"):
    """Write a two-page sample PDF.

    Page 1 holds a title, a plain paragraph, a line of math-like text, two
    side-by-side "column" headings with content, and a rectangle with a
    label. Page 2 holds a heading and one sentence.

    Args:
        filename: Path the PDF is saved to.
    """
    doc = pymupdf.open()
    try:
        page = doc.new_page()

        # Title
        page.insert_text((50, 50), "PDF Structure Inspector Test Document", fontsize=24)

        # Normal paragraph
        text = "This is a normal paragraph of text to test basic extraction."
        page.insert_text((50, 80), text, fontsize=12)

        # Math-like text to test math detection
        math_text = "Here is some math: f(x) = sum(x_i) for i in N. Also x^2 + y^2 = r^2."
        page.insert_text((50, 110), math_text, fontsize=12)

        # Text that might look like a header/column
        page.insert_text((50, 150), "Column 1", fontsize=14)
        page.insert_text((300, 150), "Column 2", fontsize=14)

        page.insert_text((50, 170), "Left side content.", fontsize=12)
        page.insert_text((300, 170), "Right side content.", fontsize=12)

        # Add a drawing (vector)
        page.draw_rect((50, 200, 150, 250), color=(0, 0, 1))
        # NOTE(review): the label color is (1, 1, 1) and the rectangle has no
        # fill argument; confirm the label is meant to be drawn in white.
        page.insert_text((55, 225), "Vector Box", fontsize=10, color=(1, 1, 1))

        # Add a second page for batch testing
        page2 = doc.new_page()
        page2.insert_text((50, 50), "Page 2 - Batch Analysis Test", fontsize=24)
        page2.insert_text((50, 80), "Just another page to verify multi-page processing.", fontsize=12)

        doc.save(filename)
    finally:
        # Release the document even if building or saving fails.
        doc.close()
    print(f"Created {filename}")
38
+
39
# Script entry point: generate the sample PDF under its default filename.
if __name__ == "__main__":
    create_test_pdf()
logic.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27322fa134c2fbce6dd38e1f24b2c0d65c40a50f7abd9bc154d71f6f91464e48
3
+ size 635839
repro_error.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pymupdf as fitz
3
+ from layout_utils import extract_blocks_spans
4
+
5
def test_extraction():
    """Run ``extract_blocks_spans`` on page 0 of ``logic.pdf`` and inspect bboxes.

    For each returned block, prints the type of its ``bbox``; for tuple
    bboxes also prints the length and, when there are four items, the value
    itself, then unpacks it and converts the first coordinate to ``float``.
    Any exception raised during extraction or inspection is printed with its
    traceback instead of propagating.
    """
    import traceback

    # Open the real PDF
    doc = fitz.open("logic.pdf")

    try:
        blocks = extract_blocks_spans(doc, 0)
        print(f"Blocks extracted: {len(blocks)}")
        for i, b in enumerate(blocks):
            bbox = b.bbox
            print(f"Block {i} bbox type: {type(bbox)}")
            if isinstance(bbox, tuple):
                print(f"Block {i} bbox len: {len(bbox)}")
                if len(bbox) == 4:
                    print(f"Block {i} sample: {bbox}")
                    # Try unpacking
                    x0, y0, x1, y1 = bbox
                    # Try explicit float cast
                    _ = float(x0)
            else:
                print(f"Block {i} bbox is NOT tuple: {bbox}")

    except Exception as e:
        # Diagnostic script: report whatever went wrong rather than crash.
        print(f"Error caught: {type(e).__name__}: {e}")
        traceback.print_exc()
    finally:
        # Close the document whether or not the inspection succeeded.
        doc.close()
32
+
33
# Script entry point: run the extraction check when executed directly.
if __name__ == "__main__":
    test_extraction()
rotated_test.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:276ce24070de6a5c2f034980be647556e73d8a9e25fd051089ef807c10ebbdaa
3
+ size 746
test_app_analyze.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from app import analyze
3
+ import traceback
4
+
5
def test_analyze():
    """Call ``analyze`` once on ``test_document.pdf`` and print a short report.

    The call uses page number 1, 72 dpi, the "raw" order mode, and both the
    span and math-highlight flags turned off. On success the sizes of the
    returned values are printed; any exception is reported with a traceback
    rather than re-raised.
    """
    pdf_path = "test_document.pdf"
    # Positional arguments, in order: path, page number, dpi, order mode,
    # show-spans flag, highlight-math flag.
    call_args = (pdf_path, 1, 72, "raw", False, False)

    try:
        print(f"Analyzing {pdf_path}...")
        overlay, report, summary, preview = analyze(*call_args)
        print("Analysis successful!")
        print(f"Summary length: {len(summary)}")
        print(f"Preview length: {len(preview)}")
        print(f"Report keys: {report.keys()}")
        print(f"Overlay size: {overlay.size}")
    except Exception as exc:
        print(f"CRASHED: {exc}")
        traceback.print_exc()
26
+
27
# Script entry point: run the analyze smoke check when executed directly.
if __name__ == "__main__":
    test_analyze()
test_document.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f2648892600eb411eae617de92b54231946e51639c564331537886f55f94cd2
3
+ size 2759