Spaces:
Sleeping
Sleeping
Include example PDFs with Git LFS and add helper scripts
Browse files
- .gitattributes +1 -0
- .gitignore +1 -1
- 18.1 Notes.pdf +3 -0
- debug_rotation.py +49 -0
- generate_test_pdf.py +40 -0
- logic.pdf +3 -0
- repro_error.py +34 -0
- rotated_test.pdf +3 -0
- test_app_analyze.py +28 -0
- test_document.pdf +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -68,7 +68,7 @@ htmlcov/
|
|
| 68 |
# =========================
|
| 69 |
uploads/
|
| 70 |
data/
|
| 71 |
-
*.pdf
|
| 72 |
*.png
|
| 73 |
*.jpg
|
| 74 |
*.jpeg
|
|
|
|
| 68 |
# =========================
|
| 69 |
uploads/
|
| 70 |
data/
|
| 71 |
+
# *.pdf
|
| 72 |
*.png
|
| 73 |
*.jpg
|
| 74 |
*.jpeg
|
18.1 Notes.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed3d2790d00abbd7d89fbb53b745de70b46a667613b2ad52db498be48ea4d1f1
|
| 3 |
+
size 3392377
|
debug_rotation.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pymupdf as fitz
|
| 3 |
+
from PIL import Image, ImageDraw
|
| 4 |
+
|
| 5 |
+
def create_rotated_pdf(path="rotated_test.pdf"):
    """Create a one-page PDF with rotated page and return the file path.

    A new document with a single page is created, the string
    "Hello World" is inserted at (100, 100) with font size 20, the page
    rotation is set to 90 degrees, and the result is saved.

    Args:
        path: Destination file name. Defaults to "rotated_test.pdf", the
            name this script has always written.

    Returns:
        The path the document was saved to.
    """
    doc = fitz.open()
    try:
        page = doc.new_page()

        # The text is inserted before the rotation is applied, so the
        # insertion point is given relative to the not-yet-rotated page.
        page.insert_text((100, 100), "Hello World", fontsize=20)

        # Rotate page 90 degrees.
        page.set_rotation(90)

        doc.save(path)
    finally:
        # Release the document even if inserting or saving fails.
        doc.close()
    return path
|
| 17 |
+
|
| 18 |
+
def analyze_pdf(path):
    """Print rotation diagnostics for the first page of a PDF.

    Prints the page rotation, the page rectangle, the default pixmap
    dimensions, the bounding box of the first block returned by
    ``get_text("dict")`` and, when the page exposes ``rotation_matrix``,
    that matrix together with the bounding box multiplied by it.

    Args:
        path: Path of the PDF to open.
    """
    doc = fitz.open(path)
    try:
        page = doc[0]

        print(f"Page Rotation: {page.rotation}")
        # NOTE(review): the label says "unrotated", but in PyMuPDF
        # page.rect appears to reflect the page rotation (cropbox/mediabox
        # are the unrotated ones) - confirm before trusting this label.
        print(f"Page Rect (unrotated): {page.rect}")

        # Pixmap
        pix = page.get_pixmap()
        print(f"Pixmap WxH: {pix.width}x{pix.height}")

        # Text dict: only the first block is inspected.
        blocks = page.get_text("dict")["blocks"]
        if not blocks:
            # A page without any blocks would otherwise raise IndexError.
            print("No blocks found on page")
            return
        bbox = fitz.Rect(blocks[0]["bbox"])
        print(f"Text Bbox (raw): {bbox}")

        # rotation_matrix might not exist on every PyMuPDF version, so
        # probe for it instead of assuming it is there.
        if hasattr(page, "rotation_matrix"):
            mat = page.rotation_matrix
            print(f"Page.rotation_matrix: {mat}")
            t_bbox = bbox * mat
            print(f"Transformed Bbox: {t_bbox}")
        else:
            print("No page.rotation_matrix")
    finally:
        # Close the document on every exit path, including errors.
        doc.close()
|
| 46 |
+
|
| 47 |
+
if __name__ == "__main__":
    # Build the rotated fixture first, then print its diagnostics.
    analyze_pdf(create_rotated_pdf())
|
generate_test_pdf.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pymupdf
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
def create_test_pdf(filename="test_document.pdf"):
    """Write a small two-page PDF used as a test fixture.

    Page 1 gets a title, a plain paragraph, a line of math-like text, two
    side-by-side "column" headings with one line of content each, and a
    rectangle with a short label. Page 2 gets a title and one sentence.

    Args:
        filename: Path the document is saved to.
    """
    doc = pymupdf.open()
    try:
        page = doc.new_page()

        # Title
        page.insert_text(
            (50, 50), "PDF Structure Inspector Test Document", fontsize=24
        )

        # Normal paragraph
        text = "This is a normal paragraph of text to test basic extraction."
        page.insert_text((50, 80), text, fontsize=12)

        # Math-like text to test math detection
        math_text = (
            "Here is some math: f(x) = sum(x_i) for i in N. "
            "Also x^2 + y^2 = r^2."
        )
        page.insert_text((50, 110), math_text, fontsize=12)

        # Text that might look like a header/column
        page.insert_text((50, 150), "Column 1", fontsize=14)
        page.insert_text((300, 150), "Column 2", fontsize=14)

        page.insert_text((50, 170), "Left side content.", fontsize=12)
        page.insert_text((300, 170), "Right side content.", fontsize=12)

        # Add a drawing (vector)
        page.draw_rect((50, 200, 150, 250), color=(0, 0, 1))
        # NOTE(review): the label color is (1, 1, 1) and the rectangle is
        # drawn without a fill argument, so the label is presumably white
        # on an unfilled box - confirm that this is intended.
        page.insert_text((55, 225), "Vector Box", fontsize=10, color=(1, 1, 1))

        # Add a second page for batch testing
        page2 = doc.new_page()
        page2.insert_text((50, 50), "Page 2 - Batch Analysis Test", fontsize=24)
        page2.insert_text(
            (50, 80),
            "Just another page to verify multi-page processing.",
            fontsize=12,
        )

        doc.save(filename)
    finally:
        # Release the document even if building or saving it fails.
        doc.close()
    print(f"Created {filename}")
|
| 38 |
+
|
| 39 |
+
if __name__ == "__main__":
    # Script entry point: write the fixture under its default file name.
    create_test_pdf()
|
logic.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:27322fa134c2fbce6dd38e1f24b2c0d65c40a50f7abd9bc154d71f6f91464e48
|
| 3 |
+
size 635839
|
repro_error.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pymupdf as fitz
|
| 3 |
+
from layout_utils import extract_blocks_spans
|
| 4 |
+
|
| 5 |
+
def test_extraction():
    """Run ``extract_blocks_spans`` on page 0 of logic.pdf and dump bbox info.

    For every returned block the type of its ``bbox`` attribute is
    printed. Tuple bboxes also have their length printed, and 4-tuples
    are unpacked and their first element cast to ``float`` to surface
    type problems. Any exception raised during extraction or inspection
    is reported with a traceback instead of propagating, since this is a
    reproduction script.
    """
    # Open the real PDF
    doc = fitz.open("logic.pdf")

    try:
        blocks = extract_blocks_spans(doc, 0)
        print(f"Blocks extracted: {len(blocks)}")
        for i, b in enumerate(blocks):
            bbox = b.bbox
            print(f"Block {i} bbox type: {type(bbox)}")
            if isinstance(bbox, tuple):
                print(f"Block {i} bbox len: {len(bbox)}")
                if len(bbox) == 4:
                    print(f"Block {i} sample: {bbox}")
                    # Try unpacking
                    x0, y0, x1, y1 = bbox
                    # Try explicit float cast
                    _ = float(x0)
            else:
                print(f"Block {i} bbox is NOT tuple: {bbox}")

    except Exception as e:
        # Deliberately broad: the point of the script is to show whatever
        # error the extraction produces.
        print(f"Error caught: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release the document whether or not extraction succeeded.
        doc.close()
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
    # Script entry point: run the reproduction once.
    test_extraction()
|
rotated_test.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:276ce24070de6a5c2f034980be647556e73d8a9e25fd051089ef807c10ebbdaa
|
| 3 |
+
size 746
|
test_app_analyze.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from app import analyze
|
| 3 |
+
import traceback
|
| 4 |
+
|
| 5 |
+
def test_analyze():
    """Smoke-run ``analyze`` on test_document.pdf and print result sizes.

    Calls ``analyze`` with page number 1, 72 dpi, order mode "raw" and
    both span display and math highlighting switched off, then prints the
    summary length, preview length, report keys and overlay size. Any
    exception is printed with its traceback rather than propagated.
    """
    pdf_path = "test_document.pdf"
    # Positional settings, in order:
    # page_num, dpi, order_mode, show_spans, highlight_math
    settings = (1, 72, "raw", False, False)

    try:
        print(f"Analyzing {pdf_path}...")
        result = analyze(pdf_path, *settings)
        overlay, report, summary, preview = result
        print("Analysis successful!")
        print(f"Summary length: {len(summary)}")
        print(f"Preview length: {len(preview)}")
        print(f"Report keys: {report.keys()}")
        print(f"Overlay size: {overlay.size}")
    except Exception as e:
        print(f"CRASHED: {e}")
        traceback.print_exc()
|
| 26 |
+
|
| 27 |
+
if __name__ == "__main__":
    # Script entry point: run the smoke check once.
    test_analyze()
|
test_document.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f2648892600eb411eae617de92b54231946e51639c564331537886f55f94cd2
|
| 3 |
+
size 2759
|