Spaces:
Runtime error
Runtime error
Aryan Jain committed on
Commit ·
dc472fa
1
Parent(s): 2a79756
add margin
Browse files
- src/controllers/_file_controller.py +1 -0
- src/utils/_file_client.py +113 -9
src/controllers/_file_controller.py
CHANGED
|
@@ -28,6 +28,7 @@ class FileController:
|
|
| 28 |
try:
|
| 29 |
async with self.service() as service:
|
| 30 |
result = await service.extract_file(file=file)
|
|
|
|
| 31 |
return result
|
| 32 |
except Exception as e:
|
| 33 |
logger.error(e)
|
|
|
|
| 28 |
try:
|
| 29 |
async with self.service() as service:
|
| 30 |
result = await service.extract_file(file=file)
|
| 31 |
+
result.update({"file_name": file.filename})
|
| 32 |
return result
|
| 33 |
except Exception as e:
|
| 34 |
logger.error(e)
|
src/utils/_file_client.py
CHANGED
|
@@ -28,28 +28,132 @@ class FileClient:
|
|
| 28 |
"pages": []
|
| 29 |
}
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
for page_num, page in enumerate(doc):
|
| 32 |
blocks = page.get_text("dict")["blocks"]
|
| 33 |
page_content = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
for block in blocks:
|
| 36 |
-
if "lines" in block:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
for line in block["lines"]:
|
| 38 |
for span in line["spans"]:
|
| 39 |
-
|
| 40 |
"text": span["text"],
|
| 41 |
"font": span["font"],
|
| 42 |
"size": span["size"],
|
| 43 |
"color": span.get("color", None),
|
| 44 |
-
"flags": span.get("flags", None)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
"page_number": page_num + 1,
|
| 49 |
-
"width":
|
| 50 |
-
"height":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
"content": page_content
|
| 52 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
return layout_data
|
| 55 |
|
|
|
|
| 28 |
"pages": []
|
| 29 |
}
|
| 30 |
|
| 31 |
+
all_headers = []
|
| 32 |
+
all_footers = []
|
| 33 |
+
|
| 34 |
for page_num, page in enumerate(doc):
|
| 35 |
blocks = page.get_text("dict")["blocks"]
|
| 36 |
page_content = []
|
| 37 |
+
text_blocks = []
|
| 38 |
+
header_blocks = []
|
| 39 |
+
footer_blocks = []
|
| 40 |
+
body_blocks = []
|
| 41 |
+
|
| 42 |
+
page_rect = page.rect
|
| 43 |
+
media_box = page.mediabox if hasattr(page, 'mediabox') else page_rect
|
| 44 |
+
crop_box = page.cropbox
|
| 45 |
+
|
| 46 |
+
header_region = (page_rect.y0, page_rect.y0 + (page_rect.height * 0.2))
|
| 47 |
+
footer_region = (page_rect.y1 - (page_rect.height * 0.2), page_rect.y1)
|
| 48 |
|
| 49 |
for block in blocks:
|
| 50 |
+
if "lines" in block and block.get("bbox"):
|
| 51 |
+
block_bbox = block["bbox"]
|
| 52 |
+
|
| 53 |
+
is_header = block_bbox[1] >= header_region[0] and block_bbox[3] <= header_region[1]
|
| 54 |
+
is_footer = block_bbox[1] >= footer_region[0] and block_bbox[3] <= footer_region[1]
|
| 55 |
+
|
| 56 |
+
if is_header:
|
| 57 |
+
header_blocks.append(block_bbox)
|
| 58 |
+
elif is_footer:
|
| 59 |
+
footer_blocks.append(block_bbox)
|
| 60 |
+
else:
|
| 61 |
+
body_blocks.append(block_bbox)
|
| 62 |
+
text_blocks.append(block_bbox)
|
| 63 |
+
|
| 64 |
for line in block["lines"]:
|
| 65 |
for span in line["spans"]:
|
| 66 |
+
content_item = {
|
| 67 |
"text": span["text"],
|
| 68 |
"font": span["font"],
|
| 69 |
"size": span["size"],
|
| 70 |
"color": span.get("color", None),
|
| 71 |
+
"flags": span.get("flags", None),
|
| 72 |
+
"is_header": is_header,
|
| 73 |
+
"is_footer": is_footer
|
| 74 |
+
}
|
| 75 |
+
page_content.append(content_item)
|
| 76 |
+
|
| 77 |
+
if body_blocks:
|
| 78 |
+
min_x = min(block[0] for block in body_blocks)
|
| 79 |
+
min_y = min(block[1] for block in body_blocks)
|
| 80 |
+
max_x = max(block[2] for block in body_blocks)
|
| 81 |
+
max_y = max(block[3] for block in body_blocks)
|
| 82 |
+
|
| 83 |
+
margin_left = (min_x - page_rect.x0) / 72
|
| 84 |
+
margin_top = (min_y - page_rect.y0) / 72
|
| 85 |
+
margin_right = (page_rect.x1 - max_x) / 72
|
| 86 |
+
margin_bottom = (page_rect.y1 - max_y) / 72
|
| 87 |
+
else:
|
| 88 |
+
margin_left = margin_top = margin_right = margin_bottom = 0
|
| 89 |
+
|
| 90 |
+
crop_margin_left = (crop_box.x0 - media_box.x0) / 72
|
| 91 |
+
crop_margin_top = (crop_box.y0 - media_box.y0) / 72
|
| 92 |
+
crop_margin_right = (media_box.x1 - crop_box.x1) / 72
|
| 93 |
+
crop_margin_bottom = (media_box.y1 - crop_box.y1) / 72
|
| 94 |
+
|
| 95 |
+
header_height = 0
|
| 96 |
+
footer_height = 0
|
| 97 |
+
if header_blocks:
|
| 98 |
+
header_min_y = min(block[1] for block in header_blocks)
|
| 99 |
+
header_max_y = max(block[3] for block in header_blocks)
|
| 100 |
+
header_height = (header_max_y - header_min_y) / 72
|
| 101 |
+
if footer_blocks:
|
| 102 |
+
footer_min_y = min(block[1] for block in footer_blocks)
|
| 103 |
+
footer_max_y = max(block[3] for block in footer_blocks)
|
| 104 |
+
footer_height = (footer_max_y - footer_min_y) / 72
|
| 105 |
+
|
| 106 |
+
header_text = ""
|
| 107 |
+
footer_text = ""
|
| 108 |
+
for item in page_content:
|
| 109 |
+
if item["is_header"]:
|
| 110 |
+
header_text += item["text"] + " "
|
| 111 |
+
elif item["is_footer"]:
|
| 112 |
+
footer_text += item["text"] + " "
|
| 113 |
+
|
| 114 |
+
header_text = header_text.strip()
|
| 115 |
+
footer_text = footer_text.strip()
|
| 116 |
+
|
| 117 |
+
if header_text:
|
| 118 |
+
all_headers.append(header_text)
|
| 119 |
+
if footer_text:
|
| 120 |
+
all_footers.append(footer_text)
|
| 121 |
+
|
| 122 |
+
page_data = {
|
| 123 |
"page_number": page_num + 1,
|
| 124 |
+
"width": page_rect.width,
|
| 125 |
+
"height": page_rect.height,
|
| 126 |
+
"margin_top": f"{round(margin_top, 1)} inches",
|
| 127 |
+
"margin_left": f"{round(margin_left, 1)} inches",
|
| 128 |
+
"margin_right": f"{round(margin_right, 1)} inches",
|
| 129 |
+
"margin_bottom": f"{round(margin_bottom, 1)} inches",
|
| 130 |
+
"header_height": f"{header_height} inches",
|
| 131 |
+
"footer_height": f"{footer_height} inches",
|
| 132 |
+
"has_header": len(header_blocks) > 0,
|
| 133 |
+
"has_footer": len(footer_blocks) > 0,
|
| 134 |
"content": page_content
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
if page_num == 0:
|
| 138 |
+
page_data["is_first_page"] = True
|
| 139 |
+
|
| 140 |
+
layout_data["pages"].append(page_data)
|
| 141 |
+
|
| 142 |
+
if all_headers:
|
| 143 |
+
unique_headers = set(all_headers)
|
| 144 |
+
layout_data["header_analysis"] = {
|
| 145 |
+
"total_pages_with_headers": len(all_headers),
|
| 146 |
+
"unique_headers": len(unique_headers),
|
| 147 |
+
"is_header_consistent": len(unique_headers) == 1 if all_headers else False
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
if all_footers:
|
| 151 |
+
unique_footers = set(all_footers)
|
| 152 |
+
layout_data["footer_analysis"] = {
|
| 153 |
+
"total_pages_with_footers": len(all_footers),
|
| 154 |
+
"unique_footers": len(unique_footers),
|
| 155 |
+
"is_footer_consistent": len(unique_footers) == 1 if all_footers else False
|
| 156 |
+
}
|
| 157 |
|
| 158 |
return layout_data
|
| 159 |
|