Aryan Jain committed on
Commit
dc472fa
·
1 Parent(s): 2a79756

add margin

Browse files
src/controllers/_file_controller.py CHANGED
@@ -28,6 +28,7 @@ class FileController:
28
  try:
29
  async with self.service() as service:
30
  result = await service.extract_file(file=file)
 
31
  return result
32
  except Exception as e:
33
  logger.error(e)
 
28
  try:
29
  async with self.service() as service:
30
  result = await service.extract_file(file=file)
31
+ result.update({"file_name": file.filename})
32
  return result
33
  except Exception as e:
34
  logger.error(e)
src/utils/_file_client.py CHANGED
@@ -28,28 +28,132 @@ class FileClient:
28
  "pages": []
29
  }
30
 
 
 
 
31
  for page_num, page in enumerate(doc):
32
  blocks = page.get_text("dict")["blocks"]
33
  page_content = []
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  for block in blocks:
36
- if "lines" in block:
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  for line in block["lines"]:
38
  for span in line["spans"]:
39
- page_content.append({
40
  "text": span["text"],
41
  "font": span["font"],
42
  "size": span["size"],
43
  "color": span.get("color", None),
44
- "flags": span.get("flags", None)
45
- })
46
-
47
- layout_data["pages"].append({
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  "page_number": page_num + 1,
49
- "width": page.rect.width,
50
- "height": page.rect.height,
 
 
 
 
 
 
 
 
51
  "content": page_content
52
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  return layout_data
55
 
 
28
  "pages": []
29
  }
30
 
31
+ all_headers = []
32
+ all_footers = []
33
+
34
  for page_num, page in enumerate(doc):
35
  blocks = page.get_text("dict")["blocks"]
36
  page_content = []
37
+ text_blocks = []
38
+ header_blocks = []
39
+ footer_blocks = []
40
+ body_blocks = []
41
+
42
+ page_rect = page.rect
43
+ media_box = page.mediabox if hasattr(page, 'mediabox') else page_rect
44
+ crop_box = page.cropbox
45
+
46
+ header_region = (page_rect.y0, page_rect.y0 + (page_rect.height * 0.2))
47
+ footer_region = (page_rect.y1 - (page_rect.height * 0.2), page_rect.y1)
48
 
49
  for block in blocks:
50
+ if "lines" in block and block.get("bbox"):
51
+ block_bbox = block["bbox"]
52
+
53
+ is_header = block_bbox[1] >= header_region[0] and block_bbox[3] <= header_region[1]
54
+ is_footer = block_bbox[1] >= footer_region[0] and block_bbox[3] <= footer_region[1]
55
+
56
+ if is_header:
57
+ header_blocks.append(block_bbox)
58
+ elif is_footer:
59
+ footer_blocks.append(block_bbox)
60
+ else:
61
+ body_blocks.append(block_bbox)
62
+ text_blocks.append(block_bbox)
63
+
64
  for line in block["lines"]:
65
  for span in line["spans"]:
66
+ content_item = {
67
  "text": span["text"],
68
  "font": span["font"],
69
  "size": span["size"],
70
  "color": span.get("color", None),
71
+ "flags": span.get("flags", None),
72
+ "is_header": is_header,
73
+ "is_footer": is_footer
74
+ }
75
+ page_content.append(content_item)
76
+
77
+ if body_blocks:
78
+ min_x = min(block[0] for block in body_blocks)
79
+ min_y = min(block[1] for block in body_blocks)
80
+ max_x = max(block[2] for block in body_blocks)
81
+ max_y = max(block[3] for block in body_blocks)
82
+
83
+ margin_left = (min_x - page_rect.x0) / 72
84
+ margin_top = (min_y - page_rect.y0) / 72
85
+ margin_right = (page_rect.x1 - max_x) / 72
86
+ margin_bottom = (page_rect.y1 - max_y) / 72
87
+ else:
88
+ margin_left = margin_top = margin_right = margin_bottom = 0
89
+
90
+ crop_margin_left = (crop_box.x0 - media_box.x0) / 72
91
+ crop_margin_top = (crop_box.y0 - media_box.y0) / 72
92
+ crop_margin_right = (media_box.x1 - crop_box.x1) / 72
93
+ crop_margin_bottom = (media_box.y1 - crop_box.y1) / 72
94
+
95
+ header_height = 0
96
+ footer_height = 0
97
+ if header_blocks:
98
+ header_min_y = min(block[1] for block in header_blocks)
99
+ header_max_y = max(block[3] for block in header_blocks)
100
+ header_height = (header_max_y - header_min_y) / 72
101
+ if footer_blocks:
102
+ footer_min_y = min(block[1] for block in footer_blocks)
103
+ footer_max_y = max(block[3] for block in footer_blocks)
104
+ footer_height = (footer_max_y - footer_min_y) / 72
105
+
106
+ header_text = ""
107
+ footer_text = ""
108
+ for item in page_content:
109
+ if item["is_header"]:
110
+ header_text += item["text"] + " "
111
+ elif item["is_footer"]:
112
+ footer_text += item["text"] + " "
113
+
114
+ header_text = header_text.strip()
115
+ footer_text = footer_text.strip()
116
+
117
+ if header_text:
118
+ all_headers.append(header_text)
119
+ if footer_text:
120
+ all_footers.append(footer_text)
121
+
122
+ page_data = {
123
  "page_number": page_num + 1,
124
+ "width": page_rect.width,
125
+ "height": page_rect.height,
126
+ "margin_top": f"{round(margin_top, 1)} inches",
127
+ "margin_left": f"{round(margin_left, 1)} inches",
128
+ "margin_right": f"{round(margin_right, 1)} inches",
129
+ "margin_bottom": f"{round(margin_bottom, 1)} inches",
130
+ "header_height": f"{header_height} inches",
131
+ "footer_height": f"{footer_height} inches",
132
+ "has_header": len(header_blocks) > 0,
133
+ "has_footer": len(footer_blocks) > 0,
134
  "content": page_content
135
+ }
136
+
137
+ if page_num == 0:
138
+ page_data["is_first_page"] = True
139
+
140
+ layout_data["pages"].append(page_data)
141
+
142
+ if all_headers:
143
+ unique_headers = set(all_headers)
144
+ layout_data["header_analysis"] = {
145
+ "total_pages_with_headers": len(all_headers),
146
+ "unique_headers": len(unique_headers),
147
+ "is_header_consistent": len(unique_headers) == 1 if all_headers else False
148
+ }
149
+
150
+ if all_footers:
151
+ unique_footers = set(all_footers)
152
+ layout_data["footer_analysis"] = {
153
+ "total_pages_with_footers": len(all_footers),
154
+ "unique_footers": len(unique_footers),
155
+ "is_footer_consistent": len(unique_footers) == 1 if all_footers else False
156
+ }
157
 
158
  return layout_data
159