jobian committed on
Commit
0d7e7bf
·
1 Parent(s): cf97d16

Added support for multiple files

Browse files
Files changed (4) hide show
  1. .env +0 -0
  2. app.py +9 -7
  3. config.py +9 -5
  4. services/extraction_service.py +66 -16
.env ADDED
File without changes
app.py CHANGED
@@ -1,18 +1,17 @@
1
- import os
2
  import gradio as gr
3
  from services.extraction_service import extract_tables
4
- from config import IN_SPACES
5
 
6
  with gr.Blocks(title="Tables Extractor", theme=gr.themes.Soft()) as demo:
7
  gr.Markdown(
8
  """
9
  # Table Extraction
10
- Upload a **text-based** PDF and extract tables to structured JSON.
11
- The UI renders detected tables; you can also download the JSON + metrics.
12
  """
13
  )
14
  with gr.Row():
15
- inp = gr.File(file_types=[".pdf"], label="Upload PDF", type="filepath")
16
  with gr.Row():
17
  run_btn = gr.Button("Extract Tables", variant="primary")
18
  with gr.Row():
@@ -28,5 +27,8 @@ with gr.Blocks(title="Tables Extractor", theme=gr.themes.Soft()) as demo:
28
  status, downloads, gallery, html_view])
29
 
30
  if __name__ == "__main__":
31
- demo.launch(server_name="0.0.0.0", server_port=int(
32
- os.getenv("PORT", "7860")), debug=not IN_SPACES)
 
 
 
 
 
1
  import gradio as gr
2
  from services.extraction_service import extract_tables
3
+ from config import SERVER_NAME, SERVER_PORT, IN_SPACES
4
 
5
  with gr.Blocks(title="Tables Extractor", theme=gr.themes.Soft()) as demo:
6
  gr.Markdown(
7
  """
8
  # Table Extraction
9
+ Upload up to **15 text-based PDFs** and extract tables to structured JSON.
10
+ The UI renders detected tables; you can also download JSON + metrics.
11
  """
12
  )
13
  with gr.Row():
14
+ inp = gr.File(file_types=[".pdf"], label="Upload PDFs", type="filepath", file_count="multiple")
15
  with gr.Row():
16
  run_btn = gr.Button("Extract Tables", variant="primary")
17
  with gr.Row():
 
27
  status, downloads, gallery, html_view])
28
 
29
  if __name__ == "__main__":
30
+ demo.launch(
31
+ server_name=SERVER_NAME,
32
+ server_port=SERVER_PORT,
33
+ debug=not IN_SPACES
34
+ )
config.py CHANGED
@@ -8,9 +8,13 @@ OUTPUTS_DIR = BASE_DIR / "outputs"
8
  UPLOAD_DIR.mkdir(exist_ok=True)
9
  OUTPUTS_DIR.mkdir(exist_ok=True)
10
 
11
- # Detect Spaces
12
  IN_SPACES = bool(
13
- os.environ.get("SPACE_ID") or
14
- os.environ.get("HF_SPACE") or
15
- os.environ.get("SYSTEM") == "spaces"
16
- )
 
 
 
 
 
8
  UPLOAD_DIR.mkdir(exist_ok=True)
9
  OUTPUTS_DIR.mkdir(exist_ok=True)
10
 
11
+ # Detect if running on Hugging Face Spaces
12
  IN_SPACES = bool(
13
+ os.environ.get("SPACE_ID")
14
+ or os.environ.get("HF_SPACE")
15
+ or os.environ.get("SYSTEM") == "spaces"
16
+ )
17
+
18
+ # Networking defaults
19
+ SERVER_NAME = "0.0.0.0" if IN_SPACES else "127.0.0.1"
20
+ SERVER_PORT = int(os.getenv("PORT", "7860"))
services/extraction_service.py CHANGED
@@ -1,6 +1,4 @@
1
- import os
2
  import time
3
- import json
4
  from pathlib import Path
5
  from datetime import datetime, timezone
6
 
@@ -15,28 +13,30 @@ from utils.pdf_utils import table_image
15
  from config import IN_SPACES
16
 
17
 
18
- def extract_tables(file_obj):
19
- """Main table extraction pipeline."""
20
  t0 = time.time()
21
  if file_obj is None:
22
  return safe_err("Please upload a PDF."), [], [], ""
23
 
24
- # Save uploaded file
25
  original_name = Path(file_obj.name).name
26
  stem = Path(original_name).stem
27
  run_dir = unique_run_dir(stem)
28
- uploads_dir = run_dir / "uploads"
29
- outputs_dir = run_dir / "outputs"
30
- imgs_dir = outputs_dir / "images"
 
 
31
  for d in [uploads_dir, outputs_dir, imgs_dir]:
32
  d.mkdir(parents=True, exist_ok=True)
33
 
 
34
  saved_pdf_path = uploads_dir / original_name
35
  with open(file_obj.name, "rb") as src, open(saved_pdf_path, "wb") as dst:
36
  dst.write(src.read())
37
 
38
  detector, formatter = AutoTableDetector(), AutoTableFormatter()
39
-
40
  all_tables_json, table_images, html_blocks = [], [], []
41
  per_page_counts, n_pages, global_tid = {}, 0, 0
42
 
@@ -60,7 +60,8 @@ def extract_tables(file_obj):
60
  except Exception as e:
61
  if not IN_SPACES:
62
  html_blocks.append(
63
- f"<pre>{safe_err(f'Detection failed on page {human_page_no}', e)}</pre>")
 
64
  page_idx += 1
65
  continue
66
 
@@ -68,7 +69,8 @@ def extract_tables(file_obj):
68
 
69
  if not cropped_tables:
70
  html_blocks.append(
71
- f"<div class='meta'>Page {human_page_no}: no tables detected.</div>")
 
72
  else:
73
  for i, ct in enumerate(cropped_tables, start=1):
74
  try:
@@ -77,9 +79,15 @@ def extract_tables(file_obj):
77
  except Exception as e:
78
  if not IN_SPACES:
79
  html_blocks.append(
80
- f"<pre>{safe_err(f'Formatting failed for page {human_page_no}, table {i}', e)}</pre>")
 
81
  continue
82
 
 
 
 
 
 
83
  table_json = {
84
  "page": human_page_no,
85
  "table_id": global_tid,
@@ -89,6 +97,12 @@ def extract_tables(file_obj):
89
  "n_cols": df.shape[1],
90
  "data": df.to_dict(orient="records"),
91
  }
 
 
 
 
 
 
92
  all_tables_json.append(table_json)
93
 
94
  try:
@@ -100,9 +114,8 @@ def extract_tables(file_obj):
100
  except Exception:
101
  pass
102
 
103
- title = f"Page {human_page_no} · Table {i} (ID {global_tid})"
104
  html_blocks.append(df_to_html_table(df, title))
105
-
106
  global_tid += 1
107
 
108
  page_idx += 1
@@ -126,10 +139,47 @@ def extract_tables(file_obj):
126
  save_json(metrics_path, metrics)
127
 
128
  if not all_tables_json:
129
- msg = "No tables found." if IN_SPACES else "⚠️ No tables found."
130
  html_out = style_block() + f"<div class='meta'>{msg}</div>"
131
  return msg, [str(json_path), str(metrics_path)], table_images, html_out
132
 
133
- status = f"✅ Extracted {len(all_tables_json)} table(s) from {n_pages} page(s)."
134
  html_out = style_block() + "\n".join(html_blocks)
135
  return status, [str(json_path), str(metrics_path)], table_images, html_out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import time
 
2
  from pathlib import Path
3
  from datetime import datetime, timezone
4
 
 
13
  from config import IN_SPACES
14
 
15
 
16
+ def process_single_pdf(file_obj):
17
+ """Extract tables from a single PDF."""
18
  t0 = time.time()
19
  if file_obj is None:
20
  return safe_err("Please upload a PDF."), [], [], ""
21
 
22
+ # Prepare dirs
23
  original_name = Path(file_obj.name).name
24
  stem = Path(original_name).stem
25
  run_dir = unique_run_dir(stem)
26
+ uploads_dir, outputs_dir, imgs_dir = (
27
+ run_dir / "uploads",
28
+ run_dir / "outputs",
29
+ run_dir / "outputs" / "images",
30
+ )
31
  for d in [uploads_dir, outputs_dir, imgs_dir]:
32
  d.mkdir(parents=True, exist_ok=True)
33
 
34
+ # Save file
35
  saved_pdf_path = uploads_dir / original_name
36
  with open(file_obj.name, "rb") as src, open(saved_pdf_path, "wb") as dst:
37
  dst.write(src.read())
38
 
39
  detector, formatter = AutoTableDetector(), AutoTableFormatter()
 
40
  all_tables_json, table_images, html_blocks = [], [], []
41
  per_page_counts, n_pages, global_tid = {}, 0, 0
42
 
 
60
  except Exception as e:
61
  if not IN_SPACES:
62
  html_blocks.append(
63
+ f"<pre>{safe_err(f'Detection failed on page {human_page_no}', e)}</pre>"
64
+ )
65
  page_idx += 1
66
  continue
67
 
 
69
 
70
  if not cropped_tables:
71
  html_blocks.append(
72
+ f"<div class='meta'>Page {human_page_no}: no tables detected.</div>"
73
+ )
74
  else:
75
  for i, ct in enumerate(cropped_tables, start=1):
76
  try:
 
79
  except Exception as e:
80
  if not IN_SPACES:
81
  html_blocks.append(
82
+ f"<pre>{safe_err(f'Formatting failed for page {human_page_no}, table {i}', e)}</pre>"
83
+ )
84
  continue
85
 
86
+
87
+ # Deduplicate column names before exporting
88
+ original_cols = list(df.columns)
89
+ df.columns, renamed = deduplicate_columns(df.columns)
90
+
91
  table_json = {
92
  "page": human_page_no,
93
  "table_id": global_tid,
 
97
  "n_cols": df.shape[1],
98
  "data": df.to_dict(orient="records"),
99
  }
100
+
101
+ # Add metadata if renaming happened
102
+ if renamed:
103
+ table_json["renamed_columns"] = True
104
+ table_json["original_columns"] = [str(c) for c in original_cols]
105
+
106
  all_tables_json.append(table_json)
107
 
108
  try:
 
114
  except Exception:
115
  pass
116
 
117
+ title = f"{original_name} · Page {human_page_no} · Table {i} (ID {global_tid})"
118
  html_blocks.append(df_to_html_table(df, title))
 
119
  global_tid += 1
120
 
121
  page_idx += 1
 
139
  save_json(metrics_path, metrics)
140
 
141
  if not all_tables_json:
142
+ msg = "No tables found." if IN_SPACES else f"⚠️ No tables found in {original_name}"
143
  html_out = style_block() + f"<div class='meta'>{msg}</div>"
144
  return msg, [str(json_path), str(metrics_path)], table_images, html_out
145
 
146
+ status = f"✅ Extracted {len(all_tables_json)} table(s) from {n_pages} page(s) in {original_name}."
147
  html_out = style_block() + "\n".join(html_blocks)
148
  return status, [str(json_path), str(metrics_path)], table_images, html_out
149
+
150
+
151
+ def extract_tables(file_objs):
152
+ """Handle multiple PDF uploads (max 15)."""
153
+ if not file_objs:
154
+ return "Please upload at least one PDF.", [], [], ""
155
+
156
+ if len(file_objs) > 15:
157
+ return "❌ Too many PDFs uploaded. Limit is 15.", [], [], ""
158
+
159
+ all_status, all_files, all_images, all_html = [], [], [], []
160
+
161
+ for file_obj in file_objs:
162
+ status, files, images, html = process_single_pdf(file_obj)
163
+ all_status.append(status)
164
+ all_files.extend(files)
165
+ all_images.extend(images)
166
+ all_html.append(html)
167
+
168
+ return "\n".join(all_status), all_files, all_images, "\n".join(all_html)
169
+
170
+
171
def deduplicate_columns(columns):
    """Make duplicate column names unique with numeric suffixes.

    The second occurrence of ``col`` becomes ``col.1``, the third ``col.2``,
    and so on (mirroring pandas' duplicate-column mangling).

    Bug fix over the naive version: if the input already contains a name that
    matches a generated suffix (e.g. ``["a", "a.1", "a"]``), the naive rename
    would emit a second ``a.1`` — the output still had duplicates. Suffixes
    are now advanced until the candidate name is genuinely unused, and
    generated names are reserved so later originals cannot reuse them.

    Args:
        columns: Iterable of column names (hashable, usable in an f-string).

    Returns:
        Tuple ``(new_cols, renamed)``: the list of unique names, and a bool
        flag indicating whether any rename happened.
    """
    seen = {}
    new_cols = []
    renamed = False
    for col in columns:
        if col not in seen:
            seen[col] = 0
            new_cols.append(col)
        else:
            # Advance the suffix until the candidate does not collide with
            # either an original column or a previously generated name.
            seen[col] += 1
            candidate = f"{col}.{seen[col]}"
            while candidate in seen:
                seen[col] += 1
                candidate = f"{col}.{seen[col]}"
            seen[candidate] = 0  # reserve so later duplicates skip it
            new_cols.append(candidate)
            renamed = True
    return new_cols, renamed