Added support for multiple files
- .env  +0 -0
- app.py  +9 -7
- config.py  +9 -5
- services/extraction_service.py  +66 -16
.env
ADDED

File without changes
app.py
CHANGED

@@ -1,18 +1,17 @@
-import os
 import gradio as gr
 from services.extraction_service import extract_tables
-from config import IN_SPACES
+from config import SERVER_NAME, SERVER_PORT, IN_SPACES

 with gr.Blocks(title="Tables Extractor", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # Table Extraction
-        Upload …
-        The UI renders detected tables; you can also download …
+        Upload up to **15 text-based PDFs** and extract tables to structured JSON.
+        The UI renders detected tables; you can also download JSON + metrics.
         """
     )
     with gr.Row():
-        inp = gr.File(file_types=[".pdf"], label="Upload …
+        inp = gr.File(file_types=[".pdf"], label="Upload PDFs", type="filepath", file_count="multiple")
     with gr.Row():
         run_btn = gr.Button("Extract Tables", variant="primary")
     with gr.Row():

@@ -28,5 +27,8 @@ with gr.Blocks(title="Tables Extractor", theme=gr.themes.Soft()) as demo:
         status, downloads, gallery, html_view])

 if __name__ == "__main__":
-    demo.launch(
-    …
+    demo.launch(
+        server_name=SERVER_NAME,
+        server_port=SERVER_PORT,
+        debug=not IN_SPACES
+    )
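The click wiring itself is outside this diff, but the context line `status, downloads, gallery, html_view])` implies a handler along these lines (a hypothetical sketch, not part of the commit; the component names and the exact call are inferred):

    # Inferred wiring — not shown in this diff. With file_count="multiple",
    # Gradio passes a list of uploads, which the new extract_tables() expects.
    run_btn.click(
        extract_tables,
        inputs=[inp],
        outputs=[status, downloads, gallery, html_view],
    )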
config.py
CHANGED

@@ -8,9 +8,13 @@ OUTPUTS_DIR = BASE_DIR / "outputs"
 UPLOAD_DIR.mkdir(exist_ok=True)
 OUTPUTS_DIR.mkdir(exist_ok=True)

-# Detect Spaces
+# Detect if running on Hugging Face Spaces
 IN_SPACES = bool(
-    os.environ.get("SPACE_ID") or
-    os.environ.get("HF_SPACE") or
-    os.environ.get("SYSTEM") == "spaces"
-)
+    os.environ.get("SPACE_ID")
+    or os.environ.get("HF_SPACE")
+    or os.environ.get("SYSTEM") == "spaces"
+)
+
+# Networking defaults
+SERVER_NAME = "0.0.0.0" if IN_SPACES else "127.0.0.1"
+SERVER_PORT = int(os.getenv("PORT", "7860"))
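A quick sanity check of the new defaults (a minimal sketch, assuming it runs from the repo root so `config` is importable; the SPACE_ID value is made up):

    import os

    # Simulate a Spaces environment before config.py reads the variables
    os.environ["SPACE_ID"] = "user/tables-extractor"  # hypothetical ID

    import config

    print(config.IN_SPACES)    # True — SPACE_ID is set
    print(config.SERVER_NAME)  # "0.0.0.0" — bind all interfaces on Spaces
    print(config.SERVER_PORT)  # 7860, unless a PORT env var overrides it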
services/extraction_service.py
CHANGED

@@ -1,6 +1,4 @@
-import os
 import time
-import json
 from pathlib import Path
 from datetime import datetime, timezone

@@ -15,28 +13,30 @@ from utils.pdf_utils import table_image
 from config import IN_SPACES


-def extract_tables(file_obj):
-    """…
+def process_single_pdf(file_obj):
+    """Extract tables from a single PDF."""
     t0 = time.time()
     if file_obj is None:
         return safe_err("Please upload a PDF."), [], [], ""

-    # …
+    # Prepare dirs
     original_name = Path(file_obj.name).name
     stem = Path(original_name).stem
     run_dir = unique_run_dir(stem)
-    uploads_dir …
-    …
-    …
+    uploads_dir, outputs_dir, imgs_dir = (
+        run_dir / "uploads",
+        run_dir / "outputs",
+        run_dir / "outputs" / "images",
+    )
     for d in [uploads_dir, outputs_dir, imgs_dir]:
         d.mkdir(parents=True, exist_ok=True)

+    # Save file
     saved_pdf_path = uploads_dir / original_name
     with open(file_obj.name, "rb") as src, open(saved_pdf_path, "wb") as dst:
         dst.write(src.read())

     detector, formatter = AutoTableDetector(), AutoTableFormatter()
-
     all_tables_json, table_images, html_blocks = [], [], []
     per_page_counts, n_pages, global_tid = {}, 0, 0

@@ -60,7 +60,8 @@ def extract_tables(file_obj):
         except Exception as e:
             if not IN_SPACES:
                 html_blocks.append(
-                    f"<pre>{safe_err(f'Detection failed on page {human_page_no}', e)}</pre>")
+                    f"<pre>{safe_err(f'Detection failed on page {human_page_no}', e)}</pre>"
+                )
             page_idx += 1
             continue

@@ -68,7 +69,8 @@ def extract_tables(file_obj):

         if not cropped_tables:
             html_blocks.append(
-                f"<div class='meta'>Page {human_page_no}: no tables detected.</div>")
+                f"<div class='meta'>Page {human_page_no}: no tables detected.</div>"
+            )
         else:
             for i, ct in enumerate(cropped_tables, start=1):
                 try:

@@ -77,9 +79,15 @@ def extract_tables(file_obj):
                 except Exception as e:
                     if not IN_SPACES:
                         html_blocks.append(
-                            f"<pre>{safe_err(f'Formatting failed for page {human_page_no}, table {i}', e)}</pre>")
+                            f"<pre>{safe_err(f'Formatting failed for page {human_page_no}, table {i}', e)}</pre>"
+                        )
                     continue

+
+                # Deduplicate column names before exporting
+                original_cols = list(df.columns)
+                df.columns, renamed = deduplicate_columns(df.columns)
+
                 table_json = {
                     "page": human_page_no,
                     "table_id": global_tid,

@@ -89,6 +97,12 @@ def extract_tables(file_obj):
                     "n_cols": df.shape[1],
                     "data": df.to_dict(orient="records"),
                 }
+
+                # Add metadata if renaming happened
+                if renamed:
+                    table_json["renamed_columns"] = True
+                    table_json["original_columns"] = [str(c) for c in original_cols]
+
                 all_tables_json.append(table_json)

                 try:

@@ -100,9 +114,8 @@ def extract_tables(file_obj):
                 except Exception:
                     pass

-                title = f"Page {human_page_no} · Table {i} (ID {global_tid})"
+                title = f"{original_name} · Page {human_page_no} · Table {i} (ID {global_tid})"
                 html_blocks.append(df_to_html_table(df, title))
-
                 global_tid += 1

             page_idx += 1

@@ -126,10 +139,47 @@ def extract_tables(file_obj):
     save_json(metrics_path, metrics)

     if not all_tables_json:
-        msg = "No tables found." if IN_SPACES else "⚠️ No tables found …
+        msg = "No tables found." if IN_SPACES else f"⚠️ No tables found in {original_name}"
         html_out = style_block() + f"<div class='meta'>{msg}</div>"
         return msg, [str(json_path), str(metrics_path)], table_images, html_out

-    status = f"✅ Extracted {len(all_tables_json)} table(s) from {n_pages} page(s)."
+    status = f"✅ Extracted {len(all_tables_json)} table(s) from {n_pages} page(s) in {original_name}."
     html_out = style_block() + "\n".join(html_blocks)
     return status, [str(json_path), str(metrics_path)], table_images, html_out
+
+
+def extract_tables(file_objs):
+    """Handle multiple PDF uploads (max 15)."""
+    if not file_objs:
+        return "Please upload at least one PDF.", [], [], ""
+
+    if len(file_objs) > 15:
+        return "❌ Too many PDFs uploaded. Limit is 15.", [], [], ""
+
+    all_status, all_files, all_images, all_html = [], [], [], []
+
+    for file_obj in file_objs:
+        status, files, images, html = process_single_pdf(file_obj)
+        all_status.append(status)
+        all_files.extend(files)
+        all_images.extend(images)
+        all_html.append(html)
+
+    return "\n".join(all_status), all_files, all_images, "\n".join(all_html)
+
+
+def deduplicate_columns(columns):
+    """Auto-rename duplicate column names with suffixes .1, .2, etc."""
+    seen = {}
+    new_cols = []
+    renamed = False
+    for col in columns:
+        if col not in seen:
+            seen[col] = 0
+            new_cols.append(col)
+        else:
+            seen[col] += 1
+            new_name = f"{col}.{seen[col]}"
+            new_cols.append(new_name)
+            renamed = True
+    return new_cols, renamed
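Taken on its own, deduplicate_columns leaves unique header names untouched and suffixes repeats, which keeps df.to_dict(orient="records") from silently collapsing columns that share a key. A minimal sketch of the behavior (assumes the project's dependencies are installed so the module imports; the sample frame is made up):

    import pandas as pd

    from services.extraction_service import deduplicate_columns

    # Extracted PDF tables often repeat header labels, e.g. two "Value" columns
    df = pd.DataFrame([[1, 2, 3]], columns=["Name", "Value", "Value"])

    new_cols, renamed = deduplicate_columns(df.columns)
    print(new_cols)  # ['Name', 'Value', 'Value.1']
    print(renamed)   # True

    df.columns = new_cols
    print(df.to_dict(orient="records"))  # [{'Name': 1, 'Value': 2, 'Value.1': 3}]

The new extract_tables(file_objs) wrapper then just loops process_single_pdf over at most 15 uploads and concatenates the per-file status, download, gallery, and HTML outputs.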