Spaces:

vaibhavbalar
/

PDF_Extractor

Sleeping

App Files Files Community

vaibhavbalar committed on Apr 20, 2025

Commit

fa988d9

verified ·

1 Parent(s): 4ff95c3

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -27

app.py CHANGED Viewed

@@ -1,8 +1,14 @@
 import gradio as gr
 import PyPDF2
-def extract_pdf_lines(pdf_file):
-    # Extract all lines with page/line numbers for user reference
     with open(pdf_file.name, "rb") as f:
         reader = PyPDF2.PdfReader(f)
         result = []
@@ -11,12 +17,11 @@ def extract_pdf_lines(pdf_file):
             if page_text:
                 lines = page_text.splitlines()
                 for ln, line in enumerate(lines):
-                    # Prefix each with [Page X Line Y] for clarity
                     result.append(f"[Page {i+1} Line {ln+1}] {line}")
     return "\n".join(result) if result else "[NO TEXT FOUND]"
-def show_sample_context(raw_text, example):
-    """Show where sample value occurs for user feedback and future rule logic."""
     context_lines = []
     lines = raw_text.splitlines()
     example = example.strip()
@@ -30,28 +35,94 @@ def show_sample_context(raw_text, example):
         return "No match for example in extracted text."
     return "\n---\n".join(context_lines)
 with gr.Blocks() as demo:
-    gr.Markdown("# PDF Field Teach-Me Extractor (Hugging Face Space Example)")
-    pdf_in = gr.File(label="Upload PDF")
-    raw_text = gr.Textbox(
-        label="Raw extracted text (copy-paste any value for training)",
-        lines=20,
-        interactive=False,
-        show_copy_button=True
-    )
-    pdf_in.change(extract_pdf_lines, inputs=pdf_in, outputs=raw_text)
-    gr.Markdown("""
-    ### 2️⃣ Teach the System
-    - Copy a sample value you want to extract, and paste it below.
-    - Assign a field name (will become a column header later).
-    - Click "Show context" to see where this value appears (future step: use this to build automatic rules!).
-    """)
-    label = gr.Textbox(label="Field Name (e.g., 'Customer Name')", lines=1)
-    example = gr.Textbox(label="Sample Value (copy from above)", lines=1)
-    context_out = gr.Textbox(label="Value context in raw PDF text", lines=4)
-    search_btn = gr.Button("Show context")
-    search_btn.click(fn=show_sample_context, inputs=[raw_text, example], outputs=context_out)
 demo.launch()

 import gradio as gr
 import PyPDF2
+import pandas as pd
+import re
+import io
+def extract_with_lines(pdf_file):
+    """
+    Extract all PDF text, displaying page+line number prefix.
+    Returns raw text for training.
+    """
     with open(pdf_file.name, "rb") as f:
         reader = PyPDF2.PdfReader(f)
         result = []
             if page_text:
                 lines = page_text.splitlines()
                 for ln, line in enumerate(lines):
                     result.append(f"[Page {i+1} Line {ln+1}] {line}")
     return "\n".join(result) if result else "[NO TEXT FOUND]"
+def get_sample_context(raw_text, example):
+    """Show where the sample occurs, for user feedback (teaching phase)"""
     context_lines = []
     lines = raw_text.splitlines()
     example = example.strip()
         return "No match for example in extracted text."
     return "\n---\n".join(context_lines)
def guess_extraction_regex(sample_value, all_lines):
    """
    Build a simple extraction pattern from one taught sample value.

    The first line containing the sample decides the pattern:

    * ``Label: <sample>`` -> a pattern capturing whatever follows
      ``Label:`` on any line.
    * ``<prefix> <sample> ...`` -> a pattern capturing whatever follows
      ``<prefix>`` on any line.

    Lines may start with a ``[Page N Line M] `` marker. The marker is not
    treated as part of the label; the returned pattern accepts any (or no)
    marker, so it matches the same field on other pages and lines instead
    of only the sample's own line.

    Args:
        sample_value: Example value copied from the extracted text.
        all_lines: Iterable of text lines to search for the sample.

    Returns:
        A compiled, case-insensitive pattern whose group 1 is the value,
        meant to be applied with ``pattern.match(line)``; ``None`` when the
        sample is empty or no usable label/prefix precedes it.
    """
    # Optional "[Page N Line M] " marker at the start of a line.
    marker = r"(?:\[Page \d+ Line \d+\]\s*)?"

    sample = (sample_value or "").strip()
    if not sample:
        # An empty sample is "contained" in every line; nothing to learn.
        return None

    for line in all_lines:
        if sample not in line:
            continue

        # Case 1: the sample is exactly what follows "Some Label:".
        if ":" in line:
            prefix, suffix = line.split(":", 1)
            if sample == suffix.strip():
                label = re.sub("^" + marker, "", prefix.strip(), count=1).strip()
                return re.compile(
                    rf"{marker}{re.escape(label)}\s*:\s*(.+)", re.IGNORECASE
                )

        # Case 2: the sample follows some fixed leading text on the line.
        before = line.partition(sample)[0]
        label = re.sub("^" + marker, "", before.strip(), count=1).strip()
        if label:
            return re.compile(rf"{marker}{re.escape(label)}\s*(.+)", re.IGNORECASE)

    # No line gave a usable label; the caller decides on a fallback.
    return None
def extract_table_from_sample(raw_text, label, sample_value):
    """
    Extract every value that looks like the taught sample into a table.

    Args:
        raw_text: Extracted text, one entry per line (``None`` is treated
            as empty text).
        label: Field name; becomes the column header of the result.
        sample_value: Example value used to learn the extraction pattern.

    Returns:
        A ``pandas.DataFrame`` with one ``label`` column holding the matched
        values, or a single-row frame with an ``"Error"`` column when an
        input is missing or nothing matched.
    """
    # Strip first so whitespace-only input is rejected and stray spaces
    # from copy-paste do not prevent the sample from being found.
    label = (label or "").strip()
    sample_value = (sample_value or "").strip()
    if not label or not sample_value:
        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

    lines = (raw_text or "").splitlines()

    # Try to pattern match (e.g. "Customer Name: Ramesh Kumar").
    regex = guess_extraction_regex(sample_value, lines)
    if regex:
        matches = (regex.match(line) for line in lines)
        found = [{label: m.group(1).strip()} for m in matches if m]
    else:
        # Fallback: keep whole lines that contain the first five characters
        # of the sample.
        prefix = sample_value[:5]
        found = [{label: line.strip()} for line in lines if prefix in line]

    if not found:
        return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
    return pd.DataFrame(found)
def export_xlsx(df):
    """Write *df* to an in-memory .xlsx workbook and return the buffer.

    The frame is written without its index using the ``xlsxwriter`` engine.
    The returned ``io.BytesIO`` is rewound to position 0 so it can be read
    immediately.
    """
    stream = io.BytesIO()
    workbook = pd.ExcelWriter(stream, engine="xlsxwriter")
    # Leaving the context finalises the workbook into ``stream``.
    with workbook:
        df.to_excel(workbook, index=False)
    stream.seek(0)
    return stream
### Gradio Interface
# Flow: upload a PDF -> inspect the extracted text -> teach one sample value
# -> preview every similar value -> download the table as an Excel file.
with gr.Blocks() as demo:
    gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n**1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches → 4. Download as Excel**")
    # No explicit ``type``: the value "file" is rejected by newer Gradio
    # releases, while the default upload object still exposes ``.name``.
    file_in = gr.File(label="Upload your PDF", file_count="single")
    raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

    with gr.Row():
        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
        teach_search = gr.Button("Show Context")
    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
    teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)

    with gr.Row():
        extract_btn = gr.Button("Extract All Similar Values")
        results_table = gr.Dataframe(label="Extracted Results Table")
        download_btn = gr.Button("Download as Excel")
        xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

    def extract_and_preview(raw_text, teach_label, teach_sample):
        """Return the table of values extracted from the taught sample."""
        df = extract_table_from_sample(raw_text, teach_label, teach_sample)
        return df

    extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)

    def save_xlsx(df):
        """Write the results table to a temporary .xlsx file and return its path.

        The File output component serves files by path, so the in-memory
        workbook is persisted to disk rather than returned as a buffer.
        """
        import tempfile

        buf = export_xlsx(df)
        # delete=False: the file must outlive this call so it can be downloaded.
        with tempfile.NamedTemporaryFile(prefix="results_", suffix=".xlsx", delete=False) as tmp:
            tmp.write(buf.getvalue())
        return tmp.name

    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)

demo.launch()