Spaces:

vaibhavbalar
/

PDF_Extractor

Sleeping

App Files Files Community

vaibhavbalar committed on Apr 20, 2025

Commit

71c0901

verified ·

1 Parent(s): 38b049c

Create app.py

Browse files

Files changed (1) hide show

app.py +57 -0

app.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import gradio as gr
+import PyPDF2
+def extract_pdf_lines(pdf_file):
+    """Extract every text line of a PDF, tagged with its page and line number.
+
+    Args:
+        pdf_file: The value handed over by the upload component: an object
+            exposing the file's path as ``.name``, a plain path string, or
+            ``None`` when no file is present.
+
+    Returns:
+        One ``[Page X Line Y] text`` entry per extracted line (both numbers
+        are 1-based), joined with newlines; ``"[NO TEXT FOUND]"`` when no
+        page yields any text; or an empty string when ``pdf_file`` is None.
+    """
+    # The change event can deliver None (e.g. the upload was cleared);
+    # return an empty result instead of failing on ``None.name``.
+    if pdf_file is None:
+        return ""
+    # Accept both wrappers that carry the path in ``.name`` and bare paths.
+    pdf_path = getattr(pdf_file, "name", pdf_file)
+    result = []
+    with open(pdf_path, "rb") as f:
+        reader = PyPDF2.PdfReader(f)
+        for page_no, page in enumerate(reader.pages, start=1):
+            page_text = page.extract_text()
+            if not page_text:
+                # Pages with no extractable text contribute nothing.
+                continue
+            for line_no, line in enumerate(page_text.splitlines(), start=1):
+                # Prefix each with [Page X Line Y] for clarity
+                result.append(f"[Page {page_no} Line {line_no}] {line}")
+    return "\n".join(result) if result else "[NO TEXT FOUND]"
+def show_sample_context(raw_text, example):
+    """Find the sample value in the extracted text and show each hit in context.
+
+    Args:
+        raw_text: Newline-separated text to search.
+        example: Sample value to look for; surrounding whitespace is ignored.
+
+    Returns:
+        For every line containing the sample, a snippet with the previous
+        line, the matching line wrapped in ``>>> ... <<<``, and the next
+        line; snippets are separated by ``---``. If the sample is empty or
+        never occurs, a "no match" message is returned instead.
+    """
+    all_lines = raw_text.splitlines()
+    needle = example.strip()
+    final_index = len(all_lines) - 1
+    snippets = []
+    for idx, current in enumerate(all_lines):
+        # An empty needle never matches; otherwise require a substring hit.
+        if not needle or needle not in current:
+            continue
+        # Neighbours fall back to an empty string at either end of the text.
+        before = all_lines[idx - 1] if idx > 0 else ""
+        after = all_lines[idx + 1] if idx < final_index else ""
+        snippets.append(f"...\n{before}\n>>> {current} <<<\n{after}\n...")
+    if snippets:
+        return "\n---\n".join(snippets)
+    return "No match for example in extracted text."
+# Build the Gradio interface: a PDF upload feeding a read-only text box, then
+# a small form that looks up a sample value in that extracted text.
+with gr.Blocks() as demo:
+    gr.Markdown("# PDF Field Teach-Me Extractor (Hugging Face Space Example)")
+    pdf_in = gr.File(label="Upload PDF")
+    # Non-editable output box with a copy button, filled by extract_pdf_lines.
+    raw_text = gr.Textbox(
+        label="Raw extracted text (copy-paste any value for training)",
+        lines=20,
+        interactive=False,
+        show_copy_button=True
+    )
+    # Wire the file component's change event to extract_pdf_lines and send
+    # its return value to raw_text.
+    pdf_in.change(extract_pdf_lines, inputs=pdf_in, outputs=raw_text)
+    gr.Markdown("""
+    ### 2️⃣ Teach the System
+    - Copy a sample value you want to extract, and paste it below.
+    - Assign a field name (will become a column header later).
+    - Click "Show context" to see where this value appears (future step: use this to build automatic rules!).
+    """)
+    # NOTE(review): `label` is created here but is not passed to any handler
+    # in this block; only `raw_text` and `example` feed the search below.
+    label = gr.Textbox(label="Field Name (e.g., 'Customer Name')", lines=1)
+    example = gr.Textbox(label="Sample Value (copy from above)", lines=1)
+    context_out = gr.Textbox(label="Value context in raw PDF text", lines=4)
+    search_btn = gr.Button("Show context")
+    # Clicking the button runs show_sample_context on the extracted text and
+    # the sample value, writing the result to context_out.
+    search_btn.click(fn=show_sample_context, inputs=[raw_text, example], outputs=context_out)
+# Launch runs unconditionally at module level (there is no __main__ guard).
+demo.launch()