vaibhavbalar committed on
Commit
71c0901
·
verified ·
1 Parent(s): 38b049c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -0
app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+
4
def extract_pdf_lines(pdf_file):
    """Return the text of a PDF with every line tagged by page and line number.

    Args:
        pdf_file: The uploaded file. May be a filesystem path, an object that
            exposes the path as ``.name``, or ``None`` when no file is
            selected (e.g. the upload was cleared).

    Returns:
        One ``[Page X Line Y] <text>`` row per extracted line, joined with
        newlines (numbering is 1-based). ``"[NO TEXT FOUND]"`` when the PDF
        yields no text at all, and an empty string when ``pdf_file`` is
        ``None``.
    """
    # A cleared upload arrives as None; return an empty string instead of
    # failing on the attribute access below.
    if pdf_file is None:
        return ""

    # Accept both a plain path and a file-like wrapper carrying the path in
    # ``.name``.
    path = getattr(pdf_file, "name", pdf_file)

    result = []
    with open(path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Pages are read while the file handle is still open.
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page; skip it.
                continue
            for line_no, line in enumerate(page_text.splitlines(), start=1):
                # Prefix each line with [Page X Line Y] for user reference.
                result.append(f"[Page {page_no} Line {line_no}] {line}")

    return "\n".join(result) if result else "[NO TEXT FOUND]"
17
+
18
def show_sample_context(raw_text, example):
    """Show every place a sample value occurs in the extracted text.

    Args:
        raw_text: The extracted text, one entry per line. ``None`` is treated
            as empty text.
        example: The sample value to look for. Surrounding whitespace is
            ignored; ``None`` is treated as an empty value.

    Returns:
        For each line containing the sample, a snippet made of the previous
        line, the matching line wrapped in ``>>> ... <<<``, and the next line
        (missing neighbours are rendered as empty lines). Snippets are
        separated by ``---`` lines. When the sample is empty or never
        occurs, a fixed "no match" message is returned.
    """
    no_match = "No match for example in extracted text."

    needle = (example or "").strip()
    # An empty sample can never produce a useful match; decide once up front
    # rather than re-checking on every line.
    if not needle:
        return no_match

    lines = (raw_text or "").splitlines()
    last_index = len(lines) - 1
    snippets = []
    for i, line in enumerate(lines):
        if needle not in line:
            continue
        prev_line = lines[i - 1] if i > 0 else ""
        next_line = lines[i + 1] if i < last_index else ""
        snippets.append(f"...\n{prev_line}\n>>> {line} <<<\n{next_line}\n...")

    if not snippets:
        return no_match
    return "\n---\n".join(snippets)
32
+
33
# Build the UI: upload a PDF, view its numbered text, then look up where a
# pasted sample value occurs in that text.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Field Teach-Me Extractor (Hugging Face Space Example)")
    pdf_in = gr.File(label="Upload PDF")
    raw_text = gr.Textbox(
        label="Raw extracted text (copy-paste any value for training)",
        lines=20,
        interactive=False,
        show_copy_button=True
    )
    # Re-run the extraction whenever the uploaded file changes.
    pdf_in.change(extract_pdf_lines, inputs=pdf_in, outputs=raw_text)

    # The Markdown body is kept flush-left so the text handed to the component
    # carries no leading indentation.
    gr.Markdown("""
### 2️⃣ Teach the System
- Copy a sample value you want to extract, and paste it below.
- Assign a field name (will become a column header later).
- Click "Show context" to see where this value appears (future step: use this to build automatic rules!).
""")
    # NOTE: the field-name box is displayed but not passed to any callback in
    # this block.
    label = gr.Textbox(label="Field Name (e.g., 'Customer Name')", lines=1)
    example = gr.Textbox(label="Sample Value (copy from above)", lines=1)
    context_out = gr.Textbox(label="Value context in raw PDF text", lines=4)
    search_btn = gr.Button("Show context")

    # Look the sample value up in the extracted text and show the snippets.
    search_btn.click(fn=show_sample_context, inputs=[raw_text, example], outputs=context_out)

demo.launch()