vaibhavbalar committed on
Commit
fa988d9
·
verified ·
1 Parent(s): 4ff95c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -27
app.py CHANGED
@@ -1,8 +1,14 @@
1
  import gradio as gr
2
  import PyPDF2
 
 
 
3
 
4
- def extract_pdf_lines(pdf_file):
5
- # Extract all lines with page/line numbers for user reference
 
 
 
6
  with open(pdf_file.name, "rb") as f:
7
  reader = PyPDF2.PdfReader(f)
8
  result = []
@@ -11,12 +17,11 @@ def extract_pdf_lines(pdf_file):
11
  if page_text:
12
  lines = page_text.splitlines()
13
  for ln, line in enumerate(lines):
14
- # Prefix each with [Page X Line Y] for clarity
15
  result.append(f"[Page {i+1} Line {ln+1}] {line}")
16
  return "\n".join(result) if result else "[NO TEXT FOUND]"
17
 
18
- def show_sample_context(raw_text, example):
19
- """Show where sample value occurs for user feedback and future rule logic."""
20
  context_lines = []
21
  lines = raw_text.splitlines()
22
  example = example.strip()
@@ -30,28 +35,94 @@ def show_sample_context(raw_text, example):
30
  return "No match for example in extracted text."
31
  return "\n---\n".join(context_lines)
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  with gr.Blocks() as demo:
34
- gr.Markdown("# PDF Field Teach-Me Extractor (Hugging Face Space Example)")
35
- pdf_in = gr.File(label="Upload PDF")
36
- raw_text = gr.Textbox(
37
- label="Raw extracted text (copy-paste any value for training)",
38
- lines=20,
39
- interactive=False,
40
- show_copy_button=True
41
- )
42
- pdf_in.change(extract_pdf_lines, inputs=pdf_in, outputs=raw_text)
43
-
44
- gr.Markdown("""
45
- ### 2️⃣ Teach the System
46
- - Copy a sample value you want to extract, and paste it below.
47
- - Assign a field name (will become a column header later).
48
- - Click "Show context" to see where this value appears (future step: use this to build automatic rules!).
49
- """)
50
- label = gr.Textbox(label="Field Name (e.g., 'Customer Name')", lines=1)
51
- example = gr.Textbox(label="Sample Value (copy from above)", lines=1)
52
- context_out = gr.Textbox(label="Value context in raw PDF text", lines=4)
53
- search_btn = gr.Button("Show context")
54
-
55
- search_btn.click(fn=show_sample_context, inputs=[raw_text, example], outputs=context_out)
 
 
 
 
 
 
 
 
56
 
57
  demo.launch()
 
1
  import gradio as gr
2
  import PyPDF2
3
+ import pandas as pd
4
+ import re
5
+ import io
6
 
7
+ def extract_with_lines(pdf_file):
8
+ """
9
+ Extract all PDF text, displaying page+line number prefix.
10
+ Returns raw text for training.
11
+ """
12
  with open(pdf_file.name, "rb") as f:
13
  reader = PyPDF2.PdfReader(f)
14
  result = []
 
17
  if page_text:
18
  lines = page_text.splitlines()
19
  for ln, line in enumerate(lines):
 
20
  result.append(f"[Page {i+1} Line {ln+1}] {line}")
21
  return "\n".join(result) if result else "[NO TEXT FOUND]"
22
 
23
+ def get_sample_context(raw_text, example):
24
+ """Show where the sample occurs, for user feedback (teaching phase)"""
25
  context_lines = []
26
  lines = raw_text.splitlines()
27
  example = example.strip()
 
35
  return "No match for example in extracted text."
36
  return "\n---\n".join(context_lines)
37
 
38
def guess_extraction_regex(sample_value, all_lines):
    """
    Build a simple extraction pattern from one example value.

    Scans ``all_lines`` for the first line containing ``sample_value`` from
    which a label can be derived, and returns a compiled, case-insensitive
    regex whose group 1 captures the value on any line that starts with the
    same label.

    Two line shapes are tried, in this order, for each candidate line:

    1. ``Label: value`` -- the sample is everything after the first colon.
    2. ``Label value``  -- the sample is preceded by some other text.

    A leading ``[Page N Line M]`` tag is ignored when deriving the label and
    is optional in the returned pattern. Without this the tag would be baked
    into the pattern and it could only ever match the sample's own line.

    Args:
        sample_value: Example value copied from the extracted text.
        all_lines: Iterable of text lines to search.

    Returns:
        A compiled pattern (use ``pattern.match(line).group(1)``), or ``None``
        when the sample is empty or no usable occurrence is found.
    """
    sample = (sample_value or "").strip()
    if not sample:
        # An empty string is "contained" in every line; refuse to guess.
        return None

    # Raw strings throughout: "\s" inside a normal (f-)string is an invalid
    # escape sequence and triggers a warning on modern Python.
    line_tag = r"\[Page \d+ Line \d+\]\s*"
    tag_re = re.compile(line_tag)
    optional_tag = "(?:" + line_tag + ")?"

    for line in all_lines:
        if sample not in line:
            continue

        # Drop the page/line tag so it never becomes part of the label.
        tag_match = tag_re.match(line)
        body = line[tag_match.end():] if tag_match else line
        if sample not in body:
            # The sample only matched text inside the tag itself.
            continue

        # Shape 1: "Some Label: <sample>"
        label, colon, value = body.partition(":")
        if colon and value.strip() == sample:
            return re.compile(
                optional_tag + re.escape(label.strip()) + r"\s*:\s*(.+)",
                re.IGNORECASE,
            )

        # Shape 2: "<some prefix> <sample> ..."
        before = body.partition(sample)[0].strip()
        if before:
            return re.compile(
                optional_tag + re.escape(before) + r"\s*(.+)",
                re.IGNORECASE,
            )

    # No labelled occurrence found; the caller decides on a fallback.
    return None
58
+
59
def extract_table_from_sample(raw_text, label, sample_value):
    """
    Extract every value that looks like ``sample_value`` from ``raw_text``.

    A pattern is derived from the sample via ``guess_extraction_regex``; each
    line that matches contributes one row whose single column is named
    ``label``. When no pattern can be derived, the function falls back to
    returning every line that contains the first five characters of the
    sample.

    Args:
        raw_text: Extracted text, one record per line. ``None`` is treated
            as empty text.
        label: Column name for the extracted values.
        sample_value: One example value copied from ``raw_text``.

    Returns:
        A ``pandas.DataFrame`` with one ``label`` column, or a one-row
        DataFrame with an ``"Error"`` column when inputs are missing or
        nothing matched.
    """
    # Normalise first so whitespace-only or None inputs count as missing.
    label = (label or "").strip()
    sample_value = (sample_value or "").strip()
    if not label or not sample_value:
        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

    lines = (raw_text or "").splitlines()

    # Try to pattern match (e.g. "Customer Name: Ramesh Kumar")
    regex = guess_extraction_regex(sample_value, lines)

    if regex:
        matches = (regex.match(line) for line in lines)
        found = [{label: m.group(1).strip()} for m in matches if m]
    else:
        # Fallback: keep whole lines that share the sample's leading characters.
        prefix = sample_value[:5]
        found = [{label: line.strip()} for line in lines if prefix in line]

    if not found:
        return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
    return pd.DataFrame(found)
85
+
86
def export_xlsx(df):
    """
    Serialise a pandas DataFrame into an in-memory ``.xlsx`` workbook.

    The frame is written without its index using the ``xlsxwriter`` engine.

    Args:
        df: The DataFrame to export.

    Returns:
        An ``io.BytesIO`` holding the workbook, rewound to position 0 so the
        caller can read it from the start.
    """
    workbook_bytes = io.BytesIO()
    excel_writer = pd.ExcelWriter(workbook_bytes, engine="xlsxwriter")
    with excel_writer:
        df.to_excel(excel_writer, index=False)
    # Rewind only after the writer has been closed by the `with` block.
    workbook_bytes.seek(0)
    return workbook_bytes
93
+
94
+ ### Gradio Interface
95
+
96
with gr.Blocks() as demo:
    gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n**1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches 4. Download as Excel**")

    # Step 1: upload a PDF and show its text.
    # NOTE(review): extract_with_lines opens `pdf_file.name`, so the upload
    # must arrive as an object exposing `.name` (Gradio 3.x `type="file"`).
    # Confirm against the pinned Gradio version before changing this.
    file_in = gr.File(label="Upload your PDF", file_count="single", type="file")
    raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)

    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

    # Step 2: teach a field by example and show where the example occurs.
    with gr.Row():
        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
    teach_search = gr.Button("Show Context")
    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)

    teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)

    # Steps 3 and 4: extract all similar values, preview them, offer a download.
    with gr.Row():
        extract_btn = gr.Button("Extract All Similar Values")
    results_table = gr.Dataframe(label="Extracted Results Table")
    download_btn = gr.Button("Download as Excel")
    xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

    def extract_and_preview(raw_text, teach_label, teach_sample):
        """Return the table of values extracted from the taught sample."""
        df = extract_table_from_sample(raw_text, teach_label, teach_sample)
        return df
    extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)

    def save_xlsx(df):
        """Write the results table to a temporary .xlsx file and return its path."""
        # A gr.File output expects a filesystem path, not a (name, buffer)
        # tuple, so persist the in-memory workbook before returning it.
        import os
        import tempfile

        xlsx_path = os.path.join(tempfile.mkdtemp(), "results.xlsx")
        with open(xlsx_path, "wb") as fh:
            fh.write(export_xlsx(df).getvalue())
        return xlsx_path
    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)

demo.launch()