Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import PyPDF2 | |
| import pandas as pd | |
| import re | |
| import io | |
def extract_with_lines(pdf_path):
    """
    Extract all PDF text, tagging every line with its page and line number.

    Args:
        pdf_path: Filesystem path of the PDF to read. May be ``None`` or
            empty, which is what the upload widget hands over when the
            file is cleared.

    Returns:
        One string in which each extracted line is prefixed with
        ``[Page P Line L]`` (both 1-based), joined by newlines.
        ``""`` when no path was supplied, and ``"[NO TEXT FOUND]"`` when
        the PDF was read but no page produced any text.
    """
    # A cleared upload fires the change event without a path; blank the
    # preview instead of crashing in open().
    if not pdf_path:
        return ""

    tagged = []
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Pages without a text layer contribute nothing.
                continue
            tagged.extend(
                f"[Page {page_no} Line {line_no}] {line}"
                for line_no, line in enumerate(page_text.splitlines(), start=1)
            )
    return "\n".join(tagged) if tagged else "[NO TEXT FOUND]"
def get_sample_context(raw_text, example):
    """Show each line containing the example, framed by its neighbours.

    The example is stripped before searching. Every line of ``raw_text``
    that contains it yields one snippet made of the previous line, the
    matching line wrapped in ``>>> ... <<<`` and the next line; a missing
    neighbour is rendered as an empty line. Snippets are joined with a
    ``---`` separator line. If nothing matches (or the example is blank),
    a fixed "no match" message is returned instead.
    """
    rows = raw_text.splitlines()
    needle = example.strip()
    no_match = "No match for example in extracted text."

    # A blank example can never match, so there is no need to scan.
    if not needle:
        return no_match

    snippets = []
    for idx, row in enumerate(rows):
        if needle not in row:
            continue
        before = rows[idx - 1] if idx > 0 else ""
        after = rows[idx + 1] if idx + 1 < len(rows) else ""
        snippets.append(f"...\n{before}\n>>> {row} <<<\n{after}\n...")

    if snippets:
        return "\n---\n".join(snippets)
    return no_match
# Shape of the "[Page P Line L] " tag that extract_with_lines() puts in
# front of every extracted line.
_PAGE_LINE_MARKER = r"\[Page \d+ Line \d+\]\s*"


def guess_extraction_regex(sample_value, all_lines):
    """
    Build a simple extraction pattern from one sample value.

    The first line containing the sample decides the pattern:

    * if the line reads ``<label>: <sample>``, the pattern matches any
      line of the form ``<label>: <value>``;
    * otherwise, the text in front of the sample is used as a fixed
      prefix and everything after it is captured.

    The page/line tag at the start of a line is never treated as part of
    the learned prefix (it differs on every line, so a pattern containing
    it could only match the sample's own line). Instead the returned
    pattern accepts an optional tag of any page/line number, so it works
    with ``pattern.match(line)`` on tagged and untagged lines alike.

    Args:
        sample_value: Example value copied from the extracted text.
            Surrounding whitespace is ignored.
        all_lines: Iterable of text lines to search for the sample.

    Returns:
        A compiled, case-insensitive pattern whose group 1 is the
        extracted value, or ``None`` if the sample is blank, is not
        found, or has no text in front of it to anchor on.
    """
    sample = sample_value.strip() if sample_value else ""
    if not sample:
        # "" is contained in every line; there is nothing to learn from it.
        return None

    optional_marker = f"(?:{_PAGE_LINE_MARKER})?"
    for line in all_lines:
        tag = re.match(_PAGE_LINE_MARKER, line)
        body = line[tag.end():] if tag else line
        if sample not in body:
            continue

        # Case 1: "<label>: <sample>" with the sample as the whole value.
        label, colon, value = body.partition(":")
        if colon and value.strip() == sample:
            return re.compile(
                rf"{optional_marker}{re.escape(label.strip())}\s*:\s*(.+)",
                re.IGNORECASE,
            )

        # Case 2: anchor on whatever precedes the sample on this line.
        head = body.split(sample, 1)[0].strip()
        if head:
            return re.compile(
                rf"{optional_marker}{re.escape(head)}\s*(.+)",
                re.IGNORECASE,
            )
    return None
def extract_table_from_sample(raw_text, label, sample_value):
    """
    Extract every value that resembles the taught sample into a table.

    Args:
        raw_text: The extracted text to search, one record per line.
            ``None`` or empty text is treated as "nothing to search".
        label: Column name for the extracted values.
        sample_value: Example value used to derive the matching rule.

    Returns:
        A ``pandas.DataFrame`` with a single column named ``label``
        holding one row per matching line. If the label or sample is
        missing/blank, or nothing matches, a one-row DataFrame with an
        ``"Error"`` column describing the problem is returned instead.
    """
    # Normalise first so that None and whitespace-only input are rejected
    # by the same check as an empty string.
    label = (label or "").strip()
    sample_value = (sample_value or "").strip()
    if not label or not sample_value:
        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

    no_match = pd.DataFrame(
        [{"Error": f"No matches found for sample: {sample_value}"}]
    )
    lines = (raw_text or "").splitlines()
    if not lines:
        return no_match

    regex = guess_extraction_regex(sample_value, lines)
    if regex:
        hits = (regex.match(line) for line in lines)
        found = [{label: hit.group(1).strip()} for hit in hits if hit]
    else:
        # No pattern could be derived: fall back to keeping every line
        # that contains the first few characters of the sample.
        prefix = sample_value[:5]
        found = [{label: line.strip()} for line in lines if prefix in line]

    return pd.DataFrame(found) if found else no_match
def export_xlsx(df):
    """Serialise ``df`` to an in-memory Excel workbook.

    The frame is written without its index using the ``xlsxwriter``
    engine. The returned ``io.BytesIO`` is rewound to position 0 so it
    can be read from the start.
    """
    stream = io.BytesIO()
    workbook = pd.ExcelWriter(stream, engine="xlsxwriter")
    # Leaving the context finalises the workbook into the stream.
    with workbook:
        df.to_excel(workbook, index=False)
    stream.seek(0)
    return stream
# UI wiring: upload a PDF, teach one sample value, preview every similar
# value that was extracted, then download the table as an Excel file.
with gr.Blocks() as demo:
    # NOTE(review): the heading arrived mis-decoded in this file; the emoji
    # and the arrows between steps are restored as the most likely original
    # text and written as escapes so they survive re-encoding. Confirm.
    gr.Markdown(
        "# \U0001F9D1\u200D\U0001F3EB PDF Teach-&-Extract System\n"
        "**1. Upload PDF \u2192 2. Teach a sample field \u2192 "
        "3. Preview all auto-extracted matches \u2192 4. Download as Excel**"
    )

    # Step 1: upload, and show the tagged raw text for copy/paste.
    file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
    raw_text = gr.Textbox(
        label="Raw extracted PDF text (preview/copy here)",
        lines=18,
        show_copy_button=True,
    )
    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

    # Step 2: teach a field name and one example value.
    # NOTE(review): the original indentation was lost, so which widgets sit
    # inside each gr.Row() is a best guess. Confirm the intended layout.
    with gr.Row():
        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
    teach_search = gr.Button("Show Context")
    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
    teach_search.click(
        get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out
    )

    # Step 3: extract and preview every similar value.
    with gr.Row():
        extract_btn = gr.Button("Extract All Similar Values")
    results_table = gr.Dataframe(label="Extracted Results Table")

    # Step 4: download.
    download_btn = gr.Button("Download as Excel")
    xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

    def extract_and_preview(raw_text, teach_label, teach_sample):
        """Return the table of values extracted for the taught sample."""
        return extract_table_from_sample(raw_text, teach_label, teach_sample)

    extract_btn.click(
        extract_and_preview,
        inputs=[raw_text, teach_label, teach_sample],
        outputs=results_table,
    )

    def save_xlsx(df):
        """Write the results table to ``results.xlsx`` and return its path.

        The file output is given a path on disk, so the in-memory workbook
        from ``export_xlsx`` is saved into a fresh temporary directory
        (one per download, which keeps the file name stable).
        """
        import os
        import tempfile

        workbook = export_xlsx(df)
        out_path = os.path.join(tempfile.mkdtemp(), "results.xlsx")
        with open(out_path, "wb") as out_file:
            out_file.write(workbook.getvalue())
        return out_path

    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)

demo.launch()