Spaces:

vaibhavbalar
/

PDF_Extractor

Sleeping

App Files Files Community

PDF_Extractor / app.py

vaibhavbalar

Update app.py

cbbc2a6 verified 10 months ago

raw

history blame contribute delete

4.78 kB

	import gradio as gr
	import PyPDF2
	import pandas as pd
	import re
	import io

	def extract_with_lines(pdf_path):
	    """Extract all text from a PDF, tagging each line with its position.

	    Every extracted line is prefixed with ``[Page P Line L]``, where both
	    numbers are 1-based and the line counter restarts on every page.

	    Args:
	        pdf_path: Filesystem path of the PDF to read. A falsy value
	            (``None`` or ``""``) returns ``""`` instead of raising, so the
	            function can be called when no file is selected.

	    Returns:
	        The tagged lines joined with newlines, ``"[NO TEXT FOUND]"`` when
	        no page yields any text, or ``""`` when no path was given.
	    """
	    if not pdf_path:
	        return ""

	    result = []
	    with open(pdf_path, "rb") as f:
	        reader = PyPDF2.PdfReader(f)
	        for page_no, page in enumerate(reader.pages, start=1):
	            page_text = page.extract_text()
	            if not page_text:
	                # Pages without extractable text contribute nothing.
	                continue
	            result.extend(
	                f"[Page {page_no} Line {line_no}] {line}"
	                for line_no, line in enumerate(page_text.splitlines(), start=1)
	            )
	    return "\n".join(result) if result else "[NO TEXT FOUND]"

	def get_sample_context(raw_text, example):
	    """Show every place the example occurs, with one line of context each side.

	    Args:
	        raw_text: The extracted text to search; ``None`` is treated as empty.
	        example: The value to look for. Surrounding whitespace is ignored;
	            ``None`` or a blank value never matches.

	    Returns:
	        One snippet per matching line (previous line, the match wrapped in
	        ``>>> ... <<<``, next line), snippets separated by ``---``; or a
	        fixed "no match" message when nothing is found.
	    """
	    no_match = "No match for example in extracted text."
	    needle = (example or "").strip()
	    if not needle:
	        # An empty needle is contained in every string; treat it as no match.
	        return no_match

	    lines = (raw_text or "").splitlines()
	    last = len(lines) - 1
	    snippets = []
	    for idx, line in enumerate(lines):
	        if needle not in line:
	            continue
	        before = lines[idx - 1] if idx > 0 else ""
	        after = lines[idx + 1] if idx < last else ""
	        snippets.append(f"...\n{before}\n>>> {line} <<<\n{after}\n...")

	    if not snippets:
	        return no_match
	    return "\n---\n".join(snippets)

	def guess_extraction_regex(sample_value, all_lines):
	    """Derive a line-matching pattern from a single example value.

	    The first line containing the sample decides the pattern:

	    * ``<label> : <sample>`` (the sample is everything after the first
	      colon) gives a pattern for ``<label>``, a colon with any surrounding
	      whitespace, then the captured value.
	    * Otherwise the text before the sample becomes the label and the
	      pattern captures whatever follows it. Lines where the sample has no
	      preceding text are skipped.

	    A leading ``[Page P Line L]`` marker is not part of the label. The
	    returned pattern accepts it optionally, so it works with ``.match()``
	    on both tagged and untagged lines. Group 1 is the extracted value.

	    Args:
	        sample_value: Example value; surrounding whitespace is ignored.
	        all_lines: Iterable of text lines to search.

	    Returns:
	        A compiled, case-insensitive pattern, or ``None`` when the sample
	        is blank or no usable line contains it.
	    """
	    sample = (sample_value or "").strip()
	    if not sample:
	        return None

	    # Keeping the per-line position marker out of the label is what lets
	    # the pattern match lines other than the one it was learned from.
	    marker = r"\[Page \d+ Line \d+\]\s?"
	    leading_marker = re.compile("^" + marker)
	    optional_marker = rf"(?:{marker})?\s*"

	    for line in all_lines:
	        body = leading_marker.sub("", line)
	        if sample not in body:
	            continue

	        if ":" in body:
	            label, _, value = body.partition(":")
	            if value.strip() == sample:
	                escaped = re.escape(label.strip())
	                return re.compile(
	                    rf"{optional_marker}{escaped}\s*:\s*(.+)", re.IGNORECASE
	                )

	        # No clean "label: value" shape; use the text before the sample.
	        label = body.partition(sample)[0].strip()
	        if label:
	            escaped = re.escape(label)
	            return re.compile(rf"{optional_marker}{escaped}\s*(.+)", re.IGNORECASE)

	    return None

	def extract_table_from_sample(raw_text, label, sample_value):
	    """Build a one-column table of values resembling the taught sample.

	    Args:
	        raw_text: The extracted text to scan; ``None`` is treated as empty.
	        label: Column name for the results. Surrounding whitespace is
	            removed; blank or ``None`` is rejected.
	        sample_value: Example value used to learn the pattern. Surrounding
	            whitespace is removed; blank or ``None`` is rejected.

	    Returns:
	        A DataFrame with one ``label`` column holding every extracted
	        value, or a single-row DataFrame with an ``"Error"`` column when
	        input is missing or nothing matched.
	    """
	    # Validate before touching raw_text so missing input yields the error
	    # table rather than an exception.
	    label = (label or "").strip()
	    sample_value = (sample_value or "").strip()
	    if not label or not sample_value:
	        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

	    lines = (raw_text or "").splitlines()
	    regex = guess_extraction_regex(sample_value, lines)
	    if regex:
	        hits = (regex.match(line) for line in lines)
	        found = [{label: hit.group(1).strip()} for hit in hits if hit]
	    else:
	        # No pattern could be learned: fall back to listing every line that
	        # contains the first five characters of the sample.
	        prefix = sample_value[:5]
	        found = [{label: line.strip()} for line in lines if prefix in line]

	    if not found:
	        return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
	    return pd.DataFrame(found)

	def export_xlsx(df):
	    """Serialize a DataFrame to an in-memory Excel workbook.

	    The sheet is written with the ``xlsxwriter`` engine and without the
	    index column.

	    Args:
	        df: The DataFrame to export.

	    Returns:
	        An ``io.BytesIO`` holding the workbook, rewound to position 0.
	    """
	    stream = io.BytesIO()
	    workbook = pd.ExcelWriter(stream, engine="xlsxwriter")
	    # Leaving the context finalizes the workbook into the stream.
	    with workbook:
	        df.to_excel(workbook, index=False)
	    stream.seek(0)
	    return stream

	# UI wiring: upload a PDF, teach one sample value, preview the matches,
	# then download them as an Excel file.
	# NOTE(review): the nesting of this block was flattened in the copy it was
	# recovered from; which components sit inside each gr.Row() is a
	# reconstruction - confirm against the intended layout.
	with gr.Blocks() as demo:
	    gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches → 4. Download as Excel")

	    # Step 1: upload; the tagged text lands in a copyable preview box.
	    file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
	    raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
	    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

	    # Step 2: name the field and paste one example value.
	    with gr.Row():
	        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
	        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
	        teach_search = gr.Button("Show Context")
	    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
	    teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)

	    # Steps 3-4: extract all similar values and offer them for download.
	    with gr.Row():
	        extract_btn = gr.Button("Extract All Similar Values")
	    results_table = gr.Dataframe(label="Extracted Results Table")
	    download_btn = gr.Button("Download as Excel")
	    xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

	    def extract_and_preview(raw_text, teach_label, teach_sample):
	        """Return the table of values extracted from the taught sample."""
	        return extract_table_from_sample(raw_text, teach_label, teach_sample)

	    extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)

	    def save_xlsx(df):
	        """Write the results table to ``results.xlsx`` and return its path.

	        The file output is handed a path on disk rather than a
	        ``(name, buffer)`` tuple. A fresh temporary directory keeps the
	        download name fixed without clashing between requests; the
	        directory is not cleaned up here.
	        """
	        import os
	        import tempfile

	        out_path = os.path.join(tempfile.mkdtemp(), "results.xlsx")
	        with open(out_path, "wb") as fh:
	            fh.write(export_xlsx(df).getvalue())
	        return out_path

	    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)

	demo.launch()