Spaces:

Ayesha-Majeed
/

Cvs_text_Extraction

Sleeping

App Files Files Community

Cvs_text_Extraction / app.py

Ayesha-Majeed

Update app.py

c03d9b5 verified 8 months ago

raw

history blame contribute delete

7.47 kB

	import gradio as gr
	import pytesseract
	import cv2
	import pandas as pd
	import re
	from PIL import Image
	import numpy as np

	_NOT_FOUND = "Not found"
	# Number of values extract_fields returns; the UI wires exactly this many outputs.
	_FIELD_COUNT = 8


	def _ocr_words(img):
	    """Run Tesseract word-level OCR on *img* and return the non-blank rows.

	    The ``text`` column is cast to ``str`` so that the ``.str`` accessor and
	    ``" ".join`` keep working even if the column was not parsed as strings
	    (presumably possible when every token is numeric -- TODO confirm).
	    """
	    words = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
	    words = words.dropna(subset=["text"]).copy()
	    words["text"] = words["text"].astype(str)
	    return words[words["text"].str.strip() != ""]


	def _clean_name(words):
	    """Join *words* after stripping non-letter characters from both ends of each.

	    ``[\\W\\d_]`` is "anything that is not a letter" and is Unicode-aware, so
	    accented letters at the edge of a word are preserved.
	    """
	    stripped = (re.sub(r'^[\W\d_]+|[\W\d_]+$', '', w) for w in words)
	    return ' '.join(w for w in stripped if w)


	def _name_after_title(words):
	    """Return up to two words to the right of the first "tit..." token.

	    The lookup is restricted to the OCR line (same ``block_num`` and
	    ``line_num``) that holds the first token containing "tit". Returns
	    "Not found" when there is no such token or nothing usable follows it.
	    """
	    hits = words[words['text'].str.lower().str.contains("tit", regex=False)]
	    if hits.empty:
	        return _NOT_FOUND

	    anchor = hits.iloc[0]
	    if 'line_num' not in anchor or 'block_num' not in anchor:
	        return _NOT_FOUND

	    on_same_line = (
	        (words['line_num'] == anchor['line_num'])
	        & (words['block_num'] == anchor['block_num'])
	    )
	    same_line = words[on_same_line].sort_values(by='left').reset_index(drop=True)

	    tit_rows = same_line.index[
	        same_line['text'].str.lower().str.contains("tit", regex=False)
	    ]
	    if tit_rows.empty:
	        return _NOT_FOUND

	    # Take at most the two tokens immediately right of the label.
	    start = tit_rows[0] + 1
	    neighbors = same_line['text'].iloc[start:start + 2].tolist()
	    return _clean_name(neighbors) or _NOT_FOUND


	def _value_right_of(words, keyword, max_dist=200):
	    """Return the first token right of *keyword* on the same line, else None.

	    *keyword* is matched case-insensitively against whole tokens. A candidate
	    must share the keyword's ``line_num`` and start less than *max_dist*
	    pixels to its right.
	    """
	    match = words[words['text'].str.lower() == keyword.lower()]
	    if match.empty:
	        return None

	    row = match.iloc[0]
	    if 'line_num' not in row or 'left' not in row:
	        return None

	    # NOTE(review): only line_num is compared, so tokens from another OCR
	    # block that happen to share the number also qualify -- confirm intended.
	    x = row['left']
	    candidates = words[
	        (words['line_num'] == row['line_num'])
	        & (words['left'] > x)
	        & (words['left'] < x + max_dist)
	    ].sort_values('left')
	    return candidates['text'].iloc[0] if not candidates.empty else None


	def _find_postcode(raw_text):
	    """Return a four-digit postcode found in *raw_text*, or "Not found".

	    A four-digit group following a standalone "CH" on the same line wins;
	    otherwise the first standalone four-digit token anywhere is used.
	    """
	    for line in raw_text.splitlines():
	        # (?<!\d) keeps the match from being the tail of a longer number;
	        # (?![\d/]) rejects longer numbers and slash-separated values.
	        after_ch = re.search(r'\bCH\b.*?(?<!\d)(\d{4})(?![\d/])', line)
	        if after_ch:
	            return after_ch.group(1)

	    standalone = re.search(r'(?<!\w)[0-9]{4}(?!\w)', raw_text)
	    return standalone.group(0) if standalone else _NOT_FOUND


	def _functions_block(words, min_offset=10, max_offset=120):
	    """Return the cleaned text lines found just below the "function" label.

	    Tokens whose ``top`` lies strictly between *min_offset* and *max_offset*
	    pixels below the first token containing "function" are grouped by
	    ``line_num`` and joined left to right. Punctuation is removed and lines
	    of one character or less are dropped.
	    """
	    hits = words[words['text'].str.lower().str.contains("function", regex=False)]
	    if hits.empty:
	        return []

	    base_y = hits.iloc[0]['top']
	    below = words[
	        (words['top'] > base_y + min_offset) & (words['top'] < base_y + max_offset)
	    ]

	    # Sort by line_num and left so each joined line reads in order.
	    # NOTE(review): grouping uses line_num alone, as above.
	    below = below.sort_values(by=["line_num", "left"])
	    lines = below.groupby('line_num')['text'].apply(' '.join).tolist()

	    # Keep letters (including accented ones), digits and whitespace only.
	    cleaned = (re.sub(r'[^\w\s]|_', '', line).strip() for line in lines)
	    return [line for line in cleaned if len(line) > 1]


	def extract_fields(image):
	    """Extract structured fields from a document image via OCR.

	    Args:
	        image: A PIL image, or None when nothing was uploaded.

	    Returns:
	        A list of eight strings, in this order: name, email, phone, date of
	        birth, postcode, the value next to "CurBase", the value next to
	        "hourly" (or "rate"), and the newline-joined function lines. A field
	        that cannot be located is "Not found". On failure the first element
	        is "Error: <message>" and the remaining seven are "Not found", so
	        the list length always matches the number of output components.
	    """
	    if image is None:
	        return ["Error: no image provided"] + [_NOT_FOUND] * (_FIELD_COUNT - 1)

	    try:
	        # -------------------- Image Preparation --------------------
	        bgr = np.array(image.convert("RGB"))[:, :, ::-1]  # PIL RGB -> OpenCV BGR
	        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
	        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	                                   cv2.THRESH_BINARY_INV, 25, 15)
	        pil_img = Image.fromarray(cv2.bitwise_not(bw))

	        # -------------------- OCR Pass 1: Name by "Title" --------------------
	        # The name lookup runs on the thresholded image.
	        name = _name_after_title(_ocr_words(pil_img))

	        # -------------------- OCR Pass 2: For Other Fields --------------------
	        # Every other field is read from the unprocessed image.
	        words = _ocr_words(image)
	        text = " ".join(words['text'])
	        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
	        phone_match = re.search(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text)

	        raw_text = pytesseract.image_to_string(image, config='--psm 6')
	        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)

	        functions = _functions_block(words)

	        # -------------------- Final Output --------------------
	        return [
	            name if name else _NOT_FOUND,
	            email_match.group(0) if email_match else _NOT_FOUND,
	            phone_match.group(0) if phone_match else _NOT_FOUND,
	            dob_match.group(0) if dob_match else _NOT_FOUND,
	            _find_postcode(raw_text),
	            _value_right_of(words, "CurBase") or _NOT_FOUND,
	            _value_right_of(words, "hourly")
	            or _value_right_of(words, "rate")
	            or _NOT_FOUND,
	            "\n".join(functions) if functions else _NOT_FOUND,
	        ]

	    except Exception as e:
	        # UI boundary: report the failure in the first field instead of raising.
	        return [f"Error: {e}"] + [_NOT_FOUND] * (_FIELD_COUNT - 1)


	# -------------------- Gradio Interface --------------------

	# Two-column page: uploader, trigger button and example on the left,
	# one read-only text box per extracted field on the right.
	with gr.Blocks() as demo:
	    gr.Markdown("## 📄 Image OCR Field Extractor")
	    gr.Markdown("Upload a document image to extract structured data fields.")

	    with gr.Row():
	        with gr.Column():
	            image_input = gr.Image(type="pil", label=" Upload Your Document")
	            submit_btn = gr.Button(" Run Extraction")

	            # Example entry that fills the uploader above.
	            gr.Examples(
	                examples=["example_doc.jpeg"],
	                inputs=[image_input],
	                label=" Example Image (Click to load into uploader)",
	            )

	        with gr.Column():
	            name = gr.Text(label="Name")
	            email = gr.Text(label="Email")
	            phone = gr.Text(label="Phone")
	            dob = gr.Text(label="DOB")
	            postcode = gr.Text(label="Postcode")
	            prem = gr.Text(label="Prem (CurBase)")
	            rate = gr.Text(label="Temp (Hourly Rate)")
	            functions = gr.Textbox(label="Functions", lines=4)

	    # The order of the outputs must follow the order of the list that
	    # extract_fields returns.
	    submit_btn.click(
	        fn=extract_fields,
	        inputs=image_input,
	        outputs=[name, email, phone, dob, postcode, prem, rate, functions],
	    )

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()