Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pytesseract | |
| import cv2 | |
| import pandas as pd | |
| import re | |
| from PIL import Image | |
| import numpy as np | |
# Columns that together identify one text line in Tesseract's TSV output.
# ``line_num`` restarts inside every block/paragraph, so it is not unique
# on its own.
_LINE_KEYS = ("block_num", "par_num", "line_num")

# Number of values ``extract_fields`` must always return (one per UI output).
_FIELD_COUNT = 8

_NON_LETTER_EDGES = re.compile(r'^[^a-zA-Z]+|[^a-zA-Z]+$')
_POSTCODE_AFTER_CH = re.compile(r'\bCH\b.*?(\d{4})(?![\d/])')
_BARE_FOUR_DIGITS = re.compile(r'(?<!\d|\w)[0-9]{4}(?!\d|\w)')


def _clean_ocr_frame(ocr_df):
    """Drop empty tokens from an OCR data frame and make ``text`` a str column."""
    ocr_df = ocr_df.dropna(subset=["text"]).copy()
    # The frame is parsed from TSV, so a page containing only numbers can end
    # up with a numeric ``text`` column, on which ``.str`` would raise.
    ocr_df["text"] = ocr_df["text"].astype(str)
    return ocr_df[ocr_df["text"].str.strip() != ""]


def _same_line(ocr_df, row):
    """Return the tokens sharing *row*'s text line, ordered left to right.

    An empty frame is returned when the positional columns are missing.
    """
    if "line_num" not in ocr_df.columns or "left" not in ocr_df.columns:
        return ocr_df.iloc[0:0]
    mask = ocr_df["line_num"] == row["line_num"]
    for key in ("block_num", "par_num"):
        if key in ocr_df.columns:
            mask &= ocr_df[key] == row[key]
    return ocr_df[mask].sort_values(by="left").reset_index(drop=True)


def _clean_name(words):
    """Strip non-letter characters from both ends of each word and join them."""
    stripped = (_NON_LETTER_EDGES.sub('', word) for word in words)
    return ' '.join(word for word in stripped if word)


def _extract_name(ocr_df):
    """Return up to two cleaned words following the first "tit…" token, or ""."""
    is_title = ocr_df["text"].str.lower().str.contains("tit", na=False)
    if not is_title.any():
        return ""
    line = _same_line(ocr_df, ocr_df[is_title].iloc[0])
    hits = line[line["text"].str.lower().str.contains("tit", na=False)].index
    if hits.empty:
        return ""
    start = hits[0] + 1
    return _clean_name(line["text"].iloc[start:start + 2].tolist())


def _value_right_of(ocr_df, keyword, max_dist=200):
    """Return the first token right of *keyword* on its line, or ``None``.

    *keyword* is compared case-insensitively against whole tokens; only
    tokens whose ``left`` lies within *max_dist* of the keyword are considered.
    """
    matches = ocr_df[ocr_df["text"].str.lower() == keyword.lower()]
    if matches.empty:
        return None
    anchor = matches.iloc[0]
    line = _same_line(ocr_df, anchor)
    if line.empty:
        return None
    x = anchor["left"]
    nearby = line[(line["left"] > x) & (line["left"] < x + max_dist)]
    return None if nearby.empty else nearby["text"].iloc[0]


def _find_postcode(raw_text):
    """Return a 4-digit postcode from *raw_text*, or "Not found".

    A number following a standalone "CH" on the same line wins; otherwise the
    first free-standing 4-digit number anywhere in the text is used.
    """
    for line in raw_text.splitlines():
        after_ch = _POSTCODE_AFTER_CH.search(line)
        if after_ch:
            return after_ch.group(1)
    bare = _BARE_FOUR_DIGITS.search(raw_text)
    return bare.group(0) if bare else "Not found"


def _extract_functions(ocr_df):
    """Return the cleaned text lines found just below the first "function" token."""
    header = ocr_df[ocr_df["text"].str.lower().str.contains("function", na=False)]
    if header.empty:
        return []
    base_y = header.iloc[0]["top"]
    below = ocr_df[(ocr_df["top"] > base_y + 10) & (ocr_df["top"] < base_y + 120)]

    # Group on the full line key so lines from different blocks that happen
    # to share a ``line_num`` are not merged into one string.
    keys = [key for key in _LINE_KEYS if key in below.columns]
    ordered = below.sort_values(by=keys + ["left"])
    lines = ordered.groupby(keys, sort=True)["text"].apply(" ".join).tolist()

    cleaned = (re.sub(r'[^a-zA-Z0-9\s]', '', line).strip() for line in lines)
    return [line for line in cleaned if len(line) > 1]


def extract_fields(image):
    """Run OCR on a document image and pull out a fixed set of fields.

    Args:
        image: A PIL image (the UI passes ``gr.Image(type="pil")``).

    Returns:
        A list of exactly eight strings, in this order: name, email, phone,
        date of birth, postcode, value next to "CurBase", value next to
        "hourly"/"rate", and newline-joined function lines. Missing fields
        are "Not found". If anything raises, the first element is
        "Error: <message>" and the remaining seven are "Not found".
    """
    try:
        # -------------------- Image Preparation --------------------
        img = np.array(image.convert("RGB"))[:, :, ::-1]  # PIL RGB -> BGR (OpenCV)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 25, 15)
        inverted = cv2.bitwise_not(bw)
        pil_img = Image.fromarray(inverted)

        # -------------------- OCR Pass 1: Name by "Title" --------------------
        # The name is read from the thresholded image.
        title_df = _clean_ocr_frame(
            pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DATAFRAME)
        )
        name = _extract_name(title_df)

        # -------------------- OCR Pass 2: For Other Fields --------------------
        # All remaining fields are read from the untouched input image.
        fields_df = _clean_ocr_frame(
            pytesseract.image_to_data(image, output_type=pytesseract.Output.DATAFRAME)
        )
        text = " ".join(fields_df["text"])
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text)

        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)

        functions = _extract_functions(fields_df)

        # -------------------- Final Output --------------------
        return [
            name if name else "Not found",
            email_match.group(0) if email_match else "Not found",
            phone_match.group(0) if phone_match else "Not found",
            dob_match.group(0) if dob_match else "Not found",
            _find_postcode(raw_text),
            _value_right_of(fields_df, "CurBase") or "Not found",
            _value_right_of(fields_df, "hourly")
            or _value_right_of(fields_df, "rate")
            or "Not found",
            "\n".join(functions) if functions else "Not found",
        ]
    except Exception as e:
        # UI boundary: report the failure in the first field instead of
        # crashing, and keep the result the same length as the success path.
        return [f"Error: {e}"] + ["Not found"] * (_FIELD_COUNT - 1)
# -------------------- Gradio Interface --------------------
# Two-column page: the upload controls on the left, one text box per
# extracted field on the right. The button feeds the uploaded image to
# ``extract_fields`` and spreads its return values over the text boxes.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Image OCR Field Extractor")
    gr.Markdown("Upload a document image to extract structured data fields.")

    with gr.Row():
        # Left column: image upload, trigger button and a clickable example.
        with gr.Column():
            image_input = gr.Image(type="pil", label=" Upload Your Document")
            submit_btn = gr.Button(" Run Extraction")
            gr.Examples(
                examples=["example_doc.jpeg"],
                inputs=[image_input],
                label=" Example Image (Click to load into uploader)",
            )

        # Right column: result widgets, created in the same order as the
        # values returned by ``extract_fields``.
        with gr.Column():
            result_boxes = [
                gr.Text(label="Name"),
                gr.Text(label="Email"),
                gr.Text(label="Phone"),
                gr.Text(label="DOB"),
                gr.Text(label="Postcode"),
                gr.Text(label="Prem (CurBase)"),
                gr.Text(label="Temp (Hourly Rate)"),
                gr.Textbox(label="Functions", lines=4),
            ]

    submit_btn.click(
        fn=extract_fields,
        inputs=image_input,
        outputs=result_boxes,
    )

if __name__ == "__main__":
    demo.launch()