# Spaces: Sleeping
import csv
import os
import re
import tempfile

import cv2
import gradio as gr
import numpy as np
import pytesseract
from PIL import Image
# Alternative wordings searched for in the OCR text, keyed by the canonical
# feature label. Within each list the entries are tried in the order given.
feature_synonyms = {
    "Caller Number Display": [
        "Caller Number Display",
        "Caller ID",
        "Calling Number",
    ],
    "Incoming Calls": [
        "Incoming Calls",
        "Inbound Calls",
    ],
    "Incoming SMS": [
        "Incoming SMS",
        "Received SMS",
        "Inbound SMS",
    ],
}
def find_feature(text, synonyms_list):
    """Return the first synonym found in *text*, plus its status word if any.

    Synonyms are tried in list order and matched case-insensitively. Each
    synonym is searched for literally and may be followed by optional
    whitespace and one of ``FREE``, ``Included`` or ``Yes``.

    Args:
        text: The text to search (for example OCR output).
        synonyms_list: Candidate wordings for a single feature, in priority
            order.

    Returns:
        The matched span with surrounding whitespace stripped, or ``""`` when
        none of the synonyms occurs in *text*.
    """
    for feature_name in synonyms_list:
        # Escape the synonym so characters such as "(", "+" or "." are
        # matched verbatim instead of being interpreted as regex syntax.
        pattern = rf"({re.escape(feature_name)})\s*(FREE|Included|Yes)?"
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(0).strip()
    return ""
# (field name, compiled pattern) pairs in the order the fields appear in the
# result. Capture group 1 of each pattern is the value stored for the field.
_FIELD_PATTERNS = (
    (
        "Mobile Number",
        re.compile(r"Mobile\s*No\.?\s*[:\-]?\s*(\d+)", re.IGNORECASE),
    ),
    (
        "Receipt Date",
        re.compile(
            r"Receipt Date\s*[:\-]?\s*([\d]{1,2}[-/][A-Za-z]{3}[-/][\d]{4})",
            re.IGNORECASE,
        ),
    ),
    (
        # Accepts an optional ":" / "-" separator and whitespace around the
        # "$", consistent with the other labelled fields.
        "Total Amount Paid",
        re.compile(r"Total Amount Paid\s*[:\-]?\s*\$?\s*([\d.,]+)", re.IGNORECASE),
    ),
    (
        "Local Data",
        re.compile(r"(\d+GB\s*plan.*?)(?:\n|$)", re.IGNORECASE),
    ),
    (
        "Call Time",
        re.compile(r"(\d+\s*mins.*?)(?:\n|$)", re.IGNORECASE),
    ),
    (
        "Number of SMS",
        re.compile(r"(\d+\s*SMS.*?)(?:\n|$)", re.IGNORECASE),
    ),
    (
        "Roaming Details",
        re.compile(r"(\d+GB.*?Roaming.*?)(?:\n|$)", re.IGNORECASE),
    ),
)

# Fields resolved through the synonym table rather than a dedicated pattern.
_FEATURE_FIELDS = ("Caller Number Display", "Incoming Calls", "Incoming SMS")


def parse_text(text):
    """Extract the known bill fields from a block of text.

    Each pattern-based field takes the first match of its regular expression;
    the three feature fields are looked up with ``find_feature`` using the
    wordings in ``feature_synonyms``.

    Args:
        text: Text to parse (for example OCR output). ``None`` is treated as
            an empty string.

    Returns:
        A dict with one string entry per field, in a fixed key order. A field
        that is not found maps to ``""``.
    """
    text = text or ""

    data = {}
    for field, pattern in _FIELD_PATTERNS:
        match = pattern.search(text)
        data[field] = match.group(1).strip() if match else ""

    for field in _FEATURE_FIELDS:
        data[field] = find_feature(text, feature_synonyms[field])

    return data
def ocr_extract(image):
    """Run OCR on an uploaded image and export the parsed fields as CSV.

    The image is converted to grayscale, binarised with a fixed threshold of
    150, passed to Tesseract, and the resulting text is handed to
    ``parse_text``. The parsed fields are written as a one-row CSV (with
    header) to a temporary file that is NOT deleted automatically.

    Args:
        image: The uploaded image (a PIL image in this app), or ``None`` when
            nothing was uploaded.

    Returns:
        A ``(display_text, csv_path)`` tuple. ``display_text`` has one
        ``"field: value"`` line per parsed field and ``csv_path`` is the path
        of the CSV file. When *image* is ``None`` the tuple is
        ``("Please upload an image.", None)``.
    """
    if image is None:
        return "Please upload an image.", None

    # Normalise PIL input to 3-channel RGB so grayscale or palette uploads do
    # not fail in the colour conversion below.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    image_np = np.array(image)

    # Preprocess for OCR: grayscale, then a fixed binary threshold.
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    text = pytesseract.image_to_string(thresh)
    parsed_data = parse_text(text)

    # newline="" lets the csv module control line endings itself (otherwise
    # rows end in "\r\r\n" on Windows); the with-block closes the handle even
    # if writing fails. delete=False keeps the file for the caller to serve.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    ) as tmpfile:
        writer = csv.DictWriter(tmpfile, fieldnames=list(parsed_data))
        writer.writeheader()
        writer.writerow(parsed_data)
        tmpfile_path = tmpfile.name

    display_text = "\n".join(f"{k}: {v}" for k, v in parsed_data.items())
    return display_text, tmpfile_path  # a path string, not a file object
# Gradio front end: one image upload in; parsed summary text and a CSV file out.
iface = gr.Interface(
    title="Bill OCR Scanner with CSV Export",
    description="Upload a bill image, extract key fields, and download the data as CSV.",
    fn=ocr_extract,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Extracted Info"),
        gr.File(label="Download CSV"),
    ],
)

if __name__ == "__main__":
    # Listen on every network interface, port 7860.
    iface.launch(server_port=7860, server_name="0.0.0.0")