|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import re |
|
|
from io import StringIO |
|
|
from PyPDF2 import PdfReader |
|
|
|
|
|
# Page-level configuration and the static header shown on every run.
st.set_page_config(
    layout="wide",
    page_title="Jamabandi OCR Parser",
)
st.title("🏡 Jamabandi OCR Parser")
st.markdown(
    "Upload a Jamabandi PDF or OCR text file to extract structured land record data."
)

# The uploader only offers PDF and TXT files; the result is falsy until the
# user has picked one.
uploaded_file = st.file_uploader(
    "📤 Upload PDF or TXT",
    type=["pdf", "txt"],
)
|
|
|
|
|
def extract_text(file) -> str:
    """Return the text content of an uploaded PDF or plain-text file.

    Args:
        file: A binary file-like object exposing a ``name`` attribute and a
            ``read()`` method that returns bytes (this script passes the
            object returned by ``st.file_uploader``).

    Returns:
        For names ending in ``.pdf`` (compared case-insensitively), the text
        of every page that yields any text, joined with newlines. For any
        other name, the file's bytes decoded as UTF-8.
    """
    # Compare case-insensitively so "SCAN.PDF" is not mistaken for a text file
    # and fed to the UTF-8 decoder.
    if file.name.lower().endswith(".pdf"):
        reader = PdfReader(file)
        # Extract each page exactly once; pages that yield no text (None or
        # an empty string) are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(chunk for chunk in page_texts if chunk)

    # "utf-8-sig" drops a leading byte-order mark when one is present, and
    # errors="replace" substitutes U+FFFD for undecodable bytes instead of
    # raising, so one stray byte in an OCR dump does not abort the whole run.
    return file.read().decode("utf-8-sig", errors="replace")
|
|
|
|
|
def parse_jamabandi_table(text, header_index=1):
    """Parse whitespace-aligned OCR text into a DataFrame of string fields.

    Blank lines are discarded and the remaining lines are stripped. One of
    them is taken as the header row and every later line is treated as a
    candidate data row. Columns are separated by runs of two or more
    whitespace characters, so single spaces inside a value are preserved.

    Args:
        text: The raw OCR text.
        header_index: Zero-based position of the header row among the
            non-blank lines. The default of 1 skips the first non-blank
            line (presumably a title line -- confirm against real scans).

    Returns:
        A DataFrame with one row per data line that has at least as many
        fields as the header; surplus trailing fields are dropped and
        shorter lines are skipped. An empty DataFrame is returned when
        there is no data line after the header or when no line qualifies.

    Raises:
        ValueError: If ``header_index`` is negative.
    """
    if header_index < 0:
        raise ValueError("header_index must be non-negative")

    lines = [ln for ln in map(str.strip, text.splitlines()) if ln]

    # The header row plus at least one data row must exist.
    if len(lines) < header_index + 2:
        return pd.DataFrame()

    # Compiled once and reused for the header and every data line.
    column_gap = re.compile(r"\s{2,}")
    headers = column_gap.split(lines[header_index])
    width = len(headers)

    candidate_rows = (column_gap.split(ln) for ln in lines[header_index + 1:])
    # zip() stops at the shorter sequence, which discards any surplus fields.
    records = [
        dict(zip(headers, fields))
        for fields in candidate_rows
        if len(fields) >= width
    ]
    return pd.DataFrame(records)
|
|
|
|
|
# Main UI flow: prompt until a file is uploaded, then parse it and render
# the results.
if not uploaded_file:
    st.info("Upload a Jamabandi PDF or TXT file to begin.")
else:
    raw_text = extract_text(uploaded_file)
    df = parse_jamabandi_table(raw_text)

    if df.empty:
        st.warning("⚠️ No structured records found. Please check the OCR format.")
    else:
        st.success("✅ Parsed Jamabandi Records")
        st.dataframe(df, use_container_width=True)

        # Offer the parsed table as a UTF-8 encoded CSV download.
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download CSV", csv, "jamabandi_records.csv", "text/csv")

    # The raw text is shown whether or not parsing produced any rows, so the
    # user can inspect what the extractor actually saw.
    with st.expander("📄 Raw OCR Text"):
        st.text_area("OCR Preview", raw_text, height=300)
|
|
|