import streamlit as st
import pandas as pd
import re
from io import StringIO
from PyPDF2 import PdfReader
# Page setup: set the page title and wide layout, then render the heading and intro text.
# NOTE(review): Streamlit's documentation has required set_page_config to be the
# first Streamlit command on the page; keep it ahead of the other st.* calls
# (confirm against the installed Streamlit version).
st.set_page_config(page_title="Jamabandi OCR Parser", layout="wide")
st.title("🏡 Jamabandi OCR Parser")
st.markdown("Upload a Jamabandi PDF or OCR text file to extract structured land record data.")
# Upload widget limited to the "pdf" and "txt" types; the result is truth-tested
# further down the script to decide whether there is a file to parse.
uploaded_file = st.file_uploader("📤 Upload PDF or TXT", type=["pdf", "txt"])
def extract_text(file):
    """Return the text content of an uploaded PDF or plain-text file.

    Args:
        file: A binary file-like object exposing ``name`` and ``read()``
            (in this script, the value returned by ``st.file_uploader``).

    Returns:
        For names ending in ``.pdf`` (any letter case): the text of every
        page that yields text, joined with newlines. For anything else: the
        file's bytes decoded as UTF-8.
    """
    # Compare case-insensitively so "SCAN.PDF" is not decoded as plain text.
    if file.name.lower().endswith(".pdf"):
        reader = PdfReader(file)
        # Call extract_text() once per page (the previous version called it
        # twice: once to filter, once to join).
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages that yield no text (None or "") are skipped.
        return "\n".join(chunk for chunk in page_texts if chunk)

    # "utf-8-sig" drops a leading byte-order mark if present; errors="replace"
    # substitutes U+FFFD for undecodable bytes instead of raising, so a text
    # file with a few stray bytes can still be previewed and parsed.
    return file.read().decode("utf-8-sig", errors="replace")
def _unique_headers(headers):
    """Return *headers* with repeated names suffixed (_2, _3, ...) so each is unique."""
    seen = set()
    unique = []
    for name in headers:
        candidate = name
        suffix = 2
        # Keep bumping the suffix until the name collides with nothing seen so
        # far (this also covers a literal "Name_2" already in the header).
        while candidate in seen:
            candidate = f"{name}_{suffix}"
            suffix += 1
        seen.add(candidate)
        unique.append(candidate)
    return unique


def parse_jamabandi_table(text):
    """Parse whitespace-aligned OCR text into a DataFrame of records.

    Blank lines are dropped and the remaining lines are stripped. The first
    remaining line is skipped as a metadata row, the second is the header,
    and every later line is a candidate data row. Columns are separated by
    runs of two or more whitespace characters.

    Args:
        text: Raw OCR or PDF text.

    Returns:
        A DataFrame with one row per data line that has at least as many
        fields as the header; fields beyond the header width are discarded
        and shorter lines are skipped. Repeated header names are made unique
        with ``_2``, ``_3``, ... suffixes so no column is lost. An empty
        DataFrame is returned when there are fewer than three non-blank
        lines or no data line qualifies.
    """
    column_gap = re.compile(r"\s{2,}")  # compiled once, reused for every line

    lines = [line for line in map(str.strip, text.splitlines()) if line]
    # Need at least: metadata row, header row, one data row.
    if len(lines) < 3:
        return pd.DataFrame()

    # lines[0] is the metadata row and is intentionally skipped.
    headers = _unique_headers(column_gap.split(lines[1]))
    width = len(headers)

    rows = []
    for line in lines[2:]:
        fields = column_gap.split(line)
        if len(fields) >= width:
            rows.append(fields[:width])

    if not rows:
        return pd.DataFrame()
    # Building from row lists (not dicts keyed by header) keeps every column
    # even when the OCR header repeats a name.
    return pd.DataFrame(rows, columns=headers)
# Main page flow: prompt for a file until one is uploaded; once it is, extract
# its text, try to parse a table, and show either the parsed records (with a
# CSV download) or a warning.
if not uploaded_file:
    st.info("Upload a Jamabandi PDF or TXT file to begin.")
else:
    raw_text = extract_text(uploaded_file)
    df = parse_jamabandi_table(raw_text)

    if df.empty:
        st.warning("⚠️ No structured records found. Please check the OCR format.")
    else:
        st.success("✅ Parsed Jamabandi Records")
        st.dataframe(df, use_container_width=True)
        csv_bytes = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "📥 Download CSV",
            csv_bytes,
            "jamabandi_records.csv",
            "text/csv",
        )

    # NOTE(review): the original indentation was lost; this expander is placed
    # at the upload-branch level (shown whether or not parsing succeeded) --
    # confirm it was not meant to sit only under the warning branch.
    with st.expander("📄 Raw OCR Text"):
        st.text_area("OCR Preview", raw_text, height=300)
|