khewat / app.py
Fanu2's picture
Update app.py
3daceb5 verified
import streamlit as st
import pandas as pd
import re
from io import StringIO
from PyPDF2 import PdfReader
# Page-level configuration is issued before any other Streamlit call.
st.set_page_config(
    page_title="Jamabandi OCR Parser",
    layout="wide",
)

# Static heading and a one-line description of what the app does.
st.title("🏡 Jamabandi OCR Parser")
st.markdown(
    "Upload a Jamabandi PDF or OCR text file to extract structured land record data."
)

# Upload widget restricted to PDF and plain-text files; the result is read
# further down in the script.
uploaded_file = st.file_uploader(
    label="📤 Upload PDF or TXT",
    type=["pdf", "txt"],
)
def extract_text(file):
    """Return the text content of an uploaded PDF or plain-text file.

    Args:
        file: A file-like object exposing a ``name`` attribute and, for
            non-PDF input, a ``read()`` method that returns bytes.

    Returns:
        For names ending in ``.pdf`` (any letter case), the text of every
        page that yields text, joined with newlines. For anything else,
        the file's bytes decoded as UTF-8.
    """
    # Compare the extension case-insensitively so "SCAN.PDF" is not
    # mistaken for a text file and fed to the UTF-8 decoder.
    if file.name.lower().endswith(".pdf"):
        reader = PdfReader(file)
        # Extract each page exactly once; pages that yield no text
        # (None or "") are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(chunk for chunk in page_texts if chunk)

    # "utf-8-sig" drops a leading byte-order mark if one is present, and
    # errors="replace" turns stray invalid bytes into U+FFFD instead of
    # aborting on otherwise readable text.
    return file.read().decode("utf-8-sig", errors="replace")
def parse_jamabandi_table(text):
    """Turn whitespace-aligned OCR text into a DataFrame of records.

    After blank lines are discarded, the first remaining line is ignored,
    the second supplies the column names, and every later line is a
    candidate data row. Columns are separated by runs of two or more
    whitespace characters.

    Args:
        text: The raw OCR text.

    Returns:
        A ``pandas.DataFrame`` with one row per accepted data line. It is
        empty when fewer than three non-blank lines exist or when no data
        line has at least as many fields as the header.
    """
    column_gap = re.compile(r"\s{2,}")

    # Trim every line and keep only the ones with visible content.
    rows = [trimmed for trimmed in map(str.strip, text.splitlines()) if trimmed]
    if len(rows) < 3:
        return pd.DataFrame()

    # First non-blank line is skipped; second is the header; rest is data.
    _, header_row, *body_rows = rows
    columns = column_gap.split(header_row)
    width = len(columns)

    # Rows with too few fields are dropped; zip() stops at the last header,
    # so any surplus trailing fields are ignored.
    parsed = [
        dict(zip(columns, cells))
        for cells in map(column_gap.split, body_rows)
        if len(cells) >= width
    ]
    return pd.DataFrame(parsed)
# Main script flow: prompt for a file, or extract, parse and present it.
if not uploaded_file:
    st.info("Upload a Jamabandi PDF or TXT file to begin.")
else:
    raw_text = extract_text(uploaded_file)
    df = parse_jamabandi_table(raw_text)

    if df.empty:
        st.warning("⚠️ No structured records found. Please check the OCR format.")
    else:
        st.success("✅ Parsed Jamabandi Records")
        st.dataframe(df, use_container_width=True)

        # Offer the parsed table as a UTF-8 encoded CSV download.
        csv_bytes = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="📥 Download CSV",
            data=csv_bytes,
            file_name="jamabandi_records.csv",
            mime="text/csv",
        )

    # The extracted text is available for inspection whether or not
    # parsing produced any records.
    with st.expander("📄 Raw OCR Text"):
        st.text_area(label="OCR Preview", value=raw_text, height=300)