"""Streamlit app that turns Jamabandi OCR output into a table.

The user uploads a PDF or a plain-text file; the text is extracted, split
into whitespace-delimited columns and shown as a DataFrame that can be
downloaded as CSV.
"""

import re
from io import StringIO

import pandas as pd
import streamlit as st
from PyPDF2 import PdfReader

# Columns in the OCR text are assumed to be separated by runs of two or more
# whitespace characters (basic heuristic). Compiled once, reused for every row.
_COLUMN_GAP = re.compile(r"\s{2,}")

st.set_page_config(page_title="Jamabandi OCR Parser", layout="wide")
st.title("🏡 Jamabandi OCR Parser")
st.markdown("Upload a Jamabandi PDF or OCR text file to extract structured land record data.")

uploaded_file = st.file_uploader("📤 Upload PDF or TXT", type=["pdf", "txt"])


def extract_text(file) -> str:
    """Return the text content of an uploaded PDF or text file.

    Args:
        file: A binary file-like object exposing ``name`` and ``read()``
            (here, the object returned by ``st.file_uploader``).

    Returns:
        For a ``.pdf`` name (any letter case), the text of every page that
        yields text, joined with newlines. Otherwise the file's bytes decoded
        as UTF-8, with undecodable bytes replaced by U+FFFD.
    """
    # Compare in lowercase so "SCAN.PDF" is not mistaken for a text file.
    if file.name.lower().endswith(".pdf"):
        reader = PdfReader(file)
        # Extract each page exactly once; extraction is the expensive step.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    # OCR exports are not always clean UTF-8; replace bad bytes rather than
    # letting a UnicodeDecodeError take down the whole app.
    return file.read().decode("utf-8", errors="replace")


def parse_jamabandi_table(text: str) -> pd.DataFrame:
    """Parse OCR text into a DataFrame of records.

    Blank lines are discarded and the remaining lines are stripped. The first
    remaining line is treated as a metadata row and skipped, the second as the
    header row, and the rest as data rows. Header and data rows are split on
    runs of two or more whitespace characters.

    Args:
        text: Raw text as produced by ``extract_text``.

    Returns:
        One row per data line that has at least as many fields as the header;
        surplus trailing fields are dropped and shorter lines are skipped.
        An empty DataFrame if fewer than three non-blank lines are present.
    """
    lines = [stripped for stripped in map(str.strip, text.splitlines()) if stripped]

    # Need a metadata row, a header row and at least one data row.
    if len(lines) < 3:
        return pd.DataFrame()

    # lines[0] is the metadata row and is intentionally ignored.
    headers = _COLUMN_GAP.split(lines[1])
    rows = (_COLUMN_GAP.split(line) for line in lines[2:])

    # zip() stops at the shorter input, so surplus trailing fields are dropped.
    records = [
        dict(zip(headers, fields))
        for fields in rows
        if len(fields) >= len(headers)
    ]
    return pd.DataFrame(records)


if uploaded_file is not None:
    raw_text = extract_text(uploaded_file)
    df = parse_jamabandi_table(raw_text)

    if not df.empty:
        st.success("✅ Parsed Jamabandi Records")
        st.dataframe(df, use_container_width=True)
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download CSV", csv, "jamabandi_records.csv", "text/csv")
    else:
        st.warning("⚠️ No structured records found. Please check the OCR format.")

    # Always offer the raw text so the user can see what the parser received.
    with st.expander("📄 Raw OCR Text"):
        st.text_area("OCR Preview", raw_text, height=300)
else:
    st.info("Upload a Jamabandi PDF or TXT file to begin.")