Spaces:

Fanu2
/

khewat

Sleeping

App Files Community

Fanu2 committed on Aug 18, 2025

Commit

3daceb5

verified ·

1 Parent(s): ef2f69c

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -33

app.py CHANGED Viewed

@@ -8,10 +8,8 @@ st.set_page_config(page_title="Jamabandi OCR Parser", layout="wide")
 st.title("🏡 Jamabandi OCR Parser")
 st.markdown("Upload a Jamabandi PDF or OCR text file to extract structured land record data.")
-# File uploader
 uploaded_file = st.file_uploader("📤 Upload PDF or TXT", type=["pdf", "txt"])
-# Extract text from file
 def extract_text(file):
     if file.name.endswith(".pdf"):
         reader = PdfReader(file)
@@ -20,47 +18,41 @@ def extract_text(file):
         text = file.read().decode("utf-8")
     return text
-# Parse OCR text into structured records
-def parse_jamabandi(text):
-    records = []
-    text = text.replace("\n", " ").replace(":", " : ")
-    # Extract global fields
-    village = re.search(r"गांव[:\s]*([^\s]+)", text)
-    tehsil = re.search(r"तहसी[lिi]+([^\s]+)", text)
-    district = re.search(r"जिला[:\s]*([^\s]+)", text)
-    khewat = re.search(r"खेवट[^\d]*(\d+)", text)
-    khatoni = re.search(r"खतौनी[^\d]*(\d+)", text)
-    mutation_dates = re.findall(r"तिथि[:\s]*(\d{1,2}-\d{1,2}-\d{4})", text)
-    # Owner blocks
-    owner_blocks = re.findall(r"([^\s]+ सिंह पुत्र[^\s]+)", text)
-    for i, block in enumerate(owner_blocks):
-        record = {
-            "Owner": block.split("पुत्र")[0].strip(),
-            "Father's Name": block.split("पुत्र")[1].strip() if "पुत्र" in block else "",
-            "Village": village.group(1) if village else "",
-            "Tehsil": tehsil.group(1) if tehsil else "",
-            "District": district.group(1) if district else "",
-            "Khewat No": khewat.group(1) if khewat else "",
-            "Khatoni No": khatoni.group(1) if khatoni else "",
-            "Mutation Date": mutation_dates[i] if i < len(mutation_dates) else "",
-        }
-        records.append(record)
     return pd.DataFrame(records)
-# Main logic
 if uploaded_file:
     raw_text = extract_text(uploaded_file)
-    df = parse_jamabandi(raw_text)
-    st.success("✅ Parsed Jamabandi Records")
-    st.dataframe(df, use_container_width=True)
-    csv = df.to_csv(index=False).encode('utf-8')
-    st.download_button("📥 Download CSV", csv, "jamabandi_records.csv", "text/csv")
     with st.expander("📄 Raw OCR Text"):
         st.text_area("OCR Preview", raw_text, height=300)

 st.title("🏡 Jamabandi OCR Parser")
 st.markdown("Upload a Jamabandi PDF or OCR text file to extract structured land record data.")
 uploaded_file = st.file_uploader("📤 Upload PDF or TXT", type=["pdf", "txt"])
 def extract_text(file):
     if file.name.endswith(".pdf"):
         reader = PdfReader(file)
         text = file.read().decode("utf-8")
     return text
+def parse_jamabandi_table(text):
+    lines = text.splitlines()
+    lines = [line.strip() for line in lines if line.strip()]
+    # Skip metadata row
+    if len(lines) < 3:
+        return pd.DataFrame()
+    header_line = lines[1]
+    data_lines = lines[2:]
+    # Split header into columns (basic heuristic)
+    headers = re.split(r"\s{2,}", header_line)
+    records = []
+    for line in data_lines:
+        fields = re.split(r"\s{2,}", line)
+        if len(fields) >= len(headers):
+            record = dict(zip(headers, fields[:len(headers)]))
+            records.append(record)
     return pd.DataFrame(records)
 if uploaded_file:
     raw_text = extract_text(uploaded_file)
+    df = parse_jamabandi_table(raw_text)
+    if not df.empty:
+        st.success("✅ Parsed Jamabandi Records")
+        st.dataframe(df, use_container_width=True)
+        csv = df.to_csv(index=False).encode('utf-8')
+        st.download_button("📥 Download CSV", csv, "jamabandi_records.csv", "text/csv")
+    else:
+        st.warning("⚠️ No structured records found. Please check the OCR format.")
     with st.expander("📄 Raw OCR Text"):
         st.text_area("OCR Preview", raw_text, height=300)