Fanu2 committed on
Commit
3daceb5
·
verified ·
1 Parent(s): ef2f69c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -33
app.py CHANGED
@@ -8,10 +8,8 @@ st.set_page_config(page_title="Jamabandi OCR Parser", layout="wide")
8
  st.title("🏡 Jamabandi OCR Parser")
9
  st.markdown("Upload a Jamabandi PDF or OCR text file to extract structured land record data.")
10
 
11
- # File uploader
12
  uploaded_file = st.file_uploader("📤 Upload PDF or TXT", type=["pdf", "txt"])
13
 
14
- # Extract text from file
15
  def extract_text(file):
16
  if file.name.endswith(".pdf"):
17
  reader = PdfReader(file)
@@ -20,47 +18,41 @@ def extract_text(file):
20
  text = file.read().decode("utf-8")
21
  return text
22
 
23
- # Parse OCR text into structured records
24
- def parse_jamabandi(text):
25
- records = []
26
- text = text.replace("\n", " ").replace(":", " : ")
 
 
 
27
 
28
- # Extract global fields
29
- village = re.search(r"गांव[:\s]*([^\s]+)", text)
30
- tehsil = re.search(r"तहसी[lिi]+([^\s]+)", text)
31
- district = re.search(r"जिला[:\s]*([^\s]+)", text)
32
- khewat = re.search(r"खेवट[^\d]*(\d+)", text)
33
- khatoni = re.search(r"खतौनी[^\d]*(\d+)", text)
34
- mutation_dates = re.findall(r"तिथि[:\s]*(\d{1,2}-\d{1,2}-\d{4})", text)
35
 
36
- # Owner blocks
37
- owner_blocks = re.findall(r"([^\s]+ सिंह पुत्र[^\s]+)", text)
38
 
39
- for i, block in enumerate(owner_blocks):
40
- record = {
41
- "Owner": block.split("पुत्र")[0].strip(),
42
- "Father's Name": block.split("पुत्र")[1].strip() if "पुत्र" in block else "",
43
- "Village": village.group(1) if village else "",
44
- "Tehsil": tehsil.group(1) if tehsil else "",
45
- "District": district.group(1) if district else "",
46
- "Khewat No": khewat.group(1) if khewat else "",
47
- "Khatoni No": khatoni.group(1) if khatoni else "",
48
- "Mutation Date": mutation_dates[i] if i < len(mutation_dates) else "",
49
- }
50
- records.append(record)
51
 
52
  return pd.DataFrame(records)
53
 
54
- # Main logic
55
  if uploaded_file:
56
  raw_text = extract_text(uploaded_file)
57
- df = parse_jamabandi(raw_text)
58
 
59
- st.success("✅ Parsed Jamabandi Records")
60
- st.dataframe(df, use_container_width=True)
 
61
 
62
- csv = df.to_csv(index=False).encode('utf-8')
63
- st.download_button("📥 Download CSV", csv, "jamabandi_records.csv", "text/csv")
 
 
64
 
65
  with st.expander("📄 Raw OCR Text"):
66
  st.text_area("OCR Preview", raw_text, height=300)
 
8
  st.title("🏡 Jamabandi OCR Parser")
9
  st.markdown("Upload a Jamabandi PDF or OCR text file to extract structured land record data.")
10
 
 
11
  uploaded_file = st.file_uploader("📤 Upload PDF or TXT", type=["pdf", "txt"])
12
 
 
13
  def extract_text(file):
14
  if file.name.endswith(".pdf"):
15
  reader = PdfReader(file)
 
18
  text = file.read().decode("utf-8")
19
  return text
20
 
21
def parse_jamabandi_table(text):
    """Parse whitespace-aligned OCR table text into a DataFrame.

    Blank lines are ignored. Of the remaining lines, the first is treated
    as a metadata row and skipped, the second is the header row, and every
    following line is a candidate data row. Cells are separated by runs of
    two or more whitespace characters.

    Args:
        text: Raw OCR text. ``None`` or an empty string yields an empty
            DataFrame.

    Returns:
        A ``pandas.DataFrame`` with one row per data line that has at least
        as many cells as there are headers (extra trailing cells are
        dropped; shorter lines are skipped). An empty DataFrame is returned
        when fewer than three non-blank lines are present.
    """
    if not text:
        return pd.DataFrame()

    lines = [line.strip() for line in text.splitlines() if line.strip()]

    # Need at least: metadata row, header row, and one data row.
    if len(lines) < 3:
        return pd.DataFrame()

    # Compile once; the same separator heuristic is applied to every row.
    column_gap = re.compile(r"\s{2,}")

    # Repeated header names would collapse into a single dict key and
    # silently overwrite earlier cells, so make every name unique by
    # suffixing duplicates (_2, _3, ...).
    headers = []
    used = set()
    for name in column_gap.split(lines[1]):
        candidate = name
        suffix = 1
        while candidate in used:
            suffix += 1
            candidate = f"{name}_{suffix}"
        used.add(candidate)
        headers.append(candidate)

    # zip() stops at the shorter sequence, which trims surplus cells.
    records = [
        dict(zip(headers, fields))
        for fields in (column_gap.split(line) for line in lines[2:])
        if len(fields) >= len(headers)
    ]

    return pd.DataFrame(records)
43
 
 
44
  if uploaded_file:
45
  raw_text = extract_text(uploaded_file)
46
+ df = parse_jamabandi_table(raw_text)
47
 
48
+ if not df.empty:
49
+ st.success("✅ Parsed Jamabandi Records")
50
+ st.dataframe(df, use_container_width=True)
51
 
52
+ csv = df.to_csv(index=False).encode('utf-8')
53
+ st.download_button("📥 Download CSV", csv, "jamabandi_records.csv", "text/csv")
54
+ else:
55
+ st.warning("⚠️ No structured records found. Please check the OCR format.")
56
 
57
  with st.expander("📄 Raw OCR Text"):
58
  st.text_area("OCR Preview", raw_text, height=300)