# app.py, last commit "Update app.py" (1cc33aa, verified) by chburhan64.
import streamlit as st
import requests
import io
import base64
from PyPDF2 import PdfReader
from pdf2image import convert_from_bytes
import pytesseract
import re
# Simulated services: each internal service id maps to the display name
# of the service and the URL of the PDF form associated with it.
# NOTE(review): "fbr_tax" and "property" point at the same URL; this
# looks like placeholder data, so confirm before relying on it.
SERVICES = {
    "fbr_tax": dict(
        name="FBR Tax Registration",
        form_url="https://www.sindheducation.gov.pk/Contents/Careers/APPLICATION%20FORM%20ME.pdf",
    ),
    "property": dict(
        name="Property Registration (Sindh)",
        form_url="https://www.sindheducation.gov.pk/Contents/Careers/APPLICATION%20FORM%20ME.pdf",
    ),
    "actual_form": dict(
        name="FBR Form",
        form_url="https://www.themodernfirm.com/wp-content/uploads/2017/12/Sample-Fillable-PDF.pdf",
    ),
}
# Helper Functions
def extract_text_from_pdf(file_bytes):
    """Return the embedded text of every page of a PDF as one string.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The text of all pages joined with a newline between pages. A page
        whose ``extract_text()`` result is falsy contributes an empty
        string, so the result can be whitespace-only.
    """
    # Build the reader directly, as the rest of this file does: PyPDF2's
    # PdfReader is not usable in a ``with`` statement, and the in-memory
    # BytesIO buffer needs no explicit close.
    reader = PdfReader(io.BytesIO(file_bytes))
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def extract_text_with_ocr(file_bytes):
    """Recover text from a PDF by running OCR on its rendered images.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The OCR output of every image produced by ``convert_from_bytes``,
        joined with a newline between images.
    """
    recognized = []
    for page_image in convert_from_bytes(file_bytes):
        recognized.append(pytesseract.image_to_string(page_image))
    return "\n".join(recognized)
def extract_labels_from_text(file_bytes):
    """Collect upper-case label candidates from the text of a PDF.

    The embedded text is used when there is any; otherwise the PDF is
    run through OCR. A candidate is a run of capital letters and spaces.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        A sorted list of unique labels, each stripped of surrounding
        spaces and at least three characters long.
    """
    text = extract_text_from_pdf(file_bytes)
    if not text.strip():
        text = extract_text_with_ocr(file_bytes)
    # The pattern also matches runs made only of spaces and keeps padding
    # around real labels, so normalise before de-duplicating.
    candidates = (match.strip() for match in re.findall(r"[A-Z ]{3,}", text))
    # Sorting gives a stable order; a bare set would not.
    return sorted({label for label in candidates if len(label) >= 3})
def extract_all_fields_from_text(text):
    """Parse ``key: value`` lines of ``text`` into a dictionary.

    Each line is split at its first colon; both sides are stripped of
    surrounding whitespace. Lines without a colon are ignored, and when a
    key appears more than once the last value wins.

    Args:
        text: Multi-line string to scan.

    Returns:
        Dict mapping each non-empty key to its value string.
    """
    extracted_data = {}
    for line in text.splitlines():
        key, separator, value = line.partition(":")
        key = key.strip()
        # Skip lines with no colon, and lines with nothing before it: an
        # empty key is a substring of every field name, so it would be
        # matched to every form field by the substring-based mapper.
        if not separator or not key:
            continue
        extracted_data[key] = value.strip()
    return extracted_data
def get_field_mapping_from_llm(form_fields, user_data):
    """Match form field names to keys of ``user_data`` by name similarity.

    Despite the name, no language model is called: matching is a plain
    case-insensitive string comparison. For each field, a key whose name
    equals the field name is preferred; otherwise the first key (in
    ``user_data`` order) that contains the field name, or is contained in
    it, is used. Fields with no match are left out.

    Args:
        form_fields: Iterable of form field names (strings).
        user_data: Dict whose string keys are the candidate names.

    Returns:
        Dict mapping each matched form field to the chosen ``user_data`` key.
    """
    # Normalise every usable key once. Blank keys are dropped because the
    # empty string is a substring of everything and would match any field.
    candidates = [
        (key, key.strip().lower()) for key in user_data if key.strip()
    ]

    mapping = {}
    for field in form_fields:
        wanted = field.strip().lower()
        if not wanted:
            # A blank field name would likewise match every key.
            continue
        # An exact name match beats an earlier, merely similar key.
        exact = next((key for key, name in candidates if name == wanted), None)
        if exact is not None:
            mapping[field] = exact
            continue
        for key, name in candidates:
            if wanted in name or name in wanted:
                mapping[field] = key
                break
    return mapping
def reconstruct_user_data(mapping, user_data):
    """Re-key user values by form field name.

    Args:
        mapping: Dict of form field name -> key in ``user_data``.
        user_data: Dict holding the user's values.

    Returns:
        Dict of form field name -> the value stored under the mapped key,
        or an empty string when ``user_data`` has no such key.
    """
    values_by_field = {}
    for field, user_key in mapping.items():
        values_by_field[field] = user_data.get(user_key, "")
    return values_by_field
def extract_acroform_fields(reader):
    """List the names of the form fields reported by ``reader``.

    Args:
        reader: Object exposing ``get_fields()``; in this file a PyPDF2
            ``PdfReader``.

    Returns:
        The keys of ``reader.get_fields()`` as a list, or an empty list
        when that call returns a falsy value.
    """
    # Query the reader once and reuse the result instead of calling
    # get_fields() a second time just to iterate it.
    fields = reader.get_fields()
    return list(fields) if fields else []
from PyPDF2 import PdfWriter
def auto_fill_flat_pdf_smart(pdf_bytes, output_path, data):
    """Write a copy of a PDF to ``output_path`` with ``data`` as metadata.

    Every page of the source document is added to a new writer, and each
    item of ``data`` is recorded as a document metadata entry whose name
    is the item's key prefixed with "/". This function itself writes no
    values into form fields or page content.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        output_path: Path of the file to create or overwrite.
        data: Dict of field name -> value to store as metadata.
    """
    from PyPDF2 import PdfReader, PdfWriter

    source = PdfReader(io.BytesIO(pdf_bytes))
    output = PdfWriter()
    for source_page in source.pages:
        output.add_page(source_page)

    metadata = {}
    for name, value in data.items():
        metadata[f"/{name}"] = value
    output.add_metadata(metadata)

    with open(output_path, "wb") as out_file:
        output.write(out_file)
def recommend_tax_form_type(query):
    """Pick the tax form category suggested by a free-text query.

    The check is case-insensitive and "income" takes precedence over
    "sales".

    Args:
        query: The user's request text.

    Returns:
        A one-element list: ``["Income Tax"]``, ``["Sales Tax"]`` or
        ``["General Tax"]`` when neither keyword is present.
    """
    lowered = query.lower()
    if "income" in lowered:
        return ["Income Tax"]
    if "sales" in lowered:
        return ["Sales Tax"]
    return ["General Tax"]
def tax_agent_response(query):
    """Return the canned reply for queries that ask about a form.

    Args:
        query: The user's request text.

    Returns:
        ``"You will need a form to register."`` when the query contains
        "form" or "do i need" (case-insensitive), otherwise an empty
        string.
    """
    lowered = query.lower()
    asks_about_form = any(phrase in lowered for phrase in ("form", "do i need"))
    return "You will need a form to register." if asks_about_form else ""
# Streamlit helpers
def _read_user_data_from_upload(uploaded_file):
    """Extract ``key: value`` pairs from an uploaded PDF of user data.

    The PDF's embedded text is tried first; OCR is the fallback when no
    text is found.
    """
    # getvalue() returns the whole buffer whatever the current read
    # position is, so no read()/seek(0) bookkeeping is needed.
    pdf_bytes = uploaded_file.getvalue()
    user_text = extract_text_from_pdf(pdf_bytes)
    if not user_text.strip():
        user_text = extract_text_with_ocr(pdf_bytes)
    return extract_all_fields_from_text(user_text)


def _render_filled_form_link(form_bytes, data, filename, link_text):
    """Generate the output PDF and show an inline download link for it."""
    import os
    import tempfile

    # Write into a private temporary directory: a fixed file name in the
    # working directory would be shared by every session of the app.
    with tempfile.TemporaryDirectory() as work_dir:
        output_path = os.path.join(work_dir, filename)
        auto_fill_flat_pdf_smart(form_bytes, output_path, data)
        with open(output_path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode()
    href = f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">{link_text}</a>'
    st.markdown(href, unsafe_allow_html=True)


# Streamlit App Interface
st.set_page_config(page_title="PDF Form Assistant", layout="centered")
st.title("📄 Smart PDF Form Assistant")
query = st.text_input("How can I assist you today?", placeholder="E.g., I want to register for FBR tax")
if query:
    # All keyword checks are case-insensitive, so "FBR" and "Property"
    # are recognised as well as their lower-case spellings.
    query_lower = query.lower()
    if "fbr" in query_lower or "tax" in query_lower:
        service_key = "fbr_tax"
    elif "property" in query_lower:
        service_key = "property"
    else:
        service_key = None

    if service_key:
        agent_response = tax_agent_response(query)
        session = "Assistant" if "form" in query_lower or "need" in query_lower else "Chatbot"
        if session == "Assistant":
            if agent_response:
                st.info(agent_response)
            recommended_types = recommend_tax_form_type(query)
            st.success(f"Recommended Form Types: {', '.join(recommended_types)}")
            # NOTE(review): the fillable sample form is downloaded for
            # every service, while the manual link below uses the form of
            # the detected service. Confirm this is intended.
            with st.spinner("Downloading form..."):
                try:
                    resp = requests.get(SERVICES["actual_form"]["form_url"], timeout=10)
                    resp.raise_for_status()
                    form_bytes = resp.content
                except requests.RequestException as e:
                    st.error(f"Failed to download form: {e}")
                    form_bytes = None
            if form_bytes:
                if st.radio("Do you want help filling the form?", ["Yes", "No"]) == "Yes":
                    reader = PdfReader(io.BytesIO(form_bytes))
                    acro_fields = extract_acroform_fields(reader)
                    if acro_fields:
                        st.subheader("Detected Fields:")
                        st.write(acro_fields)
                        mode = st.radio("Enter data manually or upload filled data PDF?", ["Manual", "Upload"])
                        user_data = {}
                        if mode == "Manual":
                            for field in acro_fields:
                                user_data[field] = st.text_input(f"{field}:")
                        else:
                            uploaded_file = st.file_uploader("Upload your data PDF", type=["pdf"])
                            if uploaded_file:
                                raw_user_data = _read_user_data_from_upload(uploaded_file)
                                field_mapping = get_field_mapping_from_llm(acro_fields, raw_user_data)
                                user_data = reconstruct_user_data(field_mapping, raw_user_data)
                        if st.button("🔽 Generate Filled Form"):
                            _render_filled_form_link(
                                form_bytes, user_data, "filled_form.pdf", "📥 Download Filled Form"
                            )
                    else:
                        st.warning("No AcroForm fields detected. Using flat form filling.")
                        labels = extract_labels_from_text(form_bytes)
                        st.write("Detected Labels:", labels)
                        uploaded_file = st.file_uploader("Upload your data PDF", type=["pdf"])
                        if uploaded_file:
                            raw_user_data = _read_user_data_from_upload(uploaded_file)
                            field_mapping = get_field_mapping_from_llm(labels, raw_user_data)
                            final_user_data = reconstruct_user_data(field_mapping, raw_user_data)
                            # The button lives inside the upload branch
                            # because final_user_data only exists here.
                            if st.button("🔽 Generate Filled Flat Form"):
                                _render_filled_form_link(
                                    form_bytes,
                                    final_user_data,
                                    "flat_filled_form.pdf",
                                    "📥 Download Filled Flat Form",
                                )
                else:
                    st.info("You can download the form and fill it manually:")
                    st.markdown(f"[Open Form]({SERVICES[service_key]['form_url']})")
        else:
            st.write("🤖 Chatbot mode active.")
    else:
        st.error("Could not determine the type of service you're looking for.")