Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import requests | |
| import io | |
| import base64 | |
| from PyPDF2 import PdfReader | |
| from pdf2image import convert_from_bytes | |
| import pytesseract | |
| import re | |
| # Simulated Services | |
# "fbr_tax" and "property" both point at the same PDF in this simulated setup.
_SHARED_FORM_URL = (
    "https://www.sindheducation.gov.pk/Contents/Careers/APPLICATION%20FORM%20ME.pdf"
)

# Registry of services: service key -> display name and URL of its PDF form.
SERVICES = dict(
    fbr_tax=dict(
        name="FBR Tax Registration",
        form_url=_SHARED_FORM_URL,
    ),
    property=dict(
        name="Property Registration (Sindh)",
        form_url=_SHARED_FORM_URL,
    ),
    actual_form=dict(
        name="FBR Form",
        form_url="https://www.themodernfirm.com/wp-content/uploads/2017/12/Sample-Fillable-PDF.pdf",
    ),
)
| # Helper Functions | |
def extract_text_from_pdf(file_bytes):
    """Return the embedded text of a PDF, with pages joined by newlines.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        One string containing the extracted text of every page in order.
        Pages for which the reader yields no text contribute an empty string.
    """
    # PyPDF2's PdfReader is not a context manager, so wrapping it in `with`
    # fails at runtime. The in-memory BytesIO buffer needs no explicit close.
    reader = PdfReader(io.BytesIO(file_bytes))
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def extract_text_with_ocr(file_bytes):
    """Rasterize a PDF and return the OCR text of its pages, newline-joined.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The text recognized by pytesseract on each rendered page image,
        joined with newlines.
    """
    page_images = convert_from_bytes(file_bytes)
    page_texts = [pytesseract.image_to_string(image) for image in page_images]
    return "\n".join(page_texts)
def extract_labels_from_text(file_bytes):
    """Guess form labels in a PDF by collecting its upper-case phrases.

    The PDF's text layer is used when present; otherwise the document is
    run through OCR.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        A list of unique label strings in order of first appearance, with
        surrounding whitespace removed.
    """
    text = extract_text_from_pdf(file_bytes)
    if not text.strip():
        text = extract_text_with_ocr(file_bytes)
    # The pattern also matches runs made only of spaces and labels padded
    # with spaces; strip each match and drop the ones that end up empty.
    candidates = (match.strip() for match in re.findall(r"[A-Z ]{3,}", text))
    # dict.fromkeys de-duplicates while keeping a stable, first-seen order.
    return list(dict.fromkeys(label for label in candidates if label))
def extract_all_fields_from_text(text):
    """Parse "key: value" lines of *text* into a dictionary.

    Each line is split at its first colon; both sides are stripped of
    surrounding whitespace. Lines without a colon are ignored, and a key
    that appears more than once keeps the value from its last occurrence.

    Args:
        text: Multi-line string to scan.

    Returns:
        A dict mapping each stripped key to its stripped value.
    """
    fields = {}
    for raw_line in text.splitlines():
        label, colon, remainder = raw_line.partition(":")
        if not colon:
            continue
        fields[label.strip()] = remainder.strip()
    return fields
def get_field_mapping_from_llm(form_fields, user_data):
    """Map form field names onto user-data keys by substring matching.

    Despite the name, no language model is involved: a field matches a key
    when, ignoring case and surrounding whitespace, either string contains
    the other. The first matching key (in ``user_data`` order) wins.

    Args:
        form_fields: Iterable of form field names.
        user_data: Dict whose keys are candidate matches for the fields.

    Returns:
        A dict mapping each matched form field to the chosen user-data key
        (both in their original spelling). Unmatched fields are omitted.
    """
    # Normalize the keys once instead of on every inner-loop iteration, and
    # drop blank keys: an empty string is a substring of everything, so a
    # single blank key would otherwise capture every unmatched field.
    candidates = [
        (key, normalized)
        for key, normalized in ((key, key.strip().lower()) for key in user_data)
        if normalized
    ]
    mapping = {}
    for field in form_fields:
        needle = field.strip().lower()
        if not needle:
            # A blank field would match any key for the same reason.
            continue
        for key, normalized in candidates:
            if needle in normalized or normalized in needle:
                mapping[field] = key
                break
    return mapping
def reconstruct_user_data(mapping, user_data):
    """Re-key user data by form field using a field -> user-key mapping.

    Args:
        mapping: Dict of form field name -> key in ``user_data``.
        user_data: Dict holding the user's values.

    Returns:
        A dict of form field name -> user value; a mapped key missing from
        ``user_data`` yields an empty string.
    """
    rebuilt = {}
    for form_field, source_key in mapping.items():
        rebuilt[form_field] = user_data.get(source_key, "")
    return rebuilt
def extract_acroform_fields(reader):
    """Return the names of the form fields reported by a PDF reader.

    Args:
        reader: Object exposing ``get_fields()``, which returns a mapping
            keyed by field name, or a falsy value when there are no fields.

    Returns:
        A list of field names in the mapping's iteration order; empty when
        ``get_fields()`` returns nothing.
    """
    # Query the reader once; the original called get_fields() twice, which
    # made the reader build the field map a second time for no benefit.
    fields = reader.get_fields()
    return list(fields) if fields else []
| from PyPDF2 import PdfWriter | |
def auto_fill_flat_pdf_smart(pdf_bytes, output_path, data):
    """Copy a PDF to *output_path*, storing *data* as document metadata.

    Every page of the source PDF is copied unchanged. Nothing is drawn on
    the pages and no form field values are set; each ``data`` item is only
    recorded as a ``/<key>`` entry in the output's metadata.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        output_path: Filesystem path the new PDF is written to.
        data: Dict of field name -> value to record as metadata.
    """
    reader = PdfReader(io.BytesIO(pdf_bytes))
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    # PyPDF2's add_metadata only accepts string values, so coerce anything
    # else (and turn None into an empty string) rather than let it raise.
    writer.add_metadata(
        {
            f"/{key}": "" if value is None else str(value)
            for key, value in data.items()
        }
    )
    with open(output_path, "wb") as out_file:
        writer.write(out_file)
def recommend_tax_form_type(query):
    """Pick a tax form category from keywords in the user's query.

    The check is case-insensitive and "income" takes precedence over
    "sales".

    Args:
        query: Free-text user request.

    Returns:
        A one-element list: ``["Income Tax"]``, ``["Sales Tax"]`` or, when
        neither keyword is present, ``["General Tax"]``.
    """
    lowered = query.lower()
    keyword_table = (("income", "Income Tax"), ("sales", "Sales Tax"))
    for keyword, form_type in keyword_table:
        if keyword in lowered:
            return [form_type]
    return ["General Tax"]
def tax_agent_response(query):
    """Return a canned reply when the query asks about forms.

    Args:
        query: Free-text user request.

    Returns:
        A fixed sentence if the query contains "form" or "do i need"
        (case-insensitive); otherwise an empty string.
    """
    lowered = query.lower()
    asks_about_form = any(phrase in lowered for phrase in ("form", "do i need"))
    return "You will need a form to register." if asks_about_form else ""
| # Streamlit App Interface | |
def _parse_uploaded_data(uploaded_file, form_fields):
    """Read "key: value" data from an uploaded PDF and map it onto form_fields."""
    user_text = extract_text_from_pdf(uploaded_file.read())
    if not user_text.strip():
        # No text layer was found: rewind the upload and fall back to OCR.
        uploaded_file.seek(0)
        user_text = extract_text_with_ocr(uploaded_file.read())
    raw_user_data = extract_all_fields_from_text(user_text)
    field_mapping = get_field_mapping_from_llm(form_fields, raw_user_data)
    return reconstruct_user_data(field_mapping, raw_user_data)


def _render_download_link(filename, link_text):
    """Render an HTML link that downloads the local file *filename* inline."""
    with open(filename, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    href = f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">{link_text}</a>'
    st.markdown(href, unsafe_allow_html=True)


st.set_page_config(page_title="PDF Form Assistant", layout="centered")
st.title("📄 Smart PDF Form Assistant")
query = st.text_input("How can I assist you today?", placeholder="E.g., I want to register for FBR tax")
if query:
    # Match keywords case-insensitively. The original compared against the
    # raw query, so "FBR" or "Property" typed with capitals was not detected
    # even though every other check below already lower-cases the query.
    query_lower = query.lower()
    if "fbr" in query_lower or "tax" in query_lower:
        service_key = "fbr_tax"
    elif "property" in query_lower:
        service_key = "property"
    else:
        service_key = None

    if service_key:
        # NOTE(review): the agent response is computed but never displayed.
        agent_response = tax_agent_response(query)
        session = "Assistant" if "form" in query_lower or "need" in query_lower else "Chatbot"
        if session == "Assistant":
            recommended_types = recommend_tax_form_type(query)
            st.success(f"Recommended Form Types: {', '.join(recommended_types)}")
            with st.spinner("Downloading form..."):
                try:
                    # NOTE(review): always fetches the "actual_form" entry,
                    # regardless of the detected service_key.
                    resp = requests.get(SERVICES["actual_form"]["form_url"], timeout=10)
                    resp.raise_for_status()
                    form_bytes = resp.content
                except Exception as e:
                    st.error(f"Failed to download form: {e}")
                    form_bytes = None
            if form_bytes:
                if st.radio("Do you want help filling the form?", ["Yes", "No"]) == "Yes":
                    reader = PdfReader(io.BytesIO(form_bytes))
                    acro_fields = extract_acroform_fields(reader)
                    if acro_fields:
                        st.subheader("Detected Fields:")
                        st.write(acro_fields)
                        mode = st.radio("Enter data manually or upload filled data PDF?", ["Manual", "Upload"])
                        user_data = {}
                        if mode == "Manual":
                            for field in acro_fields:
                                user_data[field] = st.text_input(f"{field}:")
                        else:
                            uploaded_file = st.file_uploader("Upload your data PDF", type=["pdf"])
                            if uploaded_file:
                                user_data = _parse_uploaded_data(uploaded_file, acro_fields)
                        if st.button("🔽 Generate Filled Form"):
                            auto_fill_flat_pdf_smart(form_bytes, "filled_form.pdf", user_data)
                            _render_download_link("filled_form.pdf", "📥 Download Filled Form")
                    else:
                        st.warning("No AcroForm fields detected. Using flat form filling.")
                        labels = extract_labels_from_text(form_bytes)
                        st.write("Detected Labels:", labels)
                        uploaded_file = st.file_uploader("Upload your data PDF", type=["pdf"])
                        if uploaded_file:
                            final_user_data = _parse_uploaded_data(uploaded_file, labels)
                            if st.button("🔽 Generate Filled Flat Form"):
                                auto_fill_flat_pdf_smart(form_bytes, "flat_filled_form.pdf", final_user_data)
                                _render_download_link("flat_filled_form.pdf", "📥 Download Filled Flat Form")
                else:
                    st.info("You can download the form and fill it manually:")
                    st.markdown(f"[Open Form]({SERVICES[service_key]['form_url']})")
        else:
            st.write("🤖 Chatbot mode active.")
    else:
        st.error("Could not determine the type of service you're looking for.")