Spaces:

NlpA
/

AutoForm

Sleeping

Aadityaramrame committed on Nov 6, 2025

Commit

284e9bf

verified ·

1 Parent(s): 90e84d2

Create pipeline.py

Files changed (1) hide show

pipeline.py ADDED Viewed

+import pytesseract
+from pdf2image import convert_from_path
+import google.generativeai as genai
+import os, json
+# --- Form Schema ---
+FORMS = {
+    "pancard_form": [
+        "Name",
+        "DOB",
+        "Gender",
+        "FatherName",
+        "MotherName",
+        "Address",
+        "City",
+        "State",
+        "Pincode",
+        "Mobile",
+        "Email",
+        "DocumentType",
+        "DocumentNumber",
+        "IssueAuthority",
+        "IssueDate",
+        "ExpiryDate"
+    ]
+}
+# --- Configure Gemini API ---
+genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+# --- Step 1: Extract text from PDF ---
+def extract_text_from_pdf(pdf_path):
+    pages = convert_from_path(pdf_path)
+    text = ""
+    for page in pages:
+        text += pytesseract.image_to_string(page) + "\n"
+    return text.strip()
+# --- Step 2: Extract key-values using Gemini ---
+def extract_key_values_with_gemini(raw_text, form_type="pancard_form"):
+    prompt = f"""
+You are an intelligent document parser.
+Given the following PAN form text, extract only these fields: {FORMS[form_type]}.
+Return the result strictly as JSON key-value pairs.
+Document text:
+{raw_text}
+"""
+    model = genai.GenerativeModel("gemini-1.5-flash")
+    response = model.generate_content(prompt)
+    try:
+        return json.loads(response.text)
+    except Exception:
+        return {"raw_output": response.text}