Spaces:

NlpA
/

AutoForm

Sleeping

Aadityaramrame committed on Nov 6, 2025

Commit

7ad7c0a

verified ·

1 Parent(s): 284e9bf

Update pipeline.py

Files changed (1) hide show

pipeline.py CHANGED Viewed

@@ -6,29 +6,16 @@ import os, json
 # --- Form Schema ---
 FORMS = {
     "pancard_form": [
-        "Name",
-        "DOB",
-        "Gender",
-        "FatherName",
-        "MotherName",
-        "Address",
-        "City",
-        "State",
-        "Pincode",
-        "Mobile",
-        "Email",
-        "DocumentType",
-        "DocumentNumber",
-        "IssueAuthority",
-        "IssueDate",
-        "ExpiryDate"
     ]
 }
-# --- Configure Gemini API ---
-genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
-# --- Step 1: Extract text from PDF ---
 def extract_text_from_pdf(pdf_path):
     pages = convert_from_path(pdf_path)
     text = ""
@@ -36,19 +23,17 @@ def extract_text_from_pdf(pdf_path):
         text += pytesseract.image_to_string(page) + "\n"
     return text.strip()
-# --- Step 2: Extract key-values using Gemini ---
 def extract_key_values_with_gemini(raw_text, form_type="pancard_form"):
     prompt = f"""
 You are an intelligent document parser.
-Given the following PAN form text, extract only these fields: {FORMS[form_type]}.
-Return the result strictly as JSON key-value pairs.
 Document text:
 {raw_text}
 """
     model = genai.GenerativeModel("gemini-1.5-flash")
     response = model.generate_content(prompt)
     try:
         return json.loads(response.text)
     except Exception:

 # --- Form Schema ---
 FORMS = {
     "pancard_form": [
+        "Name", "DOB", "Gender", "FatherName", "MotherName",
+        "Address", "City", "State", "Pincode", "Mobile", "Email",
+        "DocumentType", "DocumentNumber", "IssueAuthority",
+        "IssueDate", "ExpiryDate"
     ]
 }
+# --- Configure Gemini ---
+genai.configure(api_key=os.getenv("GEMINI_API_KEY"))  # os.getenv takes the variable NAME; keep the key itself in the environment/Space secrets, never in source
 def extract_text_from_pdf(pdf_path):
     pages = convert_from_path(pdf_path)
     text = ""
         text += pytesseract.image_to_string(page) + "\n"
     return text.strip()
 def extract_key_values_with_gemini(raw_text, form_type="pancard_form"):
     prompt = f"""
 You are an intelligent document parser.
+Given the following document text, extract only these fields: {FORMS[form_type]}.
+Return strictly as JSON key-value pairs.
 Document text:
 {raw_text}
 """
     model = genai.GenerativeModel("gemini-1.5-flash")
     response = model.generate_content(prompt)
     try:
         return json.loads(response.text)
     except Exception: