|
|
import pytesseract |
|
|
from pdf2image import convert_from_path |
|
|
import google.generativeai as genai |
|
|
import os, json |
|
|
|
|
|
|
|
|
# Maps each supported form type to the ordered list of field names that
# should be extracted from a document of that type.
FORMS = {
    "pancard_form": [
        # Personal details
        "Name",
        "DOB",
        "Gender",
        "FatherName",
        "MotherName",
        # Contact details
        "Address",
        "City",
        "State",
        "Pincode",
        "Mobile",
        "Email",
        # Document details
        "DocumentType",
        "DocumentNumber",
        "IssueAuthority",
        "IssueDate",
        "ExpiryDate",
    ],
}
|
|
|
|
|
|
|
|
# Read the Gemini credential from the environment and stop at import time
# when it is unset or empty, so no request is attempted without a key.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
    raise ValueError(
        "❌ GEMINI_API_KEY not found. Please set it in Hugging Face Space Secrets."
    )

# Register the key with the google.generativeai client for later calls.
genai.configure(api_key=api_key)
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the OCR text of every page of the PDF at *pdf_path*.

    The PDF is passed to ``convert_from_path`` and each resulting page is
    fed to ``pytesseract.image_to_string``. The per-page strings are joined
    with newlines and surrounding whitespace is stripped from the result.

    Args:
        pdf_path: Location of the PDF, forwarded unchanged to
            ``convert_from_path``.

    Returns:
        The combined text of all pages; an empty string when no text is
        produced.
    """
    page_images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(image) for image in page_images]
    return "\n".join(page_texts).strip()
|
|
def _parse_json_object(text):
    """Return the JSON object found in *text*, or ``None`` if there is none.

    The whole string is tried first; failing that, the span from the first
    ``{`` to the last ``}`` is tried so that a reply wrapped in explanatory
    prose still parses. Only a JSON object (a ``dict``) counts as a match.
    """
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested output. Either way, try the next form.
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def extract_key_values_with_gemini(raw_text, form_type="pancard_form"):
    """Ask Gemini to extract the fields of *form_type* from *raw_text*.

    A prompt listing ``FORMS[form_type]`` and the document text is sent to
    the ``models/gemini-2.5-flash`` model, and the reply is parsed as JSON.

    Args:
        raw_text: Document text to parse (for example OCR output).
        form_type: Key into ``FORMS`` selecting which fields to request.

    Returns:
        The parsed JSON object as a ``dict``. When the reply contains no
        parseable JSON object, ``{"raw_output": <reply text>}`` is returned
        instead, with code fences removed from the reply text.

    Raises:
        KeyError: If *form_type* is not a key of ``FORMS``.
    """
    prompt = f"""
    You are an intelligent document parser.
    Given the following document text, extract only these fields: {FORMS[form_type]}.
    Return strictly as JSON key-value pairs.
    Document text:
    {raw_text}
    """

    model = genai.GenerativeModel("models/gemini-2.5-flash")
    response = model.generate_content(prompt)
    # Report success only after the request has actually returned; printing
    # beforehand would claim success even when the call raises.
    print("Gemini API called successfully ✅")

    # NOTE(review): ``response.text`` may raise when the reply has no text
    # part (e.g. a blocked response) -- confirm against the SDK and decide
    # whether that should map to the ``raw_output`` fallback.
    text = response.text.strip()

    # Drop Markdown code fences the model tends to wrap around JSON.
    text = text.replace("```json", "").replace("```", "").strip()

    parsed = _parse_json_object(text)
    if parsed is None:
        return {"raw_output": text}
    return parsed