Aadityaramrame committed on
Commit
284e9bf
·
verified ·
1 Parent(s): 90e84d2

Create pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +55 -0
pipeline.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytesseract
2
+ from pdf2image import convert_from_path
3
+ import google.generativeai as genai
4
+ import os, json
5
+
6
+ # --- Form Schema ---
7
# Field names to extract, keyed by form type. The list for the selected form
# type is interpolated verbatim into the prompt sent to the model.
FORMS = {
    "pancard_form": [
        "Name", "DOB", "Gender", "FatherName", "MotherName",
        "Address", "City", "State", "Pincode", "Mobile", "Email",
        "DocumentType", "DocumentNumber", "IssueAuthority",
        "IssueDate", "ExpiryDate",
    ],
}
27
+
28
+ # --- Configure Gemini API ---
29
# The key is read from the environment at import time; if GEMINI_API_KEY is
# unset, None is passed through to the SDK unchanged.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
30
+
31
+ # --- Step 1: Extract text from PDF ---
32
def extract_text_from_pdf(pdf_path) -> str:
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is turned into a sequence of page images by
    ``pdf2image.convert_from_path`` and each image is passed to
    ``pytesseract.image_to_string``.

    Args:
        pdf_path: Location of the PDF; forwarded unchanged to
            ``convert_from_path``.

    Returns:
        The OCR text of all pages joined by newlines, with leading and
        trailing whitespace removed. An empty string if no pages are
        produced.
    """
    pages = convert_from_path(pdf_path)
    # Collect the per-page text and join once, rather than growing a string
    # with ``+=`` on every page. Stripping afterwards makes the result
    # identical to appending "\n" after each page and then stripping.
    page_texts = [pytesseract.image_to_string(page) for page in pages]
    return "\n".join(page_texts).strip()
38
+
39
+ # --- Step 2: Extract key-values using Gemini ---
40
def _loads_json_lenient(text):
    """Parse *text* as JSON, tolerating wrapping around a JSON object.

    The whole text is tried first, so anything that is already valid JSON is
    returned exactly as ``json.loads`` produces it. If that fails, the span
    from the first ``{`` to the last ``}`` is parsed instead, which recovers
    an object wrapped in a Markdown code fence (```json ... ```) or
    surrounded by explanatory prose.

    Raises:
        ValueError: If neither attempt yields valid JSON.
        TypeError: If *text* is not something ``json.loads`` accepts.
    """
    try:
        return json.loads(text)
    except ValueError:
        pass  # Fall through to the object-span recovery below.

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model output")
    return json.loads(text[start:end + 1])


def extract_key_values_with_gemini(raw_text, form_type="pancard_form"):
    """Ask Gemini to pull the fields of a form out of OCR text.

    A prompt listing ``FORMS[form_type]`` and containing *raw_text* is sent
    to the ``gemini-1.5-flash`` model, and the reply is parsed as JSON.

    Args:
        raw_text: Document text to extract values from.
        form_type: Key into ``FORMS`` selecting which field list to request.
            Note that the prompt wording always says "PAN form" regardless
            of this value.

    Returns:
        The parsed JSON value from the model's reply. If the reply cannot be
        parsed as JSON, ``{"raw_output": <reply text>}`` is returned instead.

    Raises:
        KeyError: If *form_type* is not a key of ``FORMS``.
    """
    prompt = f"""
    You are an intelligent document parser.
    Given the following PAN form text, extract only these fields: {FORMS[form_type]}.
    Return the result strictly as JSON key-value pairs.

    Document text:
    {raw_text}
    """
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)

    # Read the reply text once. Previously it was read again inside the
    # except handler, so a failing accessor simply raised a second time from
    # there; any such error still propagates to the caller.
    # NOTE(review): the SDK documents ``response.text`` as raising ValueError
    # for blocked/empty responses - confirm whether callers want a fallback.
    reply_text = response.text

    try:
        return _loads_json_lenient(reply_text)
    except (TypeError, ValueError, RecursionError):
        # Best-effort fallback: hand back the unparsed reply.
        return {"raw_output": reply_text}