Aadityaramrame committed on
Commit
7ad7c0a
·
verified ·
1 Parent(s): 284e9bf

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +8 -23
pipeline.py CHANGED
@@ -6,29 +6,16 @@ import os, json
6
  # --- Form Schema ---
7
  FORMS = {
8
  "pancard_form": [
9
- "Name",
10
- "DOB",
11
- "Gender",
12
- "FatherName",
13
- "MotherName",
14
- "Address",
15
- "City",
16
- "State",
17
- "Pincode",
18
- "Mobile",
19
- "Email",
20
- "DocumentType",
21
- "DocumentNumber",
22
- "IssueAuthority",
23
- "IssueDate",
24
- "ExpiryDate"
25
  ]
26
  }
27
 
28
- # --- Configure Gemini API ---
29
- genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
30
 
31
- # --- Step 1: Extract text from PDF ---
32
  def extract_text_from_pdf(pdf_path):
33
  pages = convert_from_path(pdf_path)
34
  text = ""
@@ -36,19 +23,17 @@ def extract_text_from_pdf(pdf_path):
36
  text += pytesseract.image_to_string(page) + "\n"
37
  return text.strip()
38
 
39
- # --- Step 2: Extract key-values using Gemini ---
40
  def extract_key_values_with_gemini(raw_text, form_type="pancard_form"):
41
  prompt = f"""
42
  You are an intelligent document parser.
43
- Given the following PAN form text, extract only these fields: {FORMS[form_type]}.
44
- Return the result strictly as JSON key-value pairs.
45
 
46
  Document text:
47
  {raw_text}
48
  """
49
  model = genai.GenerativeModel("gemini-1.5-flash")
50
  response = model.generate_content(prompt)
51
-
52
  try:
53
  return json.loads(response.text)
54
  except Exception:
 
6
  # --- Form Schema ---
7
  FORMS = {
8
  "pancard_form": [
9
+ "Name", "DOB", "Gender", "FatherName", "MotherName",
10
+ "Address", "City", "State", "Pincode", "Mobile", "Email",
11
+ "DocumentType", "DocumentNumber", "IssueAuthority",
12
+ "IssueDate", "ExpiryDate"
 
 
 
 
 
 
 
 
 
 
 
 
13
  ]
14
  }
15
 
16
+ # --- Configure Gemini ---
17
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))  # os.getenv takes the variable NAME; the key itself must live in the environment, never in source
18
 
 
19
  def extract_text_from_pdf(pdf_path):
20
  pages = convert_from_path(pdf_path)
21
  text = ""
 
23
  text += pytesseract.image_to_string(page) + "\n"
24
  return text.strip()
25
 
 
26
  def extract_key_values_with_gemini(raw_text, form_type="pancard_form"):
27
  prompt = f"""
28
  You are an intelligent document parser.
29
+ Given the following document text, extract only these fields: {FORMS[form_type]}.
30
+ Return strictly as JSON key-value pairs.
31
 
32
  Document text:
33
  {raw_text}
34
  """
35
  model = genai.GenerativeModel("gemini-1.5-flash")
36
  response = model.generate_content(prompt)
 
37
  try:
38
  return json.loads(response.text)
39
  except Exception: